added requirements and some things

Anne Lorenz 2018-09-17 14:47:50 +02:00
parent c2066d6adb
commit ab578ae0c6
13 changed files with 726 additions and 237 deletions


@ -6,11 +6,11 @@ BagOfWords counts word stems in an article
and adds new words to the global vocabulary.
Note:
The multinomial Naive Bayes classifier is suitable
for classification with discrete features (e.g.,
word counts for text classification).
The multinomial distribution normally requires
integer feature counts. However, in practice,
fractional counts such as tf-idf may also work.
=> taken into account via the 'relative_word_frequencies' parameter
'''
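As a side note to the remark above (illustration only, not part of this module): scikit-learn's MultinomialNB accepts integer counts and, in practice, fractional counts as well, so both output modes of this class could be fed to it. The toy data below is made up.

# illustration only: MultinomialNB with absolute vs. relative (fractional) counts
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# toy document-term counts: 3 articles, 3 vocabulary words
counts = np.array([[3, 0, 1],
                   [0, 2, 2],
                   [4, 1, 0]])
labels = np.array([1, 0, 1])

# absolute word frequencies (integers)
print(MultinomialNB().fit(counts, labels).predict(counts))
# relative word frequencies (each row sums to 1), analogous to relative_word_frequencies=True
rel = counts / counts.sum(axis=1, keepdims=True)
print(MultinomialNB().fit(rel, labels).predict(rel))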
@ -32,14 +32,14 @@ class BagOfWords:
def extract_words(text):
'''takes article as argument, removes numbers,
returns list of single words, recurrences included.
'''
stop_words = BagOfWords.set_stop_words()
# replace punctuation marks with spaces
words = re.sub(r'\W', ' ', text)
# split str into list of single words
words = words.split()
# list of all words to return
words_cleaned = []
for word in words:
# remove numbers
if word.isalpha():
@ -50,18 +50,18 @@ class BagOfWords:
# add every word in lowercase
words_cleaned.append(word.lower())
return words_cleaned
def reduce_word_to_stem(word):
'''takes normal word as input, returns the word's stem
'''
stemmer = PorterStemmer()
# replace word by its stem
word = stemmer.stem(word)
return word
def make_matrix(series, vocab, relative_word_frequencies=True):
'''calculates word stem frequencies in input articles.
returns matrix (DataFrame) with relative word frequencies
(0 <= values < 1) if relative_word_frequencies=True or absolute
word frequencies (int) if relative_word_frequencies=False.
(rows: different articles, columns: different words in vocab)
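A minimal usage sketch of make_vocab and make_matrix (the two example articles are made up; a pandas Series of article texts is assumed, as produced elsewhere in the repository by dataset['Title'] + ' ' + dataset['Text']):

# usage sketch with made-up articles
import pandas as pd
from BagOfWords import BagOfWords

articles = pd.Series(['Company A agrees to buy company B',
                      'Company B reports quarterly results'])
vocab = BagOfWords.make_vocab(articles)
# relative word frequencies (default) ...
df_rel = BagOfWords.make_matrix(articles, vocab)
# ... or absolute word frequencies
df_abs = BagOfWords.make_matrix(articles, vocab, relative_word_frequencies=False)
print(df_abs)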
@ -69,14 +69,14 @@ class BagOfWords:
print('# BOW: calculating matrix')
print('#')
# create list of tuples
vectors = []
for i in range(len(series)):
# extract text of single article
text = series.iloc[i]
# extract its words
words = BagOfWords.extract_words(text)
# count words in single article
word_count = len(words)
vector = []
for i, v in enumerate(vocab):
vector.append(0)
@ -88,14 +88,14 @@ class BagOfWords:
else:
# absolute word frequency
vector[i] += 1
# add single vector as tuple
vectors.append(tuple(vector))
df_vectors = pd.DataFrame.from_records(vectors,
index=None,
columns=vocab)
return df_vectors
def make_vocab(series):
'''adds words of input articles to a global vocabulary.
input: dataframe of all articles, return value: list of words
@ -110,56 +110,56 @@ class BagOfWords:
# sort list
vocab.sort()
return vocab
def set_stop_words():
'''creates list of all words that will be ignored
'''
# stopwords
stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
'aren\'t', 'as', 'at', 'be', 'because', 'been',
'before', 'being', 'below', 'between', 'both', 'but',
'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
'don', 'don\'t', 'down', 'during', 'each', 'few',
'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
'on', 'once', 'only', 'or', 'other', 'our', 'ours',
'ourselves', 'out', 'over', 'own', 're', 's', 'same',
'shan', 'shan\'t', 'she', 'she\'s', 'should',
'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
'theirs', 'them', 'themselves', 'then', 'there',
'these', 'they', 'this', 'those', 'through', 'to',
'too', 'under', 'until', 'up', 've', 'very', 'was',
'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
'what', 'when', 'where', 'which', 'while', 'who',
'whom', 'why', 'will', 'with', 'won', 'won\'t',
'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
'yourselves']
##=> does this make sense?:
#add specific words
#stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
# 'wednesday', 'thursday', 'friday'])
#remove the word 'not' from stop words
#stop_words.remove('not')
for i in range(len(stop_words)):
# remove punctuation marks and strip endings from abbreviations
#stop_words[i] = re.split(r'\W', stop_words[i])[0]
# reduce word to stem
stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
# transform list to set to eliminate duplicates
stop_words = set(stop_words)
return stop_words


@ -3,16 +3,16 @@ Cosine Similarity
=================
CosineSimilarity measures the similarity between two articles.
It calculates c: the cosine of the angle between the articles'
vectors dict_1 and dict_2.
c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
c = 1, if articles are equal => identicalness is 100%
0 <= c < 1, else => identicalness is (c*100)%
(The greater c is, the more similar the two articles are.)
'''
#TODO: uses dictionaries of each article
# => ToDo: has to be changed as we are now using vectors
import math
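In line with the ToDo above (switching from dictionaries to vectors), the formula from the docstring can be computed directly on two equally long word-frequency vectors; a minimal sketch, assuming numpy and made-up vectors:

# sketch only: cosine similarity of two equally long word-frequency vectors
import numpy as np

def cos_sim_vectors(vec_1, vec_2):
    v1 = np.asarray(vec_1, dtype=float)
    v2 = np.asarray(vec_2, dtype=float)
    # c = (v1 * v2) / (|v1| * |v2|)
    return v1.dot(v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

print(cos_sim_vectors([1, 0, 2], [1, 1, 2]))  # ~0.91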
@ -23,47 +23,47 @@ class CosineSimilarity:
def cos_sim(dict_1, dict_2):
# list of all different words
vocab = []
# insert words of 1st article into vocab
for key in dict_1.keys():
if key not in vocab:
vocab.append(key)
# insert words of 2nd article into vocab
for key in dict_2.keys():
if key not in vocab:
vocab.append(key)
# delete first entry ('sum_words')
vocab.pop(0)
# create vectors
vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
vector_2 = CosineSimilarity.create_vector(dict_2, vocab)
# start calculation
# calculate numerator of formula
sum_1 = 0
for i in range (0,len(vector_1)):
sum_1 += vector_1[i] * vector_2[i]
# calculate denominator of formula
sum_2 = 0
for entry in vector_1:
sum_2 += entry ** 2
sum_3 = 0
for entry in vector_2:
sum_3 += entry ** 2
return sum_1 / (math.sqrt(sum_2) * math.sqrt(sum_3))
def create_vector(dict, vocab):
# word frequency vector
vector = []
for word in vocab:
# check if word occurs in article
if word in dict:


@ -12,17 +12,17 @@ import pandas as pd
class CsvHandler:
def read_csv(csv_file):
df = pd.read_csv(csv_file,
sep='|',
header=0,
engine='python',
usecols=[1,2,4], #use only 'Title', 'Text' and 'Label'
decimal='.',
quotechar='\'',
#nrows = 200,
quoting=csv.QUOTE_NONE)
return df
def write_csv(df, file_name):
df.to_csv(file_name, sep='|')
print('### saved {} articles in {}'.format(len(df), file_name))


@ -2,14 +2,15 @@
Decision Tree Classifier
========================
Decision Tree Classifier takes as input two arrays:
array X of size [n_samples, n_features], holding the training samples,
and array y of integer values, size [n_samples],
holding the class labels for the training samples.
'''
import operator
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
import graphviz
import numpy as np
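A minimal, self-contained sketch of the input format described in the docstring (toy numbers, not the project's data):

# toy example: X has shape [n_samples, n_features], y holds integer class labels
import numpy as np
from sklearn import tree

X = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
y = np.array([1, 1, 0, 0])
clf = tree.DecisionTreeClassifier()
clf.fit(X, y)
print(clf.predict([[1, 1]]))  # -> [1]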
@ -21,71 +22,80 @@ from sklearn.model_selection import StratifiedKFold
class DecisionTree:
print('# starting program')
print('#')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('#')
dataset = CsvHandler.read_csv(file)
def make_tree(dataset):
print('# starting decision tree')
print('#')
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
#count_vector = CountVectorizer()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
# lists for metrics predicted on test/train set
f1_scores = []
f1_scores_train = []
classifier = tree.DecisionTreeClassifier()
# dict of most important words of each fold
important_words = {}
# for each fold
for train, test in skf.split(X,y):
# BOW
vocab = BagOfWords.make_vocab(X[train])
# fit the training data and then return the matrix
training_data = BagOfWords.make_matrix(X[train], vocab)
# transform testing data and return the matrix
testing_data = BagOfWords.make_matrix(X[test], vocab)
# #fit the training data and then return the matrix
# training_data = count_vector.fit_transform(X[train], y[train]).toarray()
# #transform testing data and return the matrix
# testing_data = count_vector.transform(X[test]).toarray()
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
# fit classifier
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
#store metrics predicted on test/train set
f1_scores.append(f1_score(y[test], predictions_test))
f1_scores_train.append(f1_score(y[train], predictions_train))
# search for important features
feature_importances = np.array(classifier.feature_importances_)
important_indices = feature_importances.argsort()[-50:][::-1]
for i in important_indices:
if vocab[i] in important_words:
important_words[vocab[i]] += feature_importances[i]
else:
important_words[vocab[i]] = feature_importances[i]
print('20 most important words in training set:')
print()
sorted_i_w = sorted(important_words.items(), key=operator.itemgetter(1))
@ -93,17 +103,19 @@ class DecisionTree:
i_w = [x[0] for x in sorted_i_w]
print(i_w[:20])
print()
#print metrics of test set
print('prediction of testing set:')
print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
format(min(f1_scores), max(f1_scores),sum(f1_scores)/float(len(f1_scores))))
print()
# print('overfit testing: prediction of training set')
# print('F1 score: min = {}, max = {}, average = {}'.
# format(min(f1_scores_train), max(f1_scores_train),
# sum(f1_scores_train)/float(len(f1_scores_train))))
# print()
print('# ending decision tree')
print('#')
DecisionTree.make_tree(dataset)
print('# ending program')

NER.py

@ -9,32 +9,32 @@ from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
from nltk.tree import Tree
''' TODO: misclassified:
[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
('PERSON', 'Maybank Kim Eng Securities '), ('PERSON', 'Krung Thai Bank '),
('PERSON', 'Siam Commercial Bank '), ('PERSON', 'Singapore '),
('PERSON', 'Keppel Corp '), ('ORGANIZATION', 'Companies ')]
'''
class NER:
def get_ne_with_label(text):
labels = []
names = []
# TODO: the last word is not recognized
for chunk in ne_chunk(pos_tag(word_tokenize(text + 'lastword.'))):
if hasattr(chunk, 'label'):
name = ''
for c in chunk:
name += c[0] + ' '
if name not in names:
names.append(name.strip())
labels.append(chunk.label())
#print(chunk.label(), ' '.join(c[0] for c in chunk))
return list(zip(labels, names))
test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
@ -56,5 +56,5 @@ test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''
print(NER.get_ne_with_label(test_article))


@ -2,17 +2,18 @@
Naive Bayes Classifier
======================
Naive Bayes is a probabilistic classifier that is able to predict a
probability distribution over a set of classes, rather than only
outputting the most likely class that the observation should belong to.
'Naive' means that it assumes that the value of a particular feature
(word in an article) is independent of the value of any other feature,
given the label. It considers each of these features to contribute
independently to the probability that the observation belongs to its category,
regardless of any possible correlations between these features.
'''
from BagOfWords import BagOfWords
from CsvReader import CsvReader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
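As a small illustration of the probability distribution over classes mentioned in the docstring (toy data only; GaussianNB as used in this module):

# toy example: predict_proba returns one probability per class, predict the most likely class
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[1.0, 0.0], [0.9, 0.1], [0.1, 0.9], [0.0, 1.0]])
y = np.array([0, 0, 1, 1])
clf = GaussianNB().fit(X, y)
print(clf.predict_proba([[0.8, 0.2]]))  # probabilities close to [1, 0]
print(clf.predict([[0.8, 0.2]]))        # -> [0]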
@ -22,98 +23,108 @@ from sklearn.naive_bayes import GaussianNB
class NaiveBayes:
print('# starting program')
print('#')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('#')
dataset = CsvHandler.read_csv(file)
def make_naive_bayes(dataset):
'''fits naive bayes model with StratifiedKFold,
uses my BOW
'''
print('# starting naive bayes')
print('#')
# split data into text and label set
# join title and text
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
cv = CountVectorizer()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
classifier = GaussianNB()
# lists for metrics
recall_scores = []
precision_scores = []
f1_scores = []
# for each fold
n = 0
for train, test in skf.split(X,y):
n += 1
print('# split no. ' + str(n))
# own BOW => worse results
vocab = BagOfWords.make_vocab(X[train])
# fit the training data and then return the matrix
training_data = BagOfWords.make_matrix(X[train], vocab)
# transform testing data and return the matrix
testing_data = BagOfWords.make_matrix(X[test], vocab)
# # # using CountVectorizer:
# # fit the training data and then return the matrix
# training_data = cv.fit_transform(X[train], y[train]).toarray()
# # transform testing data and return the matrix
# testing_data = cv.transform(X[test]).toarray()
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
# #fit classifier
# classifier.fit(training_data_r, y[train])
# #predict class
# predictions_train = classifier.predict(training_data_r)
# predictions_test = classifier.predict(testing_data_r)
#fit classifier
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
#print and store metrics
rec = recall_score(y[test], predictions_test)
print('rec: ' + str(rec))
recall_scores.append(rec)
# precision on the test predictions, consistent with recall and the F1 below
prec = precision_score(y[test], predictions_test)
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
##########################
#print metrics of test set
print('-------------------------')
print('prediction of testing set:')
print('Precision score: min = {}, max = {}, average = {}'
.format(min(precision_scores),
max(precision_scores),
sum(precision_scores)/float(len(precision_scores))))
print('Recall score: min = {}, max = {}, average = {}'
.format(min(recall_scores),
max(recall_scores),
sum(recall_scores)/float(len(recall_scores))))
print('F1 score: min = {}, max = {}, average = {}'
.format(min(f1_scores),
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
##### only needed for overfit testing ###########
#print('overfit testing: prediction of training set')
#print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
@ -124,28 +135,28 @@ class NaiveBayes:
print('# ending naive bayes')
print('#')
######## only needed for the resubstitution error ########
def analyze_errors(dataset):
'''calculates resubstitution error
shows indices of false classified articles
uses Gaussian Bayes with train test split
'''
X_train_test = dataset['Title'] + ' ' + dataset['Text']
y_train_test = dataset['Label']
count_vector = CountVectorizer()
# fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train_test).toarray()
# transform testing data and return the matrix
testing_data = count_vector.transform(X_train_test).toarray()
# Naive Bayes
classifier = GaussianNB()
# fit classifier
classifier.fit(training_data, y_train_test)
# Predict class
predictions = classifier.predict(testing_data)
print('Errors at index:')
print()
n = 0
@ -157,6 +168,9 @@ class NaiveBayes:
.format(i, predictions[i], y_train_test[i]))
print(X_train_test[i])
print(y_train_test[i])
print()
#print metrics
print('F1 score: ', format(f1_score(y_train_test, predictions)))
print('#')
print('# ending program')


@ -1,3 +1,13 @@
# thesis-anne
my Python classes for text mining, machine learning models, …
# Requirements
pandas==0.20.1
nltk==3.2.5
webhoseio==0.5
numpy==1.14.0
graphviz==0.9
scikit_learn==0.19.2
# Installation under (UBUNTU?)
apt-get install XX
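A possible way to install the pinned Python packages (assuming pip for Python 3 is available; the exact apt-get packages above still have to be filled in):
pip install pandas==0.20.1 nltk==3.2.5 webhoseio==0.5 numpy==1.14.0 graphviz==0.9 scikit_learn==0.19.2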


@ -12,12 +12,12 @@ import re
from datetime import datetime
import pandas as pd
import webhoseio
from CsvHandler import CsvHandler
class Requester:
def save_articles_from_webhoseio():
''' create DataFrame of articles with
Timestamp, Title, Text, SiteSection
@ -25,14 +25,14 @@ class Requester:
'''
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
filestring = 'download_articles_{}.csv'.format(datestring)
# print message
print('# retrieving articles from webhose.io')
# personal API key
webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
# webhose.io query
# suboptimal: usage of search terms :-(
query_params = {
"q": "thread.title:(merger OR merges OR merge OR merged OR "
@ -47,25 +47,25 @@ class Requester:
"has_video:false",
"ts": "1527411742661",
"sort": "crawled"}
output = webhoseio.query("filterWebContent", query_params)
sum_posts = output['totalResults']
print('# total sum of posts: ' + str(sum_posts))
# 100 articles per batch (download)
num_downloads = int(sum_posts / 100)
print('# collecting first {} articles'.format(num_downloads * 100))
print('# sorting out other sources than reuters')
# two-dimensional list of all articles
list_articles = []
for n in range(num_downloads):
# save next 100 articles
for i in range(100):
# check if correct source 'reuters'
if not re.search(r'reuters',
output['posts'][i]['thread']['site_section']):
continue
else:
@ -73,21 +73,21 @@ class Requester:
article.append(output['posts'][i]['published'])
article.append(output['posts'][i]['title'].replace('|', ' '))
# remove white spaces and separators
text = output['posts'][i]['text'].replace('\n', ' ')\
.replace('\r', ' ').replace('|', ' ')
section = output['posts'][i]['thread']['site_section']
article.append(text)
# remove '\r' at end of some urls
section = section.replace('\r', '')
article.append(section)
# add article to list
list_articles.append(article)
# Get the next batch of 100 posts
output = webhoseio.get_next()
# create DataFrame
df = pd.DataFrame(data=list_articles,
columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
# save csv
CsvHandler.write_csv(df, filestring)

SVM.py

@ -2,14 +2,14 @@
Support Vector Machines (SVM) Classifier
========================================
The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
from BagOfWords import BagOfWords
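A minimal, self-contained sketch of the idea described above (toy 2D points; SVC with a linear kernel, as also tried in the grid search below):

# toy example: two separable point clouds, the SVC learns the separating gap
import numpy as np
from sklearn.svm import SVC

X = np.array([[0.0, 0.0], [0.2, 0.1], [1.0, 1.0], [0.9, 1.1]])
y = np.array([0, 0, 1, 1])  # e.g. 0 = 'not merger', 1 = 'merger'
clf = SVC(kernel='linear').fit(X, y)
print(clf.predict([[0.1, 0.0], [1.0, 0.9]]))  # -> [0 1]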
@ -25,12 +25,12 @@ from sklearn.svm import SVC
class SVM:
def make_svm(dataset):
print('# starting SVM')
print('#')
# split data into text and label set
# articles' text (title + text)
X = dataset['Title'] + ' ' + dataset['Text']
# articles' labels
@ -44,28 +44,28 @@ class SVM:
X = CountVectorizer().fit_transform(X).toarray()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
# use only most important features
selector = SelectPercentile()
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
'SVC__kernel': ['linear','poly'],
'SVC__gamma': [0.001, 0.01],
'SVC__C': [0.1, 1]},
cv=skf,
scoring=make_scorer(f1_score))
print('# fit classifier')
print('#')
grid.fit(X,y)
# DataFrame of results
df_results = grid.cv_results_
# print results
######################
print('RESULTS:')
@ -76,12 +76,12 @@ class SVM:
print('mean of means:')
print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
print('')
print('best score:')
print(grid.best_score_)
print()
print('best parameters set found on development set:')
print(grid.best_params_)
print()
print('# ending SVM')
print('#')


@ -15,9 +15,6 @@ from SVM import SVM
print('# starting program')
print('#')
# only if new unlabeled(!) data set is required:
# Requester.save_articles_from_webhoseio()
file = 'classification_labelled_corrected.csv'
# read csv file
@ -25,7 +22,6 @@ print('# reading dataset')
print('#')
dataset = CsvHandler.read_csv(file)
# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)

thesis/LV.bib (new file)

@ -0,0 +1,7 @@
@BOOK{pierson2016,
AUTHOR="Lillian Pierson",
TITLE="Data Science für Dummies",
PUBLISHER="WILEY-VCH Verlag GmbH \& Co. KGaA",
YEAR=2016,
ADDRESS="Weinheim"
}

Binary file not shown.

thesis/thesis.tex (new file)

@ -0,0 +1,450 @@
\documentclass[11pt,a4paper]{scrbook}
\usepackage{geometry}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[pdftex]{graphicx}
%\usepackage[ngerman]{babel}
\usepackage{colortbl}
\usepackage{xcolor}
\usepackage{soul}
\usepackage{cleveref}
\usepackage{todonotes}
\AtBeginDocument{\renewcommand{\chaptername}{}}
% Comments from Julian
\newcommand{\jk}[1]{\todo[inline]{JK: #1}}
\renewcommand{\familydefault}{\sfdefault}
% Comments from Anne
\definecolor{comments}{cmyk}{1,0,1,0}
\newcommand{\al}[1]{\todo[inline]{\color{comments}{AL: #1}}}
\definecolor{uhhred}{cmyk}{0,100,100,0}
\begin{document}
\frontmatter
\newgeometry{centering,left=2cm,right=2cm,top=2cm,bottom=2cm}
\begin{titlepage}
\includegraphics[scale=0.3]{UHH-Logo_2010_Farbe_CMYK.pdf}
\vspace*{2cm}
\Large
\begin{center}
{\color{uhhred}\textbf{\so{BACHELORTHESIS}}}
\vspace*{2.0cm}\\
{\LARGE \textbf{Interactive Labeling of Unclassified Data\\Using the Example of Recognition of Company Mergers}}
%or: Incremental labeling of an unknown data set using the example of classification of news articles
\vspace*{2.0cm}\\
vorgelegt von
\vspace*{0.4cm}\\
Anne Lorenz
\end{center}
\vspace*{3.5cm}
\noindent
MIN-Fakultät \vspace*{0.4cm} \\
Fachbereich Informatik \vspace*{0.4cm} \\
%Ggf. Professur/Institut \vspace*{0.4cm} \\
Studiengang: Software-System-Entwicklung \vspace*{0.4cm} \\
Matrikelnummer: 6434073 \vspace*{0.8cm} \\
Erstgutachter: Dr. Julian Kunkel \vspace*{0.4cm} \\
Zweitgutachter: Eugen Betke
\vspace*{0.8cm} \\
Betreuer: Dr. Julian Kunkel, Doris Birkefeld
\end{titlepage}
\restoregeometry
\chapter*{Abstract}
BLABLA ABSTRACT
%As objective, concise, understandable, complete and precise as possible :-)
\tableofcontents
\mainmatter
%Chapter Introduction
%####################
\chapter{Introduction}
\label{chap:introduction}
\textit{
In this chapter...In \cref{sec:motivation} the motivation, then in \cref{sec:goals} the goals, blablabla...
}
\section{Motivation}
\label{sec:motivation}
Given a classification problem, a labeled data set is always needed first in order to apply a machine learning model and make predictions possible. The larger the labeled data set, the better the predictions generally are. However, to get there, each single data element must first be classified manually. Depending on the type of data, this procedure can be very time-consuming, for example if longer texts have to be read.
In this thesis we want to present an alternative data labeling method that allows labeling a larger amount of data in a shorter time.
\section{Goals}
\label{sec:goals}
\jk{One sentence that describes the problem, then broken down into subtasks}
We want to compare a conventional method of data labeling with an alternative, incremental method using the following example: The aim is to investigate news articles about recent mergers ('mergers and acquisitions') and to classify them accordingly. With the help of the labeled data set, different classification models will be applied and optimized so that a prediction about future news articles will be possible.
\section{Outline}
about the outline...
\bigskip
\paragraph{Summary:}
\textit{\newline In this chapter we discussed ... The following chapter deals with blabla.}
%Chapter State of the Art
%##########################
\chapter{State of the Art}
\label{state_of_the_art}
\textit{In this chapter the current state of research in the field of... will be presented.
}
\section{State of Research}
\al{What is supposed to go in here?}
\bigskip
\paragraph{Summary:}
\textit{\newline In this chapter we have described ... are described in the next chapter. In the next chapter we describe...
}
%Chapter Background
%####################
\chapter{Background and Related Work}
\label{chap:background}
\textit{
In this chapter...In \cref{sec:news} news sources are introduced, then blablabla...
}
\section{Business News about Mergers}
\label{sec:news}
\subsection{Company Mergers}
When two companies merge, ... When shares of a company are sold, ... Blabla...
\subsection{Webhose.io as Source for News Articles}
As a source for our initial data set, RSS feeds from established business news agencies such as Reuters or Bloomberg come into consideration. However, when crawling RSS feeds, it is not possible to retrieve news from a longer period in the past. Since we want to analyze news of the last 12 months, we obtain the data set from the provider webhose.io. It offers access to English news articles from the sections 'Financial News', 'Finance' and 'Business', among others. As we are only interested in reliable sources, we limit our request to the websites of Reuters, Bloomberg, Financial Times, The Economist and ...
\section{Supervised Machine Learning Problems}
\subsubsection{Structured / Unstructured Data}
\subsection{Classification Problems}
\subsubsection{Binary Classification}
Comparable to spam filtering...
\subsubsection{Multiple Classification}
\subsection{Balanced / Unbalanced Data Set}
\section{Text Analysis}
\subsection{Natural Language Processing (NLP)}
\subsection{Tokenization}
\subsection{Unigram, Bigram}
\subsection{Stemming}
\subsection{Feature Vectors}
\subsubsection{Word Frequencies}
\subsection{Bag of Words (BOW)}
\subsection{Stop Words}
\subsection{Named Entity Recognition (NER)}
\section{Machine Learning Models}
\subsection{Naive Bayes Classifier}
\subsection{Support Vector Machines (SVM)}
\subsection{Decision Trees}
\subsection{Hyperparameters}
\subsection{Feature Selection}
\section{Split Methods}
\subsection{Test-Train-Split}
\subsection{Shuffle Split}
\subsection{(K-fold) Cross-Validation}
\section{Metrics}
\subsection{Accuracy, Error Rate, Sensitivity, Specificity}
Sensitivity (= true positive rate) and Specificity (= true negative rate)
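In terms of the confusion matrix (notation $TP$, $TN$, $FP$, $FN$ added here for clarity), these can be written as
\[
\mathrm{sensitivity} = \frac{TP}{TP + FN}, \qquad
\mathrm{specificity} = \frac{TN}{TN + FP}, \qquad
\mathrm{accuracy} = \frac{TP + TN}{TP + TN + FP + FN}.
\]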
\subsection{Recall, Precision, F1-score}
\subsection{Robustness}
\subsection{Overfit, Underfit}
\subsection{Bias, Variance}
\subsection{Resubstitution Error}
\bigskip
\paragraph{Summary:}
\textit{\newline
In this chapter we ... blabla are described in section bla.
In the next chapter we describe...
}
%Chapter Design
%###########################
\chapter{Design}
\label{chap:design}
\textit{
In this chapter... In \cref{sec:overview} we give an overview of all, then in \cref{sec:pipeline} the data processing pipeline, blablabla...
}
\section{Overview}
\label{sec:overview}
\jk{What has to be done overall, which subproblems have to be addressed}
\jk{Discuss alternatives, make decisions based on criteria}
\jk{There may still be some material in here that will be moved to the 'Background' chapter. This is where your own work goes, no related work or methods that already exist. Only relevant if you compare against them.}
\section{Data Processing Pipeline}
\label{sec:pipeline}
\section{Preprocessing}
Tokenization, Stemming, Stop Words, Leaving Out Numbers
\section{Data Labeling}
\subsection{Conventional Method}
\subsubsection{Top-Down / Waterfall}
1) Data Labeling \\
2) Data Cleaning\\
3) Model Building\\
4) Analysis of wrongly predicted instances
=> possibly relabel, usually not done\\
5) New hypotheses => 3); possibly back to 2)\\
\subsection{Incremental Method}
\subsubsection{Visual Analytics, Agile Model Development}
\subsubsection{Unbalanced Data Set}
\section{Model Selection}
\subsection{Naive Bayes}
GaussianNB vs MultinomialNB
\subsection{SVM}
\subsection{Decision Tree}
\section{Recognition of Merger Partners}
\subsubsection{Named Entity Recognition (NER)}
\bigskip
\paragraph{Summary:}
\textit{\newline
In this chapter we... In the next chapter...
}
% Chapter Labeling
%###########################
\chapter{Data Labeling}
\label{chap:labeling}
\textit{
This chapter describes the procedure for labeling. blabla
}
\section{Conventional Method}
\subsection{Data Set}
1497 articles\\
Time period: 1 month\\
Source: Reuters.com\\
\subsection{Classification}
Data labeled with binary classes, time required approx. 30 hours
\subsection{Difficulties}
Here are a few text examples that were difficult to classify:\\
- how should sales of shares > 50 \% be handled? => effectively means a change of ownership\\
- "X will buy Y", "X wants to buy Y" => does it definitely take place? => the whole article has to be read\\
- merger only mentioned in passing ("last year X and Y merged..., now new business units are emerging blabla"), but otherwise an irrelevant article
\\
=> out of these problems the idea arose to use several classes
\section{Incremental Method}
\subsection{Data Set}
10,000 articles out of 130,000\\
Time period: 12 months\\
Sources: Reuters.com, Bloomberg.com, ...\\
\subsection{Classification}
Data classified multiple times with 6 classes:\\
\\
1: Merger \\
2: Merger Pending\\
3: Merger Aborted\\
4: Sale of Shares\\
5: Incidental \\
6: Irrelevant \\
\subsection{Selection of Articles}
\subsection{Procedure}
Randomly select 10 articles from each month.
It is then likely that one only gets mergers with many articles
=> This could be minimized by doing “stratified” sampling
=> First do NER, then randomize fairly over the classes
=> select 10 articles from 100 categories => choose 10 categories => from these, one article at random
Labeling of 1\% of all articles:
1) Build first models, e.g. Bayes
Apply them to all articles => probability per class, vector: (K1, K2, … , K6)
Clear cases: Kx > 80\% and all other Ky < 10\% (with x in {1-6}, y != x; see the rule written out below)
=> adopt the label => how many cases are unambiguous?
Claim: 10\% of all articles are unambiguous
Check by sampling => select 10 articles at random from each class
Identification of highly unclear cases:
more than one class has a similar probability
(5\%, 5\%, 5\%, …) => (80\%, 80\%, 0\%, 0\%, …)
e.g. look at 100 articles and label them manually
=> Repeat this 3-4 times, then go back to step 1) (build model)
=> 95\% of all cases are now clear.
=> why do the remaining 5\% not work? Look at sample articles
If that does not work, improve the models or the preprocessing (e.g. NER)
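Written out (the notation $P(K_i \mid d)$ for the model's estimated probability of class $K_i$ given article $d$ is introduced here for clarity), the acceptance rule for clear cases reads:
\[
\text{accept label } x \text{ for article } d \quad\Longleftrightarrow\quad
P(K_x \mid d) > 0.8 \;\text{ and }\; P(K_y \mid d) < 0.1 \;\;\text{for all } y \neq x.
\]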
\subsection{Tagging of Named Entities}
Histogram: X: authors/persons, companies; Y: number of mentions
\bigskip
\paragraph{Summary:}
\textit{\newline
In this chapter...in the next chapter...
}
% Chapter Implementation
%##########################
\chapter{Implementation}
\label{chap:implementation}
\textit{
This chapter deals with the most relevant parts of the implementation.
}
\section{Data Download}
Query webhose.io:\\
% replace!
query\_params = \{'q':'site:(reuters.com OR ft.com OR cnn.com OR economist.com OR bloomberg.com OR theguardian.com) site\_category:(financial\_news OR finance OR business)',
'ts': '1533634070282',
'sort': 'crawled'\}
\section{Python Modules}
\subsection{nltk}
\subsection{pandas}
\subsection{sklearn}
\subsection{webhoseio}
\section{Own Implementation}
\subsection{Examples}
\bigskip
\paragraph{Summary:}
\textit{\newline
In this chapter, we...In the next chapter...
}
% Chapter Evaluation
%##########################
\chapter{Evaluation}
\label{chap:evaluation}
\textit{
In this chapter we want to evaluate the different methods. blabla.
}
\section{News Articles Exploration}
\subsection{Length of Articles}
Or whatever else is interesting.
\subsection{Most Common Words}
With regard to the articles about mergers.
\subsubsection{Word Cloud}
e.g. a word cloud for the Microsoft-GitHub merger article.
\section{Model Fitting}
keep in mind: vary hyperparameters SEPARATELY
\subsection{Naive Bayes Model}
Grid-Search
\subsection{SVM}
\subsection{Decision Tree}
\section{Performance}
\bigskip
\paragraph{Summary:}
\textit{\newline
In this chapter we have described ... In the last chapter we describe...
}
\chapter{Discussion (?)}
\al{Is this needed? The thesis should be critically questioned, e.g. 'was the data set well chosen?' etc.}
% Chapter Summary
%#############################
\chapter{Summary}
\label{chap:summary}
\section{Comparison of Labeling Methods}
\section{Quality of Predictions}
\section{Conclusions}
\section{Future Work}
Neural network
\bigskip
\paragraph{Summary:}
\textit{\newline
In the last chapter we have described ....
}
% Bibliography should appear in the table of contents
\nocite{*}
\addcontentsline{toc}{chapter}{Bibliography}
% Show bibliography
\bibliography{LV}
\backmatter
\thispagestyle{empty}
\vspace*{\fill}
\pagestyle{empty}
{\normalsize
\begin{center}\textbf{Eidesstattliche Erklärung}\end{center}
Hiermit versichere ich an Eides statt, dass ich die vorliegende Arbeit im Bachelorstudiengang Wirtschaftsinformatik selbstständig verfasst und keine anderen als die angegebenen Hilfsmittel insbesondere keine im Quellenverzeichnis nicht benannten Internet-Quellen benutzt habe. Alle Stellen, die wörtlich oder sinngemäß aus Veröffentlichungen entnommen wurden, sind als solche kenntlich gemacht. Ich versichere weiterhin, dass ich die Arbeit vorher nicht in einem anderen Prüfungsverfahren eingereicht habe und die eingereichte schriftliche Fassung der auf dem elektronischen Speichermedium entspricht.
\vspace*{1cm}\\
Hamburg, den 01.02.2019
\hspace*{\fill}\begin{tabular}{@{}l@{}}\hline
\makebox[5cm]{Anne Lorenz}
\end{tabular}
\vspace*{3cm}
%This is optional, delete if not needed!
\begin{center}\textbf{Veröffentlichung}\end{center}
Ich stimme der Einstellung der Arbeit in die Bibliothek des Fachbereichs Informatik zu.
\vspace*{1cm}\\
Hamburg, den 01.02.2019
\hspace*{\fill}\begin{tabular}{@{}l@{}}\hline
\makebox[5cm]{Anne Lorenz}
\end{tabular}
}
\vspace*{\fill}
\end{document}