added .gitignore file

Anne Lorenz 2018-09-10 10:38:24 +02:00
parent 3f98aff635
commit 1195a161d6
6 changed files with 419 additions and 112 deletions

.gitignore

@@ -0,0 +1,221 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
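
A note on the patterns above: entries like *.py[cod] use ordinary glob
character classes, so one rule covers .pyc, .pyo and .pyd at once. A quick,
illustrative way to test such a pattern (outside git, which adds its own
directory rules on top) is Python's fnmatch module; this snippet is not part
of the commit:

from fnmatch import fnmatch

# '*.py[cod]' matches any name ending in .pyc, .pyo or .pyd
for name in ['module.pyc', 'module.pyo', 'module.pyd', 'module.py']:
    print(name, fnmatch(name, '*.py[cod]'))
# module.py prints False: the character class requires one more letter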

BagOfWords.py

@@ -12,7 +12,7 @@ import pandas as pd
from nltk.stem.porter import PorterStemmer
class BagOfWords():
class BagOfWords:
def extract_words(text):
'''takes article as argument, removes numbers,
@@ -46,7 +46,8 @@ class BagOfWords():
def make_matrix(series, vocab):
'''calculates word stem frequencies in input articles.
returns matrix (DataFrame) with relative word frequencies (0 <= values < 1)
returns matrix (DataFrame) with relative word frequencies
(0 <= values < 1)
(rows: different articles, columns: different words in vocab)
'''
# create list of tuples
@@ -67,7 +68,9 @@ class BagOfWords():
vector[i] += 1/word_count
# add single vector as tuple
vectors.append(tuple(vector))
df_vectors = pd.DataFrame.from_records(vectors, index=None, columns=vocab)
df_vectors = pd.DataFrame.from_records(vectors,
index=None,
columns=vocab)
return df_vectors
def make_vocab(series):
@@ -87,41 +90,49 @@ class BagOfWords():
'''creates list of all words that will be ignored
'''
# standard stopwords from nltk.corpus stopwords('english')
stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'ain',
'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'aren\'t',
'as', 'at', 'be', 'because', 'been', 'before', 'being',
'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
'couldn\'t', 'd', 'did', 'didn', 'didn\'t', 'do', 'does',
'doesn', 'doesn\'t', 'doing', 'don', 'don\'t', 'down',
'during', 'each', 'few', 'for', 'from', 'further', 'had',
'hadn', 'hadn\'t', 'has', 'hasn', 'hasn\'t', 'have', 'haven',
'haven\'t', 'having', 'he', 'her', 'here', 'hers', 'herself',
'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just', 'll',
'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more', 'most',
'mustn', 'mustn\'t', 'my', 'myself', 'needn', 'needn\'t',
'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once',
'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
'over', 'own', 're', 's', 'same', 'shan', 'shan\'t', 'she',
'she\'s', 'should', 'should\'ve', 'shouldn', 'shouldn\'t',
'so', 'some', 'such', 't', 'than', 'that', 'that\'ll', 'the',
'their', 'theirs', 'them', 'themselves', 'then', 'there',
'these', 'they', 'this', 'those', 'through', 'to', 'too',
'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'wasn\'t',
'we', 'were', 'weren', 'weren\'t', 'what', 'when', 'where',
'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
'won\'t', 'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves']
stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
'aren\'t', 'as', 'at', 'be', 'because', 'been',
'before', 'being', 'below', 'between', 'both', 'but',
'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
'don', 'don\'t', 'down', 'during', 'each', 'few',
'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
'on', 'once', 'only', 'or', 'other', 'our', 'ours',
'ourselves', 'out', 'over', 'own', 're', 's', 'same',
'shan', 'shan\'t', 'she', 'she\'s', 'should',
'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
'theirs', 'them', 'themselves', 'then', 'there',
'these', 'they', 'this', 'those', 'through', 'to',
'too', 'under', 'until', 'up', 've', 'very', 'was',
'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
'what', 'when', 'where', 'which', 'while', 'who',
'whom', 'why', 'will', 'with', 'won', 'won\'t',
'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
'yourselves']
# add specific words
stop_words.extend(['reuters', 'also', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday'])
stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
'wednesday', 'thursday', 'friday'])
# => does this make sense?:
# remove the word 'not' from stop words
stop_words.remove('not')
#stop_words.remove('not')
for i in range(len(stop_words)):
# remove punctuation marks and strip endings from abbreviations
#stop_words[i] = re.split(r'\W', stop_words[i])[0]
# reduce word to stem
stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
# transform list to set to eliminate duplicates
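
To make the hunks above easier to follow: make_vocab collects the word stems
of all articles, and make_matrix builds one row per article and one column
per vocabulary word, where each occurrence of a word adds 1/word_count, so
the cells hold relative frequencies. A minimal, self-contained sketch of
that idea (the names and the plain whitespace tokenizer are illustrative,
not the repo's exact code, which also stems words and removes stop words):

import pandas as pd

def toy_bow_matrix(articles):
    '''illustration only: lower-cases and splits on whitespace'''
    tokenized = [article.lower().split() for article in articles]
    vocab = sorted(set(word for words in tokenized for word in words))
    vectors = []
    for words in tokenized:
        # every occurrence of a word contributes 1/word_count
        vectors.append(tuple(words.count(v) / len(words) for v in vocab))
    return pd.DataFrame.from_records(vectors, columns=vocab)

print(toy_bow_matrix(['firm a buys firm b', 'firm b rejects the deal']))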

CsvHandler.py

@@ -9,7 +9,7 @@ import csv
import pandas as pd
class CsvHandler():
class CsvHandler:
def read_csv(csv_file):
df = pd.read_csv(csv_file,
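
The read_csv call is cut off in this hunk, so its actual arguments are not
visible here. Purely as a hypothetical illustration of the kind of call
CsvHandler.read_csv wraps (the column names match those used elsewhere in
the repo; the file name and separator are assumptions):

import pandas as pd

# hypothetical arguments; the real ones are truncated in the diff above
df = pd.read_csv('articles.csv',
                 sep=',',
                 usecols=['Title', 'Text', 'Label'])
print(df.head())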

DecisionTree.py

@@ -20,7 +20,7 @@ from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
class DecisionTree():
class DecisionTree:
def make_tree(dataset):
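
For context on the SelectPercentile import kept here: it scores each feature
with a univariate statistic and keeps only the top given percentile of them.
A small self-contained sketch with synthetic data (not the repo's pipeline):

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile

X, y = make_classification(n_samples=100, n_features=40, random_state=0)
# keep the 25% highest-scoring features (default score: ANOVA F-value)
selector = SelectPercentile(percentile=25)
X_reduced = selector.fit_transform(X, y)
print(X.shape, '->', X_reduced.shape)   # (100, 40) -> (100, 10)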

FilterKeywords.py

@@ -10,20 +10,30 @@ import re
from nltk.stem.porter import PorterStemmer
class FilterKeywords():
class FilterKeywords:
def search_keywords(dict_input):
'''extracts relevant key-value pairs of in article's input dictionary.
'''extracts relevant key-value pairs from an article's input dictionary;
output is the contained keywords and their counts.
'''
# # list of regular expressions that match merger specific keywords
# regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',
# r'business combinations?', r'combined compan(y|ies)',
# r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up',
# r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?',
# r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?',
# r'purchase', r'(sell(s|ers?|ing)?|sold)']
keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers', 'acquisition',
'acquire', 'acquisitions', 'acquires', 'combine', 'combines',
'combination', 'combined', 'joint', 'venture', 'JV', 'takeover',
'take-over', 'tie-up', 'deal', 'deals', 'transaction', 'transactions',
'approve', 'approves', 'approved', 'approving', 'approval',
'approvals', 'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',
'purchase', 'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']
keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
'acquisition', 'acquire', 'acquisitions', 'acquires',
'combine', 'combines', 'combination', 'combined',
'joint', 'venture', 'JV', 'takeover', 'take-over',
'tie-up', 'deal', 'deals', 'transaction',
'transactions', 'approve', 'approves', 'approved',
'approving', 'approval', 'approvals', 'buy', 'buys',
'buying', 'bought', 'buyout', 'buy-out', 'purchase',
'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']
# reduce words to stem
stemmer = PorterStemmer()
@@ -50,7 +60,7 @@ class FilterKeywords():
return dict_keywords
def count_keywords(dict_keywords):
'''input: dict with article's keywords (key) and their count (value).
'''input: dict with article's keywords (key) and their count (value),
returns the total number of keyword occurrences found.
'''
return sum(dict_keywords.values())
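
Summarizing the two methods above: search_keywords stems the keyword list
with PorterStemmer and returns a dict mapping each keyword stem found in an
article to its count; count_keywords then sums those counts. A compact
sketch of that flow (illustrative names, not the repo's exact code):

from collections import Counter
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
keywords = {stemmer.stem(k) for k in ['merger', 'acquires', 'deal', 'buyout']}

words = 'the merger deal was approved after the buyout'.split()
stems = [stemmer.stem(w) for w in words]

# keyword stem -> number of occurrences in the article
dict_keywords = {s: n for s, n in Counter(stems).items() if s in keywords}
print(dict_keywords)               # {'merger': 1, 'deal': 1, 'buyout': 1}
print(sum(dict_keywords.values())) # 3, cf. count_keywords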

NaiveBayes.py

@@ -2,37 +2,35 @@
Naive Bayes Classifier
======================
Naive Bayes is a probabilistic classifier that is able to predict,
given an observation of an input, a probability distribution over a set of classes,
rather than only outputting the most likely class that the observation should belong to.
Naive Bayes is a probabilistic classifier that predicts, given an input
observation, a probability distribution over a set of classes rather than
only the single most likely class the observation could belong to.
'Naive' means that it assumes that the value of a particular feature
(word in an article) is independent of the value of any other feature,
given the class variable (label). It considers each of these features
to contribute independently to the probability that it belongs to its category,
given the label. It considers each of these features to contribute
independently to the probability that it belongs to its category,
regardless of any possible correlations between these features.
'''
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
#from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# toDo: for Julian, leave out SelectPercentile for now
class NaiveBayes():
class NaiveBayes:
def make_naive_bayes(dataset):
'''fits naive bayes model with StratifiedKFold, uses my BOW
'''
'''fits naive bayes model with StratifiedKFold,
uses my BOW
'''
print('# starting naive bayes')
print()
# alternative: use only articles' header => may give better results
# join title and text
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
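
The overall shape of make_naive_bayes after this commit, reduced to a
runnable toy: a stratified k-fold split, a bag-of-words matrix built per
fold, GaussianNB, and per-fold recall and precision folded into F1.
CountVectorizer stands in for the custom BagOfWords purely to keep the
sketch short, the data is invented, and, as the next hunk shows, the real
method no longer applies SelectPercentile:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB

# toy stand-in for dataset['Title'] + ' ' + dataset['Text'] and its labels
X = pd.Series(['merger approved', 'firm announces merger', 'rival seeks merger',
               'profits fall', 'profits rise slightly', 'quarterly profits stable'] * 4)
y = pd.Series([1, 1, 1, 0, 0, 0] * 4)

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)
classifier = GaussianNB()
f1_scores = []
for train, test in skf.split(X, y):
    vectorizer = CountVectorizer()   # stand-in for the BOW matrix
    training_data = vectorizer.fit_transform(X[train]).toarray()
    testing_data = vectorizer.transform(X[test]).toarray()
    classifier.fit(training_data, y[train])
    predictions_test = classifier.predict(testing_data)
    rec = recall_score(y[test], predictions_test)
    prec = precision_score(y[test], predictions_test)
    f1_scores.append(2 * (prec * rec) / (prec + rec))
# expect a high score on this trivially separable toy data
print('average F1: {0:.2f}'.format(sum(f1_scores) / len(f1_scores)))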
@@ -56,18 +54,11 @@ class NaiveBayes():
# transform testing data and return the matrix
testing_data = BagOfWords.make_matrix(X[test], vocab)
# apply select percentile
selector = SelectPercentile(percentile=25)
selector.fit(training_data, y[train])
training_data_r = selector.transform(training_data)
testing_data_r = selector.transform(testing_data)
#fit classifier
classifier.fit(training_data_r, y[train])
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data_r)
predictions_test = classifier.predict(testing_data_r)
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
#store metrics
rec = recall_score(y[test], predictions_test)
@@ -80,73 +71,146 @@ class NaiveBayes():
#print metrics of test set
print('prediction of testing set:')
print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'
.format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
.format(min(f1_scores), max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
#print('overfit testing: prediction of training set')
#print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
#format(min(f1_scores_train), max(f1_scores_train),sum(f1_scores_train)/float(len(f1_scores_train))))
#format(min(f1_scores_train), max(f1_scores_train),
#sum(f1_scores_train)/float(len(f1_scores_train))))
#print()
print('# ending naive bayes')
print()
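
The F1 values printed above are computed from per-fold precision and recall
as 2 * (prec * rec) / (prec + rec), visible in the commented-out variant
below. That expression is the harmonic mean of precision and recall and
should agree with sklearn's f1_score; a quick check on made-up labels:

from sklearn.metrics import f1_score, precision_score, recall_score

y_true = [1, 1, 0, 0, 1, 0, 1, 0]
y_pred = [1, 0, 0, 1, 1, 0, 1, 0]

prec = precision_score(y_true, y_pred)       # 0.75
rec = recall_score(y_true, y_pred)           # 0.75
print(2 * (prec * rec) / (prec + rec))       # 0.75
print(f1_score(y_true, y_pred))              # 0.75, same value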
# def make_naive_bayes_selectpercentile(dataset):
# '''fits naive bayes model with StratifiedKFold, uses my BOW
# feature selection: select 0.25-percentile
# '''
# print('# starting naive bayes')
# print()
# # alternative: use only articles' header => may give better results
# X = dataset['Title'] + ' ' + dataset['Text']
# y = dataset['Label']
# # use stratified k-fold cross-validation as split method
# skf = StratifiedKFold(n_splits = 10, shuffle=True)
# classifier = GaussianNB()
# # lists for metrics
# recall_scores = []
# precision_scores = []
# f1_scores = []
# # for each fold
# n = 0
# for train, test in skf.split(X,y):
# # BOW
# vocab = BagOfWords.make_vocab(X[train])
# # fit the training data and then return the matrix
# training_data = BagOfWords.make_matrix(X[train], vocab)
# # transform testing data and return the matrix
# testing_data = BagOfWords.make_matrix(X[test], vocab)
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
# #fit classifier
# classifier.fit(training_data_r, y[train])
# #predict class
# predictions_train = classifier.predict(training_data_r)
# predictions_test = classifier.predict(testing_data_r)
# #store metrics
# rec = recall_score(y[test], predictions_test)
# recall_scores.append(rec)
# prec = precision_score(y[train], predictions_train)
# precision_scores.append(prec)
# # equation for f1 score
# f1_scores.append(2 * (prec * rec)/(prec + rec))
# #print metrics of test set
# print('prediction of testing set:')
# print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'
# .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
# print()
# #print('overfit testing: prediction of training set')
# #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
# #format(min(f1_scores_train), max(f1_scores_train),
# #sum(f1_scores_train)/float(len(f1_scores_train))))
# #print()
def make_naive_bayes_CV(dataset):
'''alternative: uses CountVectorizer (faster)
'''
# alternative: use only articles' header => may give better results
X = dataset['Title'] + '.' + dataset['Text'] + '.'
y = dataset['Label']
# print('# ending naive bayes')
# print()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
# def make_naive_bayes_CV(dataset):
# '''alternative: uses CountVectorizer (faster)
# '''
# # alternative: use only articles' header => may give better results
# X = dataset['Title'] + '.' + dataset['Text'] + '.'
# y = dataset['Label']
count_vector = CountVectorizer()
# # use stratified k-fold cross-validation as split method
# skf = StratifiedKFold(n_splits = 10, shuffle=True)
# count_vector = CountVectorizer()
classifier = GaussianNB()
# classifier = GaussianNB()
# lists for metrics predicted on test/train set
f1_scores, f1_scores_train = []
# # lists for metrics predicted on test/train set
# f1_scores, f1_scores_train = []
# for each fold (10 times)
# fold number
n = 0
for train, test in skf.split(X,y):
# # for each fold (10 times)
# # fold number
# n = 0
# for train, test in skf.split(X,y):
# fit the training data and then return the matrix
training_data = count_vector.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = count_vector.transform(X[test]).toarray()
# # fit the training data and then return the matrix
# training_data = count_vector.fit_transform(X[train], y[train]).toarray()
# # transform testing data and return the matrix
# testing_data = count_vector.transform(X[test]).toarray()
# apply select percentile
selector = SelectPercentile(percentile=25)
selector.fit(training_data, y[train])
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
training_data_r = selector.transform(training_data)
testing_data_r = selector.transform(testing_data)
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
#fit classifier
classifier.fit(training_data_r, y[train])
# #fit classifier
# classifier.fit(training_data_r, y[train])
#predict class
predictions_train = classifier.predict(training_data_r)
predictions_test = classifier.predict(testing_data_r)
# #predict class
# predictions_train = classifier.predict(training_data_r)
# predictions_test = classifier.predict(testing_data_r)
#store metrics predicted on test set
f1_scores.append(f1_score(y[test], predictions_test))
# #store metrics predicted on test set
# f1_scores.append(f1_score(y[test], predictions_test))
#store metrics predicted on train set
f1_scores_train.append(f1_score(y[train], predictions_train))
# #store metrics predicted on train set
# f1_scores_train.append(f1_score(y[train], predictions_train))
#print metrics of test set
print('--------------------')
print('prediction of testing set:')
print('F1 score: min = {}, max = {}, average = {}'.format(min(f1_scores), max(f1_scores),sum(f1_scores)/float(len(f1_scores))))
# #print metrics of test set
# print('--------------------')
# print('prediction of testing set:')
# print('F1 score: min = {}, max = {}, average = {}'
# .format(min(f1_scores), max(f1_scores),
# sum(f1_scores)/float(len(f1_scores))))
print()
print('prediction of training set:')
print('F1 score: min = {}, max = {}, average = {}'.format(min(f1_scores_train), max(f1_scores_train),sum(f1_scores_train)/float(len(f1_scores_train))))
print()
# print()
# print('prediction of training set:')
# print('F1 score: min = {}, max = {}, average = {}'
# .format(min(f1_scores_train), max(f1_scores_train),
# sum(f1_scores_train)/float(len(f1_scores_train))))
# print()
# def analyze_errors_cv(dataset):
# '''calculates resubstitution error
@@ -181,7 +245,8 @@ class NaiveBayes():
# if y_train_test[i] != predictions[i]:
# n += 1
# print('error no.{}'.format(n))
# print('prediction at index {} is: {}, but actual is: {}'.format(i, predictions[i], y_train_test[i]))
# print('prediction at index {} is: {}, but actual is: {}'
# .format(i, predictions[i], y_train_test[i]))
# print(X_train_test[i])
# print(y_train_test[i])
# print()
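
analyze_errors_cv above (left commented out) measures resubstitution error:
fit on a set, predict that same set, and report each index where prediction
and actual label disagree. A minimal runnable sketch of that idea, with
invented data (the labels deliberately conflict so at least one error
appears):

from sklearn.naive_bayes import GaussianNB

X_train_test = [[0.0], [1.0], [0.0], [1.0], [0.5]]
y_train_test = [0, 1, 1, 0, 1]

classifier = GaussianNB()
classifier.fit(X_train_test, y_train_test)
predictions = classifier.predict(X_train_test)

# count and report the misclassified training samples
n = 0
for i in range(len(y_train_test)):
    if y_train_test[i] != predictions[i]:
        n += 1
        print('error no.{}'.format(n))
        print('prediction at index {} is: {}, but actual is: {}'
              .format(i, predictions[i], y_train_test[i]))
print('resubstitution error: {} of {}'.format(n, len(y_train_test)))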