initial project version
commit ecb629e16c
BagOfWords.py
@ -0,0 +1,130 @@
'''
Bag Of Words
============

BagOfWords counts word stems in an article
and adds new words to the global vocabulary.
'''

import re

import pandas as pd

from nltk.stem.porter import PorterStemmer


class BagOfWords():

    def extract_words(text):
        '''takes article as argument, removes numbers and stop words,
        returns list of single word stems, recurrences included.
        '''
        stop_words = BagOfWords.set_stop_words()
        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split str into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            # remove numbers
            if word.isalpha():
                # reduce word to stem
                word = BagOfWords.reduce_word_to_stem(word)
                # check if not stop word
                if word.lower() not in stop_words:
                    # add every word in lowercase
                    words_cleaned.append(word.lower())
        return words_cleaned

    def reduce_word_to_stem(word):
        '''takes normal word as input, returns the word's stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
        word = stemmer.stem(word)
        return word

    def make_matrix(series, vocab):
        '''calculates word stem frequencies in input articles.
        returns matrix (DataFrame) with relative word frequencies (0 <= values <= 1)
        (rows: different articles, columns: different words in vocab)
        '''
        # create list of tuples
        vectors = []
        for i in range(len(series)):
            # extract text of single article
            text = series.iloc[i]
            # extract its words
            words = BagOfWords.extract_words(text)
            # count words in single article
            word_count = len(words)
            vector = []
            # use a separate index (j) so the outer loop variable is not shadowed
            for j, v in enumerate(vocab):
                vector.append(0)
                for w in words:
                    if w == v:
                        # add relative word frequency
                        vector[j] += 1/word_count
            # add single vector as tuple
            vectors.append(tuple(vector))
        df_vectors = pd.DataFrame.from_records(vectors, index=None, columns=vocab)
        return df_vectors

    def make_vocab(series):
        '''adds words of input articles to a global vocabulary.
        input: series of all articles, return value: sorted list of word stems
        '''
        vocab = set()
        for text in series:
            vocab |= set(BagOfWords.extract_words(text))
        # transform to list
        vocab = list(vocab)
        # sort list
        vocab.sort()
        return vocab

    def set_stop_words():
        '''creates set of all words that will be ignored
        '''
        # standard stopwords from nltk.corpus stopwords('english')
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'ain',
                      'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'aren\'t',
                      'as', 'at', 'be', 'because', 'been', 'before', 'being',
                      'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
                      'couldn\'t', 'd', 'did', 'didn', 'didn\'t', 'do', 'does',
                      'doesn', 'doesn\'t', 'doing', 'don', 'don\'t', 'down',
                      'during', 'each', 'few', 'for', 'from', 'further', 'had',
                      'hadn', 'hadn\'t', 'has', 'hasn', 'hasn\'t', 'have', 'haven',
                      'haven\'t', 'having', 'he', 'her', 'here', 'hers', 'herself',
                      'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just', 'll',
                      'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more', 'most',
                      'mustn', 'mustn\'t', 'my', 'myself', 'needn', 'needn\'t',
                      'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once',
                      'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
                      'over', 'own', 're', 's', 'same', 'shan', 'shan\'t', 'she',
                      'she\'s', 'should', 'should\'ve', 'shouldn', 'shouldn\'t',
                      'so', 'some', 'such', 't', 'than', 'that', 'that\'ll', 'the',
                      'their', 'theirs', 'them', 'themselves', 'then', 'there',
                      'these', 'they', 'this', 'those', 'through', 'to', 'too',
                      'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'wasn\'t',
                      'we', 'were', 'weren', 'weren\'t', 'what', 'when', 'where',
                      'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
                      'won\'t', 'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves']

        # add domain specific words
        stop_words.extend(['reuters', 'also', 'monday', 'tuesday', 'wednesday',
                           'thursday', 'friday'])

        # remove the word 'not' from stop words
        stop_words.remove('not')

        for i in range(len(stop_words)):
            # remove punctuation marks and strip endings from abbreviations
            #stop_words[i] = re.split(r'\W', stop_words[i])[0]
            # reduce word to stem
            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
        # transform list to set to eliminate duplicates
        stop_words = set(stop_words)

        return stop_words
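A minimal usage sketch of the BagOfWords interface (illustrative only, not part of this commit; the two sample articles are assumptions):

# sketch: build a vocabulary and a relative-frequency matrix from two sample articles
import pandas as pd
from BagOfWords import BagOfWords

articles = pd.Series([
    'Company A agrees to buy Company B in a takeover deal.',
    'Markets were quiet and shares traded sideways.',
])
vocab = BagOfWords.make_vocab(articles)            # sorted list of word stems
matrix = BagOfWords.make_matrix(articles, vocab)   # one row per article, one column per stem
print(matrix.shape)                                # (2, len(vocab))
# each row sums to 1 here because the vocabulary was built from the same articles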
CsvHandler.py
@ -0,0 +1,28 @@
'''
Csv Handler
===========

CsvHandler writes articles' information to a csv file and reads it.
'''

import csv

import pandas as pd


class CsvHandler():

    def read_csv(csv_file):
        df = pd.read_csv(csv_file,
                         sep='|',
                         header=0,
                         engine='python',
                         usecols=[1,2,4], # use only 'Title', 'Text' and 'Label'
                         decimal='.',
                         quotechar='\'',
                         #nrows = 200,
                         quoting=csv.QUOTE_NONE)
        return df

    def write_csv(df, file_name):
        df.to_csv(file_name, sep='|')
        print('### saved {} articles in {}'.format(len(df), file_name))
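A hedged round-trip sketch for CsvHandler; the file name, column layout, and sample rows are assumptions for illustration. Since write_csv stores the DataFrame index as column 0, 'Title', 'Text' and 'Label' land at positions 1, 2 and 4 in this layout, matching usecols=[1,2,4]:

# sketch: write a small '|'-separated file and read back Title, Text, Label
import pandas as pd
from CsvHandler import CsvHandler

df = pd.DataFrame({'Title': ['Deal announced', 'Quiet session'],
                   'Text': ['Company A buys Company B.', 'Shares barely moved.'],
                   'Site': ['example.com', 'example.com'],
                   'Label': [1, 0]})
CsvHandler.write_csv(df, 'articles.csv')           # hypothetical file name
subset = CsvHandler.read_csv('articles.csv')       # -> columns Title, Text, Label
print(list(subset.columns))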
DecisionTree.py
@ -0,0 +1,112 @@
'''
Decision Tree Classifier
========================

Decision Tree Classifier takes as input two arrays:
array X of size [n_samples, n_features], holding the training samples,
and array y of integer values, size [n_samples],
holding the class labels for the training samples.
'''
import operator

from BagOfWords import BagOfWords
from CsvHandler import CsvHandler

import graphviz
import numpy as np
from sklearn import tree
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold


class DecisionTree():

    def make_tree(dataset):

        print('# starting decision tree')
        print()

        # note: using only the title gives better results, but different important words
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        #count_vector = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # lists for metrics predicted on test/train set
        f1_scores = []
        f1_scores_train = []

        classifier = tree.DecisionTreeClassifier()

        # dict of most important words of each fold
        important_words = {}

        # for each fold
        for train, test in skf.split(X, y):

            # BOW
            vocab = BagOfWords.make_vocab(X[train])
            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(X[train], vocab)
            # transform testing data and return the matrix
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            # #fit the training data and then return the matrix
            # training_data = count_vector.fit_transform(X[train], y[train]).toarray()
            # #transform testing data and return the matrix
            # testing_data = count_vector.transform(X[test]).toarray()

            # # apply select percentile
            # selector = SelectPercentile(percentile=25)
            # selector.fit(training_data, y[train])

            # training_data_r = selector.transform(training_data)
            # testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data, y[train])

            # predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            # store metrics predicted on test/train set
            f1_scores.append(f1_score(y[test], predictions_test))
            f1_scores_train.append(f1_score(y[train], predictions_train))

            # search for important features
            feature_importances = np.array(classifier.feature_importances_)
            important_indices = feature_importances.argsort()[-50:][::-1]

            for i in important_indices:
                if vocab[i] in important_words:
                    important_words[vocab[i]] += feature_importances[i]
                else:
                    important_words[vocab[i]] = feature_importances[i]

        print('20 most important words in training set:')
        print()
        # sort by accumulated importance, descending, so the first 20 are the most important
        sorted_i_w = sorted(important_words.items(), key=operator.itemgetter(1), reverse=True)
        #print(sorted_i_w[:20])
        i_w = [x[0] for x in sorted_i_w]
        print(i_w[:20])

        print()

        # print metrics of test set
        print('prediction of testing set:')
        print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
              format(min(f1_scores), max(f1_scores),
                     sum(f1_scores)/float(len(f1_scores))))
        print()
        # print('overfit testing: prediction of training set')
        # print('F1 score: min = {}, max = {}, average = {}'.
        #       format(min(f1_scores_train), max(f1_scores_train),
        #              sum(f1_scores_train)/float(len(f1_scores_train))))
        # print()

        print('# ending decision tree')
        print()
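A hypothetical driver sketch showing how make_tree is meant to be called; the csv file name is an assumption, and the file must provide 'Title', 'Text' and 'Label' columns:

# sketch: load labelled articles and run the decision tree experiment
from CsvHandler import CsvHandler
from DecisionTree import DecisionTree

dataset = CsvHandler.read_csv('labelled_articles.csv')   # hypothetical file name
DecisionTree.make_tree(dataset)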
FilterKeywords.py
@ -0,0 +1,59 @@
'''
Filter Keywords
===============

FilterKeywords searches for merger-specific keywords
in an article and counts them.
'''

import re

from nltk.stem.porter import PorterStemmer


class FilterKeywords():

    def search_keywords(dict_input):
        '''extracts relevant key-value pairs from an article's input dictionary.
        output are the contained keywords and their count.
        '''

        keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers', 'acquisition',
                        'acquire', 'acquisitions', 'acquires', 'combine', 'combines',
                        'combination', 'combined', 'joint', 'venture', 'JV', 'takeover',
                        'take-over', 'tie-up', 'deal', 'deals', 'transaction', 'transactions',
                        'approve', 'approves', 'approved', 'approving', 'approval',
                        'approvals', 'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',
                        'purchase', 'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']

        # reduce words to stem
        stemmer = PorterStemmer()
        for i in range(len(keyword_list)):
            keyword_list[i] = stemmer.stem(keyword_list[i])

        # remove duplicates
        keywords = set(keyword_list)

        # counts of keywords in article
        dict_keywords = {}

        # search for matches in dictionary of input article
        for key in dict_input.keys():
            # iterate over all keyword stems, used as regular expressions
            for kword in keywords:
                if re.match(kword, key):
                    # if match, increase value of matching key
                    if str(kword) in dict_keywords:
                        dict_keywords[str(kword)] += dict_input[key]
                    else:
                        dict_keywords[str(kword)] = dict_input[key]

        return dict_keywords

    def count_keywords(dict_keywords):
        '''input: dict with article's keywords (key) and their count (value).
        returns number of keywords that are found.
        '''
        return sum(dict_keywords.values())
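A small illustration of search_keywords on an assumed word-count dictionary (the counts are made up); keys are matched against the stemmed keyword list, so e.g. 'acquire' and 'acquisitions' reduce to the stems 'acquir' and 'acquisit':

# sketch: count merger keywords in a word-frequency dict of one article
from FilterKeywords import FilterKeywords

word_counts = {'acquir': 3, 'market': 5, 'buy': 1, 'stock': 2}
found = FilterKeywords.search_keywords(word_counts)
print(found)                                    # e.g. {'acquir': 3, 'buy': 1}
print(FilterKeywords.count_keywords(found))     # 4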
NaiveBayes.py
@ -0,0 +1,191 @@
'''
Naive Bayes Classifier
======================

Naive Bayes is a probabilistic classifier that is able to predict,
given an observation of an input, a probability distribution over a set of classes,
rather than only outputting the most likely class that the observation should belong to.
'Naive' means that it assumes that the value of a particular feature
(word in an article) is independent of the value of any other feature,
given the class variable (label). Each feature is considered
to contribute independently to the probability that an article belongs to its category,
regardless of any possible correlations between these features.
'''
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


# toDo: for Julian, try it without SelectPercentile first


class NaiveBayes():

    def make_naive_bayes(dataset):
        '''fits naive bayes model with StratifiedKFold, uses my BOW
        '''

        print('# starting naive bayes')
        print()

        # alternative: use only articles' header => may give better results
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        classifier = GaussianNB()

        # lists for metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # for each fold
        for train, test in skf.split(X, y):
            # BOW
            vocab = BagOfWords.make_vocab(X[train])
            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(X[train], vocab)
            # transform testing data and return the matrix
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            # apply select percentile
            selector = SelectPercentile(percentile=25)
            selector.fit(training_data, y[train])

            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])
            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # store metrics predicted on test set
            rec = recall_score(y[test], predictions_test)
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test)
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))

        # print metrics of test set
        print('prediction of testing set:')
        print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'
              .format(min(f1_scores), max(f1_scores),
                      sum(f1_scores)/float(len(f1_scores))))
        print()
        #print('overfit testing: prediction of training set')
        #print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
        #      format(min(f1_scores_train), max(f1_scores_train),
        #             sum(f1_scores_train)/float(len(f1_scores_train))))
        #print()

        print('# ending naive bayes')
        print()

    def make_naive_bayes_CV(dataset):
        '''alternative: uses CountVectorizer (faster)
        '''
        # alternative: use only articles' header => may give better results
        X = dataset['Title'] + '.' + dataset['Text'] + '.'
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        count_vector = CountVectorizer()

        classifier = GaussianNB()

        # lists for metrics predicted on test/train set
        f1_scores = []
        f1_scores_train = []

        # for each fold (10 times)
        for train, test in skf.split(X, y):

            # fit the training data and then return the matrix
            training_data = count_vector.fit_transform(X[train], y[train]).toarray()
            # transform testing data and return the matrix
            testing_data = count_vector.transform(X[test]).toarray()

            # apply select percentile
            selector = SelectPercentile(percentile=25)
            selector.fit(training_data, y[train])

            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])

            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # store metrics predicted on test set
            f1_scores.append(f1_score(y[test], predictions_test))

            # store metrics predicted on train set
            f1_scores_train.append(f1_score(y[train], predictions_train))

        # print metrics of test set
        print('--------------------')
        print('prediction of testing set:')
        print('F1 score: min = {}, max = {}, average = {}'
              .format(min(f1_scores), max(f1_scores),
                      sum(f1_scores)/float(len(f1_scores))))

        print()
        print('prediction of training set:')
        print('F1 score: min = {}, max = {}, average = {}'
              .format(min(f1_scores_train), max(f1_scores_train),
                      sum(f1_scores_train)/float(len(f1_scores_train))))
        print()

    # def analyze_errors_cv(dataset):
    #     '''calculates resubstitution error
    #     shows indices of false classified articles
    #     uses Gaussian Bayes with train test split
    #     '''

    #     X_train_test = dataset['Text']
    #     y_train_test = dataset['Label']

    #     count_vector = CountVectorizer()

    #     # fit the training data and then return the matrix
    #     training_data = count_vector.fit_transform(X_train_test).toarray()

    #     # transform testing data and return the matrix
    #     testing_data = count_vector.transform(X_train_test).toarray()

    #     # Naive Bayes
    #     classifier = GaussianNB()

    #     # fit classifier
    #     classifier.fit(training_data, y_train_test)

    #     # predict class
    #     predictions = classifier.predict(testing_data)

    #     print()
    #     print('errors at index:')
    #     n = 0
    #     for i in range(len(y_train_test)):
    #         if y_train_test[i] != predictions[i]:
    #             n += 1
    #             print('error no.{}'.format(n))
    #             print('prediction at index {} is: {}, but actual is: {}'
    #                   .format(i, predictions[i], y_train_test[i]))
    #             print(X_train_test[i])
    #             print(y_train_test[i])
    #             print()

    #     print()
    #     # print metrics
    #     print('F1 score: ', format(f1_score(y_train_test, predictions)))
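A toy illustration of the 'naive' independence assumption described in the module docstring; the class priors and per-word likelihoods are made-up numbers, and note that GaussianNB above models word frequencies with per-class Gaussians rather than the discrete probabilities used here:

# sketch: class scores multiply per-word likelihoods under the independence assumption
priors = {'merger': 0.3, 'other': 0.7}                 # assumed class priors
likelihoods = {                                        # assumed P(word | class)
    'merger': {'acquir': 0.20, 'market': 0.05},
    'other':  {'acquir': 0.01, 'market': 0.10},
}
article = ['acquir', 'market']
scores = {}
for label in priors:
    score = priors[label]
    for word in article:
        score *= likelihoods[label][word]              # features contribute independently
    scores[label] = score
total = sum(scores.values())
print({label: round(s / total, 2) for label, s in scores.items()})
# {'merger': 0.81, 'other': 0.19}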