initial project version

2018-09-05 14:08:13 +02:00 · 2018-09-05 14:08:13 +02:00 · ecb629e16c
commit ecb629e16c
5 changed files with 520 additions and 0 deletions
--- a/BagOfWords.py
+++ b/BagOfWords.py
@ -0,0 +1,130 @@
 '''
 Bag Of Words
 ============
 BagOfWords counts word stems in an article
 and adds new words to the global vocabulary. 
 '''
 import re
 import pandas as pd
 from nltk.stem.porter import PorterStemmer
 class BagOfWords():
     '''Static helpers that turn article texts into bag-of-words features.

     Every method is a static method and is meant to be called on the
     class, e.g. ``BagOfWords.make_vocab(series)``.
     '''

     # shared PorterStemmer instance, created lazily on first use
     _stemmer = None
     # cached frozenset of stemmed stop words, built lazily on first use
     _stop_words = None

     @staticmethod
     def extract_words(text):
         '''Split an article into its cleaned word stems.

         text: article text as str.
         Returns a list of lowercase word stems, recurrences included.
         Tokens that are not purely alphabetic (e.g. numbers) and stop
         words are dropped.
         '''
         stop_words = BagOfWords.set_stop_words()
         words_cleaned = []
         # replace every non-word character with a space, then split
         for word in re.sub(r'\W', ' ', text).split():
             # skip numbers and every other token that is not purely alphabetic
             if not word.isalpha():
                 continue
             # the stop words are stored as stems, so stem before comparing
             stem = BagOfWords.reduce_word_to_stem(word).lower()
             if stem not in stop_words:
                 words_cleaned.append(stem)
         return words_cleaned

     @staticmethod
     def reduce_word_to_stem(word):
         '''Return the Porter stem of a single word.
         '''
         # one stemmer is shared by all calls instead of building one per word
         if BagOfWords._stemmer is None:
             BagOfWords._stemmer = PorterStemmer()
         return BagOfWords._stemmer.stem(word)

     @staticmethod
     def make_matrix(series, vocab):
         '''Calculate relative word stem frequencies of the input articles.

         series: iterable of article texts (e.g. a pandas Series).
         vocab: list of word stems; defines the columns and their order.
         Returns a DataFrame with one row per article and one column per
         vocab entry. A cell holds the share of the article's cleaned words
         that equal the column's stem (0 <= value <= 1); stems that do not
         occur in the article are stored as 0.
         '''
         from collections import Counter

         vectors = []
         for text in series:
             words = BagOfWords.extract_words(text)
             word_count = len(words)
             # count every stem once instead of rescanning the article
             # for each vocab entry
             counts = Counter(words)
             # an article without usable words has an empty counter, so the
             # division is never evaluated and the row is all zeros
             vectors.append(tuple(
                 counts[v] / word_count if v in counts else 0
                 for v in vocab))
         return pd.DataFrame.from_records(vectors, index=None, columns=vocab)

     @staticmethod
     def make_vocab(series):
         '''Build the vocabulary of all input articles.

         series: iterable of article texts (e.g. a pandas Series).
         Returns the alphabetically sorted list of distinct word stems.
         '''
         vocab = set()
         for text in series:
             vocab.update(BagOfWords.extract_words(text))
         return sorted(vocab)

     @staticmethod
     def set_stop_words():
         '''Create the set of all word stems that will be ignored.

         The stems are computed on the first call only and cached; every
         call returns a fresh set, so callers may modify the result.
         '''
         if BagOfWords._stop_words is None:
             # standard stopwords from nltk.corpus stopwords('english')
             # NOTE(review): entries containing an apostrophe can never equal
             # a token, because extract_words splits on non-word characters;
             # their parts ('aren', 't', ...) are listed separately.
             stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'ain',
                           'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'aren\'t',
                           'as', 'at', 'be', 'because', 'been', 'before', 'being',
                           'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
                           'couldn\'t', 'd', 'did', 'didn', 'didn\'t', 'do', 'does',
                           'doesn', 'doesn\'t', 'doing', 'don', 'don\'t', 'down',
                           'during', 'each', 'few', 'for', 'from', 'further', 'had',
                           'hadn', 'hadn\'t', 'has', 'hasn', 'hasn\'t', 'have', 'haven',
                           'haven\'t', 'having', 'he', 'her', 'here', 'hers', 'herself',
                           'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                           'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just', 'll',
                           'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more', 'most',
                           'mustn', 'mustn\'t', 'my', 'myself', 'needn', 'needn\'t',
                           'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once',
                           'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
                           'over', 'own', 're', 's', 'same', 'shan', 'shan\'t', 'she',
                           'she\'s', 'should', 'should\'ve', 'shouldn', 'shouldn\'t',
                           'so', 'some', 'such', 't', 'than', 'that', 'that\'ll', 'the',
                           'their', 'theirs', 'them', 'themselves', 'then', 'there',
                           'these', 'they', 'this', 'those', 'through', 'to', 'too',
                           'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'wasn\'t',
                           'we', 'were', 'weren', 'weren\'t', 'what', 'when', 'where',
                           'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
                           'won\'t', 'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                           'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves']
             # add specific words
             stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
                                'wednesday', 'thursday', 'friday'])
             # 'not' is deliberately kept as a regular word
             stop_words.remove('not')
             # reduce every stop word to its stem; the set removes duplicates
             BagOfWords._stop_words = frozenset(
                 BagOfWords.reduce_word_to_stem(word) for word in stop_words)
         return set(BagOfWords._stop_words)
--- a/CsvHandler.py
+++ b/CsvHandler.py
@ -0,0 +1,28 @@
 '''
 Csv Handler
 ===========
 CsvHandler writes articles' information to csv file and reads it.
 '''
 import csv
 import pandas as pd
 class CsvHandler():
     '''Static helpers for reading and writing the pipe-separated article files.'''

     @staticmethod
     def read_csv(csv_file):
         '''Read articles from a pipe-separated csv file.

         csv_file: path or file-like object accepted by pandas.read_csv.
         Only the columns at positions 1, 2 and 4 are loaded; in files
         written by write_csv these are expected to be 'Title', 'Text'
         and 'Label' (position 0 being the saved index).
         Returns the loaded DataFrame.
         '''
         df = pd.read_csv(csv_file,
                          sep='|',
                          header=0,
                          engine='python',
                          usecols=[1, 2, 4],  # use only 'Title', 'Text' and 'Label'
                          decimal='.',
                          quotechar='\'',
                          # quotes are treated as ordinary characters
                          quoting=csv.QUOTE_NONE)
         return df

     @staticmethod
     def write_csv(df, file_name):
         '''Write the DataFrame (index included) to a pipe-separated csv file.

         df: DataFrame holding the articles.
         file_name: target path or writable file-like object.
         Prints a short confirmation with the number of saved articles.
         '''
         df.to_csv(file_name, sep='|')
         print('### saved {} articles in {}'.format(len(df), file_name))
--- a/DecisionTree.py
+++ b/DecisionTree.py
@ -0,0 +1,112 @@
 '''
 Decision Tree Classifier
 ========================
 Decision Tree Classifier takes as input two arrays: 
 array X of size [n_samples, n_features], holding the training samples,
 and array y of integer values, size [n_samples], 
 holding the class labels for the training samples.
 '''
 import operator
 from BagOfWords import BagOfWords 
 from CsvHandler import CsvHandler
 import graphviz
 import numpy as np
 from sklearn import tree
 #from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import f1_score
 from sklearn.model_selection import StratifiedKFold
 class DecisionTree():
     '''Decision tree text classifier, evaluated with stratified k-fold
     cross-validation on bag-of-words features.
     '''

     @staticmethod
     def make_tree(dataset):
         '''Train and evaluate a decision tree on the given articles.

         dataset: DataFrame with the columns 'Title', 'Text' and 'Label'.
         Runs a shuffled, stratified 10-fold cross-validation. Prints the
         20 words with the highest feature importance (summed over the
         top 50 words of every fold) and the min/max/average F1 score of
         the test folds. Returns nothing.
         '''
         print('# starting decision tree')
         print()
         # note: better results with only title, but other important words
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits=10, shuffle=True)
         # lists for metrics predicted on test/train set
         f1_scores = []
         # collected for overfitting checks; printing is disabled below
         f1_scores_train = []
         classifier = tree.DecisionTreeClassifier()
         # accumulated importance of the most important words of each fold
         important_words = {}
         # for each fold
         for train, test in skf.split(X, y):
             # split() yields positional indices, hence .iloc; plain []
             # would look them up as labels and break on a non-default index
             X_train, X_test = X.iloc[train], X.iloc[test]
             y_train, y_test = y.iloc[train], y.iloc[test]
             # the vocabulary is built from the training fold only
             vocab = BagOfWords.make_vocab(X_train)
             training_data = BagOfWords.make_matrix(X_train, vocab)
             # the test fold is mapped onto the training vocabulary
             testing_data = BagOfWords.make_matrix(X_test, vocab)
             # fit classifier
             classifier.fit(training_data, y_train)
             # predict class
             predictions_train = classifier.predict(training_data)
             predictions_test = classifier.predict(testing_data)
             # store metrics predicted on test/train set
             # NOTE(review): f1_score defaults to binary labels with
             # positive class 1 - confirm this matches 'Label'
             f1_scores.append(f1_score(y_test, predictions_test))
             f1_scores_train.append(f1_score(y_train, predictions_train))
             # add up the importances of the fold's 50 most important words
             feature_importances = np.array(classifier.feature_importances_)
             important_indices = feature_importances.argsort()[-50:][::-1]
             for i in important_indices:
                 word = vocab[i]
                 important_words[word] = (important_words.get(word, 0)
                                          + feature_importances[i])
         print('20 most important words in training set:')
         print()
         # highest accumulated importance first
         sorted_i_w = sorted(important_words.items(),
                             key=operator.itemgetter(1), reverse=True)
         i_w = [x[0] for x in sorted_i_w]
         print(i_w[:20])
         print()
         # print metrics of test set
         print('prediction of testing set:')
         # one automatically numbered field per value; repeating '{0}'
         # would print the minimum three times
         print('F1 score: min = {:.2f}, max = {:.2f}, average = {:.2f}'.
               format(min(f1_scores), max(f1_scores),
                      sum(f1_scores)/float(len(f1_scores))))
         print()
         # print('overfit testing: prediction of training set')
         # print('F1 score: min = {}, max = {}, average = {}'.
         #       format(min(f1_scores_train), max(f1_scores_train),
         #              sum(f1_scores_train)/float(len(f1_scores_train))))
         # print()
         print('# ending decision tree')
         print()
--- a/FilterKeywords.py
+++ b/FilterKeywords.py
@ -0,0 +1,59 @@
 '''
 Filter Keywords
 ===============
 FilterKeywords searches for merger specific keywords 
 in an article and counts them.
 '''
 import re
 from nltk.stem.porter import PorterStemmer
 class FilterKeywords():
     '''Static helpers that find and count merger specific keywords.'''

     @staticmethod
     def search_keywords(dict_input):
         '''Extract the merger related key-value pairs of an article.

         dict_input: dict that maps the article's words to their count
         (presumably lowercase word stems - confirm against the caller).
         Returns a dict that maps every matching keyword stem to the summed
         count of all input words that start with this stem.
         '''
         keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers', 'acquisition',
                         'acquire', 'acquisitions', 'acquires', 'combine', 'combines',
                         'combination', 'combined', 'joint', 'venture', 'JV', 'takeover',
                         'take-over', 'tie-up', 'deal', 'deals', 'transaction', 'transactions',
                         'approve', 'approves', 'approved', 'approving', 'approval',
                         'approvals', 'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',
                         'purchase', 'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']
         # reduce keywords to their stems; the set removes duplicates
         stemmer = PorterStemmer()
         keywords = {stemmer.stem(word) for word in keyword_list}
         # counts keywords in article
         dict_keywords = {}
         # search for matchings in dictionary of input article
         for key, count in dict_input.items():
             for kword in keywords:
                 # the stem is used as a regular expression anchored at the
                 # start of the key, so every key beginning with it matches.
                 # NOTE(review): a key that starts with several stems is
                 # counted once under each of them - confirm this is intended
                 if re.match(kword, key):
                     dict_keywords[kword] = dict_keywords.get(kword, 0) + count
         return dict_keywords

     @staticmethod
     def count_keywords(dict_keywords):
         '''Return the total number of keyword occurrences.

         dict_keywords: dict with the article's keywords (key)
         and their count (value).
         '''
         return sum(dict_keywords.values())
--- a/NaiveBayes.py
+++ b/NaiveBayes.py
@ -0,0 +1,191 @@
 '''
 Naive Bayes Classifier
 ====================== 
 Naive Bayes is a probabilistic classifier that is able to predict, 
 given an observation of an input, a probability distribution over a set of classes, 
 rather than only outputting the most likely class that the observation should belong to.
 'Naive' means, that it assumes that the value of a particular feature 
 (word in an article) is independent of the value of any other feature, 
 given the class variable (label). It considers each of these features 
 to contribute independently to the probability that it belongs to its category,
 regardless of any possible correlations between these features. 
 '''
 from BagOfWords import BagOfWords
 from CsvHandler import CsvHandler
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import recall_score, precision_score
 from sklearn.model_selection import StratifiedKFold
 from sklearn.model_selection import train_test_split
 from sklearn.naive_bayes import GaussianNB
 # TODO: for Julian, do it without SelectPercentile for now
 class NaiveBayes():
     '''Gaussian naive Bayes text classifier, evaluated with stratified
     k-fold cross-validation.
     '''

     @staticmethod
     def make_naive_bayes(dataset):
         '''Fit and evaluate naive Bayes using the project's own BagOfWords.

         dataset: DataFrame with the columns 'Title', 'Text' and 'Label'.
         Runs a shuffled, stratified 10-fold cross-validation, keeps the
         best 25 percent of the features of every fold and prints the
         min/max/average F1 score of the test folds. Returns nothing.
         '''
         print('# starting naive bayes')
         print()
         # alternative: use only articles' header => may give better results
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits=10, shuffle=True)
         classifier = GaussianNB()
         # lists for metrics
         recall_scores = []
         precision_scores = []
         f1_scores = []
         # for each fold
         for train, test in skf.split(X, y):
             # split() yields positional indices, hence .iloc; plain []
             # would look them up as labels and break on a non-default index
             X_train, X_test = X.iloc[train], X.iloc[test]
             y_train, y_test = y.iloc[train], y.iloc[test]
             # the vocabulary is built from the training fold only
             vocab = BagOfWords.make_vocab(X_train)
             training_data = BagOfWords.make_matrix(X_train, vocab)
             # the test fold is mapped onto the training vocabulary
             testing_data = BagOfWords.make_matrix(X_test, vocab)
             # apply select percentile
             selector = SelectPercentile(percentile=25)
             selector.fit(training_data, y_train)
             training_data_r = selector.transform(training_data)
             testing_data_r = selector.transform(testing_data)
             # fit classifier
             classifier.fit(training_data_r, y_train)
             # predict class
             predictions_test = classifier.predict(testing_data_r)
             # store metrics; precision and recall must both come from the
             # test fold, otherwise the F1 score mixes two different sets
             rec = recall_score(y_test, predictions_test)
             recall_scores.append(rec)
             prec = precision_score(y_test, predictions_test)
             precision_scores.append(prec)
             # equation for f1 score (defined as 0 when both parts are 0)
             if prec + rec > 0:
                 f1_scores.append(2 * (prec * rec)/(prec + rec))
             else:
                 f1_scores.append(0.0)
         # print metrics of test set
         print('prediction of testing set:')
         # one automatically numbered field per value; repeating '{0}'
         # would print the minimum three times
         print('F1 score: min = {:.2f}, max = {:.2f}, average = {:.2f}'
               .format(min(f1_scores), max(f1_scores),
                       sum(f1_scores)/float(len(f1_scores))))
         print()
         print('# ending naive bayes')
         print()

     @staticmethod
     def make_naive_bayes_CV(dataset):
         '''Alternative to make_naive_bayes that uses CountVectorizer (faster).

         dataset: DataFrame with the columns 'Title', 'Text' and 'Label'.
         Runs a shuffled, stratified 10-fold cross-validation, keeps the
         best 25 percent of the features of every fold and prints the
         min/max/average F1 score of the test and of the training folds.
         Returns nothing.
         '''
         # f1_score is not among the module level imports
         from sklearn.metrics import f1_score

         # alternative: use only articles' header => may give better results
         X = dataset['Title'] + '.' + dataset['Text'] + '.'
         y = dataset['Label']
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits=10, shuffle=True)
         count_vector = CountVectorizer()
         classifier = GaussianNB()
         # lists for metrics predicted on test/train set
         f1_scores = []
         f1_scores_train = []
         # for each fold (10 times)
         for train, test in skf.split(X, y):
             # split() yields positional indices, hence .iloc
             X_train, X_test = X.iloc[train], X.iloc[test]
             y_train, y_test = y.iloc[train], y.iloc[test]
             # fit the training data and then return the matrix
             training_data = count_vector.fit_transform(X_train, y_train).toarray()
             # transform testing data and return the matrix
             testing_data = count_vector.transform(X_test).toarray()
             # apply select percentile
             selector = SelectPercentile(percentile=25)
             selector.fit(training_data, y_train)
             training_data_r = selector.transform(training_data)
             testing_data_r = selector.transform(testing_data)
             # fit classifier
             classifier.fit(training_data_r, y_train)
             # predict class
             predictions_train = classifier.predict(training_data_r)
             predictions_test = classifier.predict(testing_data_r)
             # store metrics predicted on test set
             f1_scores.append(f1_score(y_test, predictions_test))
             # store metrics predicted on train set
             f1_scores_train.append(f1_score(y_train, predictions_train))
         # print metrics of test set
         print('--------------------')
         print('prediction of testing set:')
         print('F1 score: min = {}, max = {}, average = {}'.format(min(f1_scores), max(f1_scores),sum(f1_scores)/float(len(f1_scores))))
         print()
         print('prediction of training set:')
         print('F1 score: min = {}, max = {}, average = {}'.format(min(f1_scores_train), max(f1_scores_train),sum(f1_scores_train)/float(len(f1_scores_train))))
         print()
    # def analyze_errors_cv(dataset):
        # '''calculates resubstitution error
        # shows indices of false classified articles
        # uses Gaussian Bayes with train test split
        # '''
        # X_train_test = dataset['Text']
        # y_train_test = dataset['Label']
        # count_vector = CountVectorizer()
        # # fit the training data and then return the matrix
        # training_data = count_vector.fit_transform(X_train_test).toarray()
        # # transform testing data and return the matrix
        # testing_data = count_vector.transform(X_train_test).toarray()
        # # Naive Bayes
        # classifier = GaussianNB()
        # # fit classifier
        # classifier.fit(training_data, y_train_test)
        # # Predict class
        # predictions = classifier.predict(testing_data)
        # print()
        # print('errors at index:')
        # n = 0
        # for i in range(len(y_train_test)):
            # if y_train_test[i] != predictions[i]:
                # n += 1
                # print('error no.{}'.format(n))
                # print('prediction at index {} is: {}, but actual is: {}'.format(i, predictions[i], y_train_test[i]))
                # print(X_train_test[i])
                # print(y_train_test[i])
                # print()
        # print()
        # #print metrics               
        # print('F1 score: ', format(f1_score(y_train_test, predictions)))