SVM.py, NaiveBayes.py: built-in grid search, pipeline

2018-09-12 14:21:50 +02:00 · 2018-09-12 14:21:50 +02:00 · 52146158e2
commit 52146158e2
parent 1195a161d6
5 changed files with 230 additions and 249 deletions
--- a/BagOfWords.py
+++ b/BagOfWords.py
@ -14,6 +14,12 @@ from nltk.stem.porter import PorterStemmer

 class BagOfWords:

+    def fit_transform(X, relative_word_frequencies=True):
+        '''similar to CountVectorizer's fit_transform: builds the vocab of X,
+        returns its matrix (relative frequencies by default, else counts)'''
+        vocab = BagOfWords.make_vocab(X)
+        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)
+
    def extract_words(text):
        '''takes article as argument, removes numbers,
        returns list of single words, recurrences included.
@ -37,17 +43,17 @@ class BagOfWords:
        return words_cleaned
        
    def reduce_word_to_stem(word):
-        '''takes normal word as input, returns the word's word stem
+        '''takes a single word as input, returns its PorterStemmer stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
        word = stemmer.stem(word)           
        return word
        
-    def make_matrix(series, vocab):
+    def make_matrix(series, vocab, relative_word_frequencies):
        '''calculates word stem frequencies in input articles.
        returns matrix (DataFrame) with relative word frequencies 
-        (0 <= values < 1)
+        (0 <= values < 1) or absolute word frequencies (int).
        (rows: different articles, columns: different words in vocab)
        '''
        # create list of tuples
@ -64,8 +70,13 @@ class BagOfWords:
                vector.append(0)
                for w in words:
                    if w == v:
-                        # add relative word frequency
-                        vector[i] += 1/word_count
+                        if relative_word_frequencies:
+                            # relative word frequency
+                            vector[i] += 1/word_count
+                        else:
+                            # absolute word frequency
+                            vector[i] += 1
+                            
            # add single vector as tuple
            vectors.append(tuple(vector))           
        df_vectors = pd.DataFrame.from_records(vectors, 
@ -89,7 +100,7 @@ class BagOfWords:
    def set_stop_words():
        '''creates list of all words that will be ignored
        '''   
-        # standard stopwords from nltk.corpus stopwords('english')
+        # stopwords
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 
                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                      'aren\'t', 'as', 'at', 'be', 'because', 'been', 
@ -119,13 +130,12 @@ class BagOfWords:
                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll', 
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 
                      'yourselves']    
-                        
-        # add specific words
-        stop_words.extend(['reuters', 'also', 'monday', 'tuesday', 
-                           'wednesday', 'thursday', 'friday'])    
-
-        # => does this make sense?:
-        # remove the word 'not' from stop words
+               
+        ##=> does this make sense?:
+        #add specific words
+        #stop_words.extend(['reuters', 'also', 'monday', 'tuesday', 
+        #                   'wednesday', 'thursday', 'friday'])          
+        #remove the word 'not' from stop words
        #stop_words.remove('not')       
        
        for i in range(len(stop_words)):
--- a/DecisionTree.py
+++ b/DecisionTree.py
@ -9,8 +9,7 @@ holding the class labels for the training samples.
 '''
 import operator

-from BagOfWords import BagOfWords 
-from CsvHandler import CsvHandler
+from BagOfWords import BagOfWords

 import graphviz
 import numpy as np
@ -25,9 +24,8 @@ class DecisionTree:
    def make_tree(dataset):
    
        print('# starting decision tree')
-        print()
+        print('#')
    
-        # note: better results with only title, but other important words
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']
        
@ -94,7 +92,6 @@ class DecisionTree:
        #print(sorted_i_w)[:20]
        i_w = [x[0] for x in sorted_i_w]
        print(i_w[:20])
-
        print()
          
        #print metrics of test set    
@ -109,4 +106,4 @@ class DecisionTree:
        # print()
        
        print('# ending decision tree')
-        print()
+        print('#')
--- a/NaiveBayes.py
+++ b/NaiveBayes.py
@ -11,246 +11,129 @@ given the label. It considers each of these features to contribute
 independently to the probability that it belongs to its category,
 regardless of any possible correlations between these features. 
 '''
-from BagOfWords import BagOfWords
-from CsvHandler import CsvHandler

-#from sklearn.feature_extraction.text import CountVectorizer
-#from sklearn.feature_selection import SelectPercentile
-from sklearn.metrics import recall_score, precision_score
+#!!
+# The multinomial Naive Bayes classifier is suitable 
+#for classification with discrete features (e.g., 
+#word counts for text classification). 
+#The multinomial distribution normally requires 
+#integer feature counts. However, in practice, 
+#fractional counts such as tf-idf may also work.
+
+# => only taken into account when using our own BOW
+
+from BagOfWords import BagOfWords
+
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectPercentile
+from sklearn.metrics import f1_score, make_scorer
 from sklearn.model_selection import StratifiedKFold
-#from sklearn.model_selection import train_test_split
-from sklearn.naive_bayes import GaussianNB
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
+from sklearn.naive_bayes import MultinomialNB
+
+# make_naive_bayes uses MultinomialNB instead of GaussianNB => OK?
+from sklearn.naive_bayes import GaussianNB  # still required by analyze_errors

 class NaiveBayes:

    def make_naive_bayes(dataset):
-        '''fits naive bayes model with StratifiedKFold, 
-        uses my BOW
+        '''fits naive bayes model
        '''           
        print('# starting naive bayes')
-        print()
+        print('#')
        
-        # join title and text
+        # split data into text and label set
        X = dataset['Title'] + ' ' + dataset['Text']        
        y = dataset['Label']
        
+        # Bag of Words
+        print('# calculating bag of words')
+        print('#')
+        
+        # fit the training data and then return the matrix     
+        
+        # TODO: why are the values so different (worse) with my own BOW?
+        #X = BagOfWords.fit_transform(X, False)
+        
+        X = CountVectorizer().fit_transform(X).toarray()
+        
        # use stratified k-fold cross-validation as split method
-        skf = StratifiedKFold(n_splits = 10, shuffle=True)      
-        
-        classifier = GaussianNB()    
-        
-        # lists for metrics
-        recall_scores = []
-        precision_scores = []
-        f1_scores = []
-        
-        # for each fold
-        n = 0
-        for train, test in skf.split(X,y):                
-            # BOW
-            vocab = BagOfWords.make_vocab(X[train])           
-            # fit the training data and then return the matrix
-            training_data = BagOfWords.make_matrix(X[train], vocab)
-            # transform testing data and return the matrix
-            testing_data = BagOfWords.make_matrix(X[test], vocab)
-            
-            #fit classifier
-            classifier.fit(training_data, y[train])            
-            #predict class                      
-            predictions_train = classifier.predict(training_data)
-            predictions_test = classifier.predict(testing_data)
-            
-            #store metrics
-            rec = recall_score(y[test], predictions_test)
-            recall_scores.append(rec)  
-            prec = precision_score(y[train], predictions_train)
-            precision_scores.append(prec)
-            # equation for f1 score
-            f1_scores.append(2 * (prec * rec)/(prec + rec))
-            
-        #print metrics of test set    
-        print('prediction of testing set:')
-        print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
-                .format(min(f1_scores), max(f1_scores), 
-                        sum(f1_scores)/float(len(f1_scores))))       
-        print()
-        #print('overfit testing: prediction of training set')
-        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
-        #format(min(f1_scores_train), max(f1_scores_train),
-        #sum(f1_scores_train)/float(len(f1_scores_train))))
-        #print() 
+        skf = StratifiedKFold(n_splits = 10, shuffle=True)        

+        # use only most important features
+        selector = SelectPercentile()  
+        
+        pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])
+        
+        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
+                                       'NB__alpha': [0.00000001, 0.0000001, 
+                                                     0.000001, 0.00001, 
+                                                     0.0001, 0.001, 0.01, 
+                                                     0.1]},
+                                       cv=skf, 
+                                       scoring=make_scorer(f1_score))
+            
+        print('# fit classifier')
+        print('#') 
+  
+        grid.fit(X,y)
+        
+        # cross-validation results of the grid search (grid.cv_results_)
+        df_results = grid.cv_results_
+        
+        # print results
+        ######################
+        print('RESULTS:')
+        print('#')
+        print('mean_test_score:')
+        print(df_results['mean_test_score'])
+        print('#')
+        print('mean of means:')
+        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
+        print('#')
+        print('best score:') 
+        print(grid.best_score_)
+        print('#')
+        print('best parameters set found on development set:')
+        print(grid.best_params_)
+        print('#')
+        
        print('# ending naive bayes')
+        print('#')
+        
+    def analyze_errors(dataset):
+        '''calculates resubstitution error
+        shows indices of misclassified articles
+        uses Gaussian Naive Bayes, fitted and evaluated on the same data
+        '''   
+        X_train_test = dataset['Title'] + ' ' + dataset['Text']      
+        y_train_test = dataset['Label']
+        
+        count_vector = CountVectorizer()       
+        # fit the training data and then return the matrix
+        training_data = count_vector.fit_transform(X_train_test).toarray()
+        # transform testing data and return the matrix
+        testing_data = count_vector.transform(X_train_test).toarray()
+
+        # Naive Bayes
+        classifier = GaussianNB()      
+        # fit classifier
+        classifier.fit(training_data, y_train_test)
+        
+        # Predict class
+        predictions = classifier.predict(testing_data)        
+        print('Errors at index:')
        print()
-        
-    # def make_naive_bayes_selectpercentile(dataset):
-        # '''fits naive bayes model with StratifiedKFold, uses my BOW
-        # feature selection: select 0.25-percentile
-        # '''     
-        
-        # print('# starting naive bayes')
-        # print()
-        
-        # # alternative: use only articles' header => may give better results
-        # X = dataset['Title'] + ' ' + dataset['Text']        
-        # y = dataset['Label']
-        
-        # # use stratified k-fold cross-validation as split method
-        # skf = StratifiedKFold(n_splits = 10, shuffle=True)      
-        
-        # classifier = GaussianNB()    
-        
-        # # lists for metrics
-        # recall_scores = []
-        # precision_scores = []
-        # f1_scores = []
-        
-        # # for each fold
-        # n = 0
-        # for train, test in skf.split(X,y):                
-            # # BOW
-            # vocab = BagOfWords.make_vocab(X[train])           
-            # # fit the training data and then return the matrix
-            # training_data = BagOfWords.make_matrix(X[train], vocab)
-            # # transform testing data and return the matrix
-            # testing_data = BagOfWords.make_matrix(X[test], vocab)
-            
-            # # apply select percentile
-            # selector = SelectPercentile(percentile=25)           
-            # selector.fit(training_data, y[train])
-            
-            # training_data_r = selector.transform(training_data)           
-            # testing_data_r = selector.transform(testing_data)
-            
-            # #fit classifier
-            # classifier.fit(training_data_r, y[train])            
-            # #predict class                      
-            # predictions_train = classifier.predict(training_data_r)
-            # predictions_test = classifier.predict(testing_data_r)
-            
-            # #store metrics
-            # rec = recall_score(y[test], predictions_test)
-            # recall_scores.append(rec)  
-            # prec = precision_score(y[train], predictions_train)
-            # precision_scores.append(prec)
-            # # equation for f1 score
-            # f1_scores.append(2 * (prec * rec)/(prec + rec))
-            
-        # #print metrics of test set    
-        # print('prediction of testing set:')
-        # print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
-        # .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))       
-        # print()
-        # #print('overfit testing: prediction of training set')
-        # #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
-        # #format(min(f1_scores_train), max(f1_scores_train),
-        # sum(f1_scores_train)/float(len(f1_scores_train))))
-        # #print() 
-
-        # print('# ending naive bayes')
-        # print()
-        
-
-    # def make_naive_bayes_CV(dataset):
-        # '''alternative: uses CountVectorizer (faster)
-        # '''     
-        # # alternative: use only articles' header => may give better results
-        # X = dataset['Title'] + '.' + dataset['Text'] + '.'
-        # y = dataset['Label']
-        
-        # # use stratified k-fold cross-validation as split method
-        # skf = StratifiedKFold(n_splits = 10, shuffle=True)      
-        
-        # count_vector = CountVectorizer()
-      
-        # classifier = GaussianNB()    
-        
-        # # lists for metrics predicted on test/train set     
-        # f1_scores, f1_scores_train = [] 
-        
-        # # for each fold (10 times)
-        # # fold number
-        # n = 0
-        # for train, test in skf.split(X,y):   
-        
-            # # fit the training data and then return the matrix
-            # training_data = count_vector.fit_transform(X[train], y[train]).toarray()
-            # # transform testing data and return the matrix
-            # testing_data = count_vector.transform(X[test]).toarray()
-            
-            # # apply select percentile
-            # selector = SelectPercentile(percentile=25)          
-            # selector.fit(training_data, y[train])
-            
-            # training_data_r = selector.transform(training_data)           
-            # testing_data_r = selector.transform(testing_data)
-            
-            # #fit classifier
-            # classifier.fit(training_data_r, y[train]) 
-            
-            # #predict class                      
-            # predictions_train = classifier.predict(training_data_r)
-            # predictions_test = classifier.predict(testing_data_r)
-            
-            # #store metrics predicted on test set          
-            # f1_scores.append(f1_score(y[test], predictions_test))
-            
-            # #store metrics predicted on train set        
-            # f1_scores_train.append(f1_score(y[train], predictions_train))
-            
-        # #print metrics of test set
-        # print('--------------------')     
-        # print('prediction of testing set:')
-        # print('F1 score: min = {}, max = {}, average = {}'
-        # .format(min(f1_scores), max(f1_scores),
-        # sum(f1_scores)/float(len(f1_scores))))
-        
-        # print()
-        # print('prediction of training set:')
-        # print('F1 score: min = {}, max = {}, average = {}'
-        # .format(min(f1_scores_train), max(f1_scores_train),
-        #           sum(f1_scores_train)/float(len(f1_scores_train))))
-        # print()               
-        
-    # def analyze_errors_cv(dataset):
-        # '''calculates resubstitution error
-        # shows indices of false classified articles
-        # uses Gaussian Bayes with train test split
-        # '''
-    
-        # X_train_test = dataset['Text']
-        # y_train_test = dataset['Label']
-        
-        # count_vector = CountVectorizer()
-        
-        # # fit the training data and then return the matrix
-        # training_data = count_vector.fit_transform(X_train_test).toarray()
-
-        # # transform testing data and return the matrix
-        # testing_data = count_vector.transform(X_train_test).toarray()
-
-        # # Naive Bayes
-        # classifier = GaussianNB()
-        
-        # # fit classifier
-        # classifier.fit(training_data, y_train_test)
-        
-        # # Predict class
-        # predictions = classifier.predict(testing_data)
-        
-        # print()
-        # print('errors at index:')
-        # n = 0
-        # for i in range(len(y_train_test)):
-            # if y_train_test[i] != predictions[i]:
-                # n += 1
-                # print('error no.{}'.format(n))
-                # print('prediction at index {} is: {}, but actual is: {}'
-                # .format(i, predictions[i], y_train_test[i]))
-                # print(X_train_test[i])
-                # print(y_train_test[i])
-                # print()
-        
-        # print()
-        # #print metrics               
-        # print('F1 score: ', format(f1_score(y_train_test, predictions)))
+        n = 0
+        for i in range(len(y_train_test)):
+            if y_train_test[i] != predictions[i]:
+                n += 1
+                print('error no.{}'.format(n))
+                print('prediction at index {} is: {}, but actual is: {}'
+                .format(i, predictions[i], y_train_test[i]))
+                print(X_train_test[i])
+                print(y_train_test[i])
+                print()        
+        #print metrics               
+        print('F1 score: ', format(f1_score(y_train_test, predictions)))
--- a/SVM.py
+++ b/SVM.py
@ -0,0 +1,87 @@
+'''
+Support Vector Machines (SVM) Classifier
+========================================
+
+The SVM training algorithm builds a model from the training data that assigns 
+the test samples to one category ('merger' or 'not merger'), 
+making it a non-probabilistic binary linear classifier. 
+An SVM model is a representation of the samples as points in space, 
+mapped so that the examples of the separate categories are divided 
+by a clear gap that is as wide as possible. 
+New samples are then mapped into that same space and predicted 
+to belong to a category based on which side of the gap they fall. 
+'''
+
+from BagOfWords import BagOfWords
+
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectPercentile
+from sklearn.metrics import f1_score, make_scorer
+from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
+from sklearn.svm import SVC
+
+class SVM:
+    '''SVM article classifier, tuned by a grid search over a pipeline'''
+    def make_svm(dataset):
+        '''grid-searches a feature selection + SVC pipeline, prints F1 scores'''
+        print('# starting SVM')
+        print('#')
+        # dataset must provide the columns 'Title', 'Text' and 'Label'
+        # split data into text and label set
+        
+        # articles' text (title + text)
+        X = dataset['Title'] + ' ' + dataset['Text']
+        # articles' labels
+        y = dataset['Label']
+
+        # Bag of Words
+        print('# calculating bag of words')
+        print('#')
+        # vectorize all articles at once, i.e. before the CV split is made
+        #X = BagOfWords.fit_transform(X)
+        X = CountVectorizer().fit_transform(X).toarray()
+
+        # use stratified k-fold cross-validation as split method
+        skf = StratifiedKFold(n_splits = 10, shuffle=True)        
+        # NOTE(review): no random_state is set, so the folds presumably vary per run
+        # use only most important features
+        selector = SelectPercentile()  
+        # step names ('perc', 'SVC') are the prefixes of the grid keys below
+        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
+        # every parameter combination is scored with F1 on the skf folds
+        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],                                                          
+                            'SVC__kernel': ['linear','poly','rbf','sigmoid'],
+                            'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1], 
+                            'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]}, 
+                            cv=skf, 
+                            scoring=make_scorer(f1_score))
+            
+        print('# fit classifier')
+        print('#') 
+  
+        grid.fit(X,y)
+        
+        # cross-validation results of the grid search (grid.cv_results_)
+        df_results = grid.cv_results_
+    
+        # print results
+        ######################
+        print('RESULTS:')
+        print('')
+        print('mean_test_score:')
+        print(df_results['mean_test_score'])
+        print('')
+        print('mean of means:')
+        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
+        print('')
+        print('best score:') 
+        print(grid.best_score_)
+        print()
+        print('best parameters set found on development set:')
+        print(grid.best_params_)
+        print()
+        
+        print('# ending SVM')
+        print('#')
--- a/Starter.py
+++ b/Starter.py
@ -10,19 +10,23 @@ from CsvHandler import CsvHandler
 from DecisionTree import DecisionTree
 from NaiveBayes import NaiveBayes
 #from Requester import Requester
-#from SVM import SVM
+from SVM import SVM

 print('# starting program')
-print()
+print('#')

+# only if new unlabeled(!) data set is required:
 # Requester.save_articles_from_webhoseio()
+
 file = 'classification_labelled_corrected.csv'

 # read csv file
+print('# reading dataset')
+print('#')
 dataset = CsvHandler.read_csv(file)

 # DecisionTree.make_tree(dataset)
 NaiveBayes.make_naive_bayes(dataset)
-# SVM.make_svm(dataset)
+SVM.make_svm(dataset)

 print('# ending program')