'''
Naive Bayes Classifier
======================

Naive Bayes is a probabilistic classifier that predicts a probability
distribution over a set of classes, rather than only outputting the
most likely class the observation belongs to.

'Naive' means that it assumes the value of a particular feature (a word
in an article) is independent of the value of any other feature, given
the label. Each feature is taken to contribute independently to the
probability that the observation belongs to its category, regardless of
any possible correlations between the features.
'''
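
# A minimal sketch of the factorization behind the classifier
# (illustration only, not executed anywhere below): for an article with
# words w_1, ..., w_n,
#
#   P(label | w_1, ..., w_n)  is proportional to
#   P(label) * P(w_1 | label) * ... * P(w_n | label)
#
# i.e. every word contributes an independent factor to the class score,
# and the class with the highest product is the most likely one.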
# The multinomial Naive Bayes classifier is suitable for classification
# with discrete features (e.g., word counts for text classification).
# The multinomial distribution normally requires integer feature counts.
# However, in practice, fractional counts such as tf-idf may also work.
# => only relevant for our own BagOfWords implementation
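
# A hedged sketch of the tf-idf variant mentioned above (commented out
# on purpose, since this module sticks to raw counts; corpus and labels
# are placeholder names):
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   X_tfidf = TfidfVectorizer().fit_transform(corpus)
#   MultinomialNB().fit(X_tfidf, labels)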
from BagOfWords import BagOfWords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
# MultinomialNB used instead of GaussianNB => OK?
# GaussianNB is still needed by analyze_errors() below
from sklearn.naive_bayes import GaussianNB


class NaiveBayes:

    @staticmethod
    def make_naive_bayes(dataset):
        '''fits naive bayes model
        '''
        print('# starting naive bayes')
        print('#')

        # split data into text and label set
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        # Bag of Words
        print('# calculating bag of words')
        print('#')
        # fit the training data and then return the matrix
        # TODO: why such different (worse) scores with my own BagOfWords?
        #X = BagOfWords.fit_transform(X, False)
        X = CountVectorizer().fit_transform(X).toarray()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # use only the most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])

        grid = GridSearchCV(pipeline,
                            {'perc__percentile': [25, 50, 75, 100],
                             'NB__alpha': [1e-8, 1e-7, 1e-6, 1e-5,
                                           1e-4, 1e-3, 1e-2, 1e-1]},
                            cv=skf,
                            scoring=make_scorer(f1_score))
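
        # GridSearchCV tries every combination of feature percentile and
        # smoothing parameter alpha, scores each candidate with
        # cross-validated F1, and refits the best combination on all data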
        print('# fit classifier')
        print('#')
        grid.fit(X, y)

        # dict of cross-validation results
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('#')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('#')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])
              / len(df_results['mean_test_score']))
        print('#')
        print('best score:')
        print(grid.best_score_)
        print('#')
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print('#')

        print('# ending naive bayes')
        print('#')

    @staticmethod
    def analyze_errors(dataset):
        '''calculates the resubstitution error:
        shows indices of misclassified articles;
        uses Gaussian Naive Bayes, trained and evaluated
        on the same (full) data set
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()

        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()

        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()

        # fit classifier
        classifier.fit(training_data, y_train_test)

        # predict class
        predictions = classifier.predict(testing_data)

        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_train_test)):
            if y_train_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_train_test[i]))
                print(X_train_test[i])
                print(y_train_test[i])
                print()

        # print metrics
        print('F1 score: {}'.format(f1_score(y_train_test, predictions)))
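

if __name__ == '__main__':
    # minimal usage sketch, assuming the labeled articles live in a CSV
    # file with 'Title', 'Text' and 'Label' columns; 'articles.csv' is a
    # placeholder file name, not part of this repository
    import pandas as pd

    df = pd.read_csv('articles.csv')
    NaiveBayes.make_naive_bayes(df)
    NaiveBayes.analyze_errors(df)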