# thesis-anne/NaiveBayes.py
'''
Naive Bayes Classifier
======================

Naive Bayes is a probabilistic classifier that is able to predict a
probability distribution over a set of classes, rather than only
outputting the most likely class that the observation should belong to.
'Naive' means that it assumes that the value of a particular feature
(word in an article) is independent of the value of any other feature,
given the label. It considers each of these features to contribute
independently to the probability that it belongs to its category,
regardless of any possible correlations between these features.
'''
2018-09-05 12:08:13 +00:00
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
2018-09-05 12:08:13 +00:00
2018-09-10 08:38:24 +00:00
class NaiveBayes:
    '''Gaussian Naive Bayes classification of labelled news articles.

    The functions are namespaced on the class and called without an
    instance (no ``self`` parameter), matching the existing call sites.
    '''

    def make_naive_bayes(dataset):
        '''Fit a Gaussian Naive Bayes model with 10-fold StratifiedKFold.

        Uses the project's own BagOfWords vectorizer and prints
        per-fold and aggregated precision / recall / F1 scores.

        :param dataset: data frame with 'Title', 'Text' and 'Label'
                        columns (presumably pandas — confirm in caller)
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set:
        # join title and text into one feature string per article
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)
        classifier = GaussianNB()

        # collected metrics, one entry per fold
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # for each fold
        n = 0
        for train, test in skf.split(X, y):
            n += 1
            print('# split no. ' + str(n))

            # build the vocabulary from the training fold only, then
            # vectorize both folds with it (avoids test-set leakage)
            # NOTE: own BOW gave worse results than CountVectorizer
            vocab = BagOfWords.make_vocab(X[train])
            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(X[train], vocab)
            # transform testing data and return the matrix
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            # fit classifier
            classifier.fit(training_data, y[train])
            # predict class of the held-out fold
            predictions_test = classifier.predict(testing_data)

            # print and store metrics
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            # BUGFIX: precision was previously computed on the training
            # fold; evaluate it on the held-out fold, like recall
            prec = precision_score(y[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # F1 = harmonic mean of precision and recall;
            # guard against division by zero when both are 0
            if prec + rec > 0:
                f1_scores.append(2 * (prec * rec) / (prec + rec))
            else:
                f1_scores.append(0.0)

        ##########################
        # print metrics of test set
        print('-------------------------')
        print('prediction of testing set:')
        print('Precision score: min = {}, max = {}, average = {}'
              .format(min(precision_scores),
                      max(precision_scores),
                      sum(precision_scores)/float(len(precision_scores))))
        print('Recall score: min = {}, max = {}, average = {}'
              .format(min(recall_scores),
                      max(recall_scores),
                      sum(recall_scores)/float(len(recall_scores))))
        print('F1 score: min = {}, max = {}, average = {}'
              .format(min(f1_scores),
                      max(f1_scores),
                      sum(f1_scores)/float(len(f1_scores))))
        print()

    ######## only needed for resubstitution error ########
    def analyze_errors(dataset):
        '''Calculate the resubstitution error.

        Trains Gaussian Naive Bayes on the full data set and predicts
        the same data, printing index, prediction, actual label and
        text of every misclassified article, plus the F1 score.

        :param dataset: data frame with 'Title', 'Text' and 'Label'
                        columns
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()
        # fit on the full data set and return the document-term matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()
        # resubstitution: the "test" set is the training set itself
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()
        # fit classifier
        classifier.fit(training_data, y_train_test)
        # predict class
        predictions = classifier.predict(testing_data)

        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_train_test)):
            if y_train_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_train_test[i]))
                print(X_train_test[i])
                print(y_train_test[i])
                print()
        # print metrics (f1_score comes from sklearn.metrics)
        print('F1 score: ', format(f1_score(y_train_test, predictions)))
2018-09-17 19:16:19 +00:00
#################################
# module-level script: load the labelled data set and run the classifier
print('# starting naive bayes')
print('# ...')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('# ...')
dataset = CsvHandler.read_csv(file)
# BUGFIX: make_naive_bayes is defined on the NaiveBayes class, so a
# bare call at module level would raise NameError — qualify it
NaiveBayes.make_naive_bayes(dataset)
print('#')
print('# ending naive bayes')