'''
Multinomial Naive Bayes Classifier
==================================
'''
from BagOfWords import BagOfWords

import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB

class MultinomialNaiveBayes:

    @staticmethod
    def make_mnb(dataset, sklearn_cv=True, percentile=100):
        '''fits a multinomial naive bayes model using StratifiedKFold
        cross-validation
        '''
        print('# starting multinomial naive bayes')
        print('# ...')

        # split data into text and label set:
        # join title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            cv = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        #class_prob = []
        # counts number of training samples observed in each class
        #class_counts = []

        # for each fold
        n = 0
        for train, test in skf.split(X, y):
            n += 1
            print('# split no. ' + str(n))

            if sklearn_cv:
                # use sklearn CountVectorizer
                # fit the training data and then return the matrix
                training_data = cv.fit_transform(X[train]).toarray()
                # transform testing data and return the matrix
                testing_data = cv.transform(X[test]).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X[train])
                vocab = BagOfWords.make_vocab(extracted_words)

                # fit the training data and then return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq,
                                                       stemming)
                # transform testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq,
                                                      stemming)

            # apply select percentile
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])
            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # print('train:')
            # print(y[train])
            # print('test:')
            # print(y[test])
            # print()
            # print('pred')
            # print(predictions_test)

            # print and store metrics
            rec = recall_score(y[test], predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test,
                                   average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # f1 score as harmonic mean of precision and recall
            f1_scores.append(2 * (prec * rec) / (prec + rec))

            #class_prob.append(classifier.class_prior_)
            #class_counts.append(classifier.class_count_)

        # probability estimates for the test vectors of the last fold
        # (use the reduced feature set the classifier was fitted on)
        class_probs = classifier.predict_proba(testing_data_r)

        # number of samples encountered for each class during fitting;
        # this value is weighted by the sample weight when provided
        class_count = classifier.class_count_

        # classes in the order used by the classifier
        classes = classifier.classes_

        print('average: recall, precision, f1 score')
        print(sum(recall_scores)/10,
              sum(precision_scores)/10,
              sum(f1_scores)/10)

        # return per-fold metrics and the probability estimates
        # of the last fold
        return recall_scores, precision_scores, f1_scores, class_probs
    ######## only needed for the resubstitution error ########
    @staticmethod
    def analyze_errors(training, testing):
        '''calculates the resubstitution error,
        shows the indices of misclassified articles,
        uses multinomial naive bayes with a train/test split
        '''
        X_train = training['Title'] + ' ' + training['Text']
        y_train = training['Label']
        X_test = testing['Title'] + ' ' + testing['Text']
        y_test = testing['Label']

        count_vector = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train).toarray()
        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_test).toarray()

        # naive bayes
        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)
        # fit classifier
        classifier.fit(training_data, y_train)

        # predict class
        predictions = classifier.predict(testing_data)
        print(type(y_test))
        print(len(y_test))
        print(type(predictions))
        print(len(predictions))
        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_test)):
            if y_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_test[i]))
                print(X_test[i])
                print(y_test[i])
                print()

        # print metrics (weighted average, consistent with make_mnb)
        print('F1 score: ' + str(f1_score(y_test, predictions,
                                          average='weighted')))

if __name__ == '__main__':

    # read csv file
    print('# reading dataset')
    print('# ...')

    # read current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1, 13),    # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # select only labeled articles
    MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1]
                                     .reset_index(drop=True),
                                   sklearn_cv=True,
                                   percentile=100)
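# A minimal usage sketch for analyze_errors, which is defined above but never
# called in __main__: it assumes the same labeled DataFrame used by make_mnb
# and a simple positional 80/20 split; the split ratio and variable names are
# illustrative only, not part of the original workflow.
#
#   labeled = df.loc[df['Label'] != -1].reset_index(drop=True)
#   cut = int(len(labeled) * 0.8)
#   training = labeled.iloc[:cut].reset_index(drop=True)
#   testing = labeled.iloc[cut:].reset_index(drop=True)
#   MultinomialNaiveBayes.analyze_errors(training, testing)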