'''
Naive Bayes Classifier
======================

Basic implementation of Naive Bayes.
Prints out the class probabilities needed for interactive labeling.
'''
from CsvHandler import CsvHandler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB


class NaiveBayes_simple:

    @staticmethod
    def make_naive_bayes(dataset):
        '''fits a Gaussian Naive Bayes model with 10-fold cross-validation,
        using a bag-of-words representation (CountVectorizer)
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set,
        # joining title and text into one string per article
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        cv = CountVectorizer()

        # k-fold cross-validation as split method
        kf = KFold(n_splits=10, shuffle=True, random_state=5)

        classifier = GaussianNB()

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        class_prob = []
        # number of training samples observed in each class
        class_counts = []

        # for each fold
        n = 0
        for train, test in kf.split(X, y):
            n += 1
            print('# split no. ' + str(n))

            # using CountVectorizer:
            # fit on the training data and return the document-term matrix
            training_data = cv.fit_transform(X[train]).toarray()
            # transform the testing data and return the matrix
            testing_data = cv.transform(X[test]).toarray()

            # fit classifier
            classifier.fit(training_data, y[train])
            # predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            # print and store metrics of the test set
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec) / (prec + rec))

            class_prob.append(classifier.class_prior_)
            class_counts.append(classifier.class_count_)

        ##########################
        # print metrics of test set
        print('-------------------------')
        print('prediction of testing set:')
        print('Precision score: min = {}, max = {}, average = {}'
              .format(min(precision_scores),
                      max(precision_scores),
                      sum(precision_scores) / float(len(precision_scores))))
        print('Recall score: min = {}, max = {}, average = {}'
              .format(min(recall_scores),
                      max(recall_scores),
                      sum(recall_scores) / float(len(recall_scores))))
        print('F1 score: min = {}, max = {}, average = {}'
              .format(min(f1_scores),
                      max(f1_scores),
                      sum(f1_scores) / float(len(f1_scores))))
        print()

        # print probability of each class
        print('probability of each class:')
        print()
        print(class_prob)
        print()
        print('number of samples of each class:')
        print()
        print(class_counts)
        print()

        ##### only needed for overfitting check ###########
        # print('overfit testing: prediction of training set')
        # print('F1 score: min = {:.2f}, max = {:.2f}, average = {:.2f}'
        #       .format(min(f1_scores_train),
        #               max(f1_scores_train),
        #               sum(f1_scores_train) / float(len(f1_scores_train))))
        # print()

    ######## only needed for the resubstitution error ########
    @staticmethod
    def analyze_errors(dataset):
        '''calculates the resubstitution error,
        shows the indices of misclassified articles,
        uses Gaussian Naive Bayes trained and evaluated on the same data
        (no train/test split)
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()
        # fit on the data and return the document-term matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()
        # transform the same data and return the matrix
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()
        # fit classifier
        classifier.fit(training_data, y_train_test)

        # predict class
        predictions = classifier.predict(testing_data)

        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_train_test)):
            if y_train_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_train_test[i]))
                print(X_train_test[i])
                print(y_train_test[i])
                print()

        # print metrics
        print('F1 score: {}'.format(f1_score(y_train_test, predictions)))


if __name__ == '__main__':
    print('# starting naive bayes')
    print('# ...')

    file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')
    dataset = CsvHandler.read_csv(file)

    NaiveBayes_simple.make_naive_bayes(dataset)

    print('#')
    print('# ending naive bayes')
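

# ---------------------------------------------------------------------------
# Optional sketch (illustration only, not part of the original workflow):
# the same bag-of-words + GaussianNB evaluation can be written more compactly
# with a scikit-learn Pipeline and cross_validate. It assumes the same
# 'Title', 'Text' and 'Label' columns as make_naive_bayes(); the function
# name cross_validate_sketch is hypothetical and is not called anywhere above.
# ---------------------------------------------------------------------------
def cross_validate_sketch(dataset):
    '''compact 10-fold cross-validation sketch using a Pipeline'''
    from sklearn.model_selection import cross_validate
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import FunctionTransformer

    X = dataset['Title'] + ' ' + dataset['Text']
    y = dataset['Label']

    # GaussianNB requires dense input, so the sparse BOW matrix is densified
    pipeline = make_pipeline(
        CountVectorizer(),
        FunctionTransformer(lambda m: m.toarray(), accept_sparse=True),
        GaussianNB())

    scores = cross_validate(
        pipeline, X, y,
        cv=KFold(n_splits=10, shuffle=True, random_state=5),
        scoring=('precision', 'recall', 'f1'))

    for metric in ('test_precision', 'test_recall', 'test_f1'):
        print('{}: average = {}'.format(metric, scores[metric].mean()))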