'''
Naive Bayes Classifier
======================

Naive Bayes is a probabilistic classifier that is able to predict a
probability distribution over a set of classes, rather than only
outputting the most likely class that the observation should belong to.

'Naive' means that it assumes that the value of a particular feature
(word in an article) is independent of the value of any other feature,
given the label. It considers each of these features to contribute
independently to the probability that it belongs to its category,
regardless of any possible correlations between these features.
'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB

from BagOfWords import BagOfWords
from CsvHandler import CsvHandler


def _print_score_summary(name, scores):
    '''Print min, max and average of one list of per-fold scores.'''
    print('{} score: min = {}, max = {}, average = {}'
          .format(name,
                  min(scores),
                  max(scores),
                  sum(scores) / float(len(scores))))


class NaiveBayes:
    '''Gaussian Naive Bayes experiments on a labelled article dataset.'''

    @staticmethod
    def make_naive_bayes(dataset):
        '''Cross-validate a Gaussian Naive Bayes model and print metrics.

        The 'Title' and 'Text' columns of ``dataset`` are joined into one
        text per article, which is turned into a feature matrix with the
        project's own BagOfWords helper. The model is evaluated with
        shuffled, stratified 10-fold cross-validation.

        For every fold, recall and precision of the *test* split are
        printed; afterwards min / max / average of precision, recall and
        F1 over all folds are printed. Nothing is returned.

        dataset -- object supporting ``dataset['Title']``,
                   ``dataset['Text']`` and ``dataset['Label']``
                   (presumably a pandas DataFrame as returned by
                   CsvHandler.read_csv -- TODO confirm).
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set; join title and text
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        classifier = GaussianNB()

        # per-fold metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        for n, (train, test) in enumerate(skf.split(X, y), start=1):
            print('# split no. ' + str(n))

            # NOTE: an alternative pipeline (CountVectorizer followed by
            # SelectPercentile(percentile=25)) was tried here before;
            # the custom bag-of-words below gave worse results than it.
            vocab = BagOfWords.make_vocab(X[train])

            # build the feature matrices for both splits from the
            # vocabulary obtained from the training texts
            training_data = BagOfWords.make_matrix(X[train], vocab)
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            # fit classifier on the training split, predict the test split
            classifier.fit(training_data, y[train])
            predictions_test = classifier.predict(testing_data)

            # print and store metrics -- both are measured on the test
            # split so that the F1 score below combines comparable values
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)

            prec = precision_score(y[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)

            # harmonic mean of precision and recall; defined as 0 when
            # both are 0 to avoid a division by zero
            if prec + rec > 0:
                f1_scores.append(2 * (prec * rec) / (prec + rec))
            else:
                f1_scores.append(0.0)

        # print metrics of test set
        print('-------------------------')
        print('prediction of testing set:')
        _print_score_summary('Precision', precision_scores)
        _print_score_summary('Recall', recall_scores)
        _print_score_summary('F1', f1_scores)
        print()

    # only needed for the resubstitution error
    @staticmethod
    def analyze_errors(dataset):
        '''Print the resubstitution errors of a Gaussian Naive Bayes model.

        The model is fitted on the complete dataset (CountVectorizer
        features of title + text) and then asked to predict that very
        same data. Every misclassified article is printed with its
        position, predicted label, actual label and text, followed by
        the F1 score over the whole dataset. Nothing is returned.

        dataset -- object supporting ``dataset['Title']``,
                   ``dataset['Text']`` and ``dataset['Label']``.
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()
        # fit on the data and return the matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()
        # the "test" matrix is deliberately built from the same data
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()
        classifier.fit(training_data, y_train_test)

        predictions = classifier.predict(testing_data)

        print('Errors at index:')
        print()
        n = 0
        # iterate positionally: predictions is a plain array aligned with
        # the order of the rows, not with their index labels
        rows = zip(X_train_test, y_train_test, predictions)
        for i, (text, actual, predicted) in enumerate(rows):
            if actual != predicted:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predicted, actual))
                print(text)
                print(actual)
                print()

        # print metrics
        print('F1 score: ', format(f1_score(y_train_test, predictions)))


# NOTE(review): the experiment below runs whenever this module is executed
# or imported, exactly as before; consider an
# ``if __name__ == '__main__':`` guard if import-time execution is not wanted.
print('# starting naive bayes')
print('# ...')

file = 'classification_labelled_corrected.csv'

# read csv file
print('# reading dataset')
print('# ...')
dataset = CsvHandler.read_csv(file)

NaiveBayes.make_naive_bayes(dataset)

print('#')
print('# ending naive bayes')