256 lines
10 KiB
Python
256 lines
10 KiB
Python
'''
Naive Bayes Classifier
======================

Naive Bayes is a probabilistic classifier that is able to predict a
probability distribution over a set of classes, rather than only
outputting the most likely class that the observation should belong to.
'Naive' means that it assumes that the value of a particular feature
(word in an article) is independent of the value of any other feature,
given the label. It considers each of these features to contribute
independently to the probability that the observation belongs to its
category, regardless of any possible correlations between these features.
'''
|
|
from BagOfWords import BagOfWords
|
|
from CsvHandler import CsvHandler
|
|
|
|
#from sklearn.feature_extraction.text import CountVectorizer
|
|
#from sklearn.feature_selection import SelectPercentile
|
|
from sklearn.metrics import recall_score, precision_score
|
|
from sklearn.model_selection import StratifiedKFold
|
|
#from sklearn.model_selection import train_test_split
|
|
from sklearn.naive_bayes import GaussianNB
|
|
|
|
class NaiveBayes:
    '''Gaussian Naive Bayes evaluation on a bag-of-words text representation.'''

    @staticmethod
    def make_naive_bayes(dataset):
        '''Fit and evaluate a GaussianNB model with stratified 10-fold CV.

        For every fold the vocabulary is built from the training articles
        only (via BagOfWords.make_vocab) and both the training and the
        testing articles are turned into matrices with that vocabulary
        (via BagOfWords.make_matrix). The classifier is fitted on the
        training matrix and scored on the held-out testing matrix.

        Parameters:
            dataset: mapping-like object that supports dataset['Title'],
                dataset['Text'] and dataset['Label']; title and text are
                concatenated with a single space to form the input text.
                NOTE(review): X[train] / y[train] index with the integer
                arrays produced by StratifiedKFold.split, which presumably
                expects a pandas object with a default 0..n-1 index --
                confirm against the caller.

        Returns:
            None. The minimum, maximum and average test-set F1 score over
            all folds are printed to stdout.
        '''
        print('# starting naive bayes')
        print()

        # join title and text
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        classifier = GaussianNB()

        # one test-set F1 score per fold
        f1_scores = []

        for train, test in skf.split(X, y):
            # vocabulary comes from the training fold only, so nothing
            # from the test fold leaks into the features
            vocab = BagOfWords.make_vocab(X[train])
            training_data = BagOfWords.make_matrix(X[train], vocab)
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            classifier.fit(training_data, y[train])
            predictions_test = classifier.predict(testing_data)

            # Both metrics must be computed on the same (test) predictions;
            # mixing training-set precision with test-set recall does not
            # yield a meaningful F1 score.
            rec = recall_score(y[test], predictions_test)
            prec = precision_score(y[test], predictions_test)

            # F1 is the harmonic mean of precision and recall; define it
            # as 0 when both are 0 instead of dividing by zero.
            denominator = prec + rec
            if denominator == 0:
                f1_scores.append(0.0)
            else:
                f1_scores.append(2 * (prec * rec) / denominator)

        # print metrics of test set
        print('prediction of testing set:')
        # distinct positional indices: {0} = min, {1} = max, {2} = average
        print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'
              .format(min(f1_scores), max(f1_scores),
                      sum(f1_scores) / float(len(f1_scores))))
        print()

        print('# ending naive bayes')
        print()
|
|
|
|
# def make_naive_bayes_selectpercentile(dataset):
|
|
# '''fits naive bayes model with StratifiedKFold, uses my BOW
|
|
# feature selection: select 0.25-percentile
|
|
# '''
|
|
|
|
# print('# starting naive bayes')
|
|
# print()
|
|
|
|
# # alternative: use only articles' header => may give better results
|
|
# X = dataset['Title'] + ' ' + dataset['Text']
|
|
# y = dataset['Label']
|
|
|
|
# # use stratified k-fold cross-validation as split method
|
|
# skf = StratifiedKFold(n_splits = 10, shuffle=True)
|
|
|
|
# classifier = GaussianNB()
|
|
|
|
# # lists for metrics
|
|
# recall_scores = []
|
|
# precision_scores = []
|
|
# f1_scores = []
|
|
|
|
# # for each fold
|
|
# n = 0
|
|
# for train, test in skf.split(X,y):
|
|
# # BOW
|
|
# vocab = BagOfWords.make_vocab(X[train])
|
|
# # fit the training data and then return the matrix
|
|
# training_data = BagOfWords.make_matrix(X[train], vocab)
|
|
# # transform testing data and return the matrix
|
|
# testing_data = BagOfWords.make_matrix(X[test], vocab)
|
|
|
|
# # apply select percentile
|
|
# selector = SelectPercentile(percentile=25)
|
|
# selector.fit(training_data, y[train])
|
|
|
|
# training_data_r = selector.transform(training_data)
|
|
# testing_data_r = selector.transform(testing_data)
|
|
|
|
# #fit classifier
|
|
# classifier.fit(training_data_r, y[train])
|
|
# #predict class
|
|
# predictions_train = classifier.predict(training_data_r)
|
|
# predictions_test = classifier.predict(testing_data_r)
|
|
|
|
# #store metrics
|
|
# rec = recall_score(y[test], predictions_test)
|
|
# recall_scores.append(rec)
|
|
# prec = precision_score(y[train], predictions_train)
|
|
# precision_scores.append(prec)
|
|
# # equation for f1 score
|
|
# f1_scores.append(2 * (prec * rec)/(prec + rec))
|
|
|
|
# #print metrics of test set
|
|
# print('prediction of testing set:')
|
|
# print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
|
|
# .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
|
|
# print()
|
|
# #print('overfit testing: prediction of training set')
|
|
# #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
|
|
# #format(min(f1_scores_train), max(f1_scores_train),
|
|
# sum(f1_scores_train)/float(len(f1_scores_train))))
|
|
# #print()
|
|
|
|
# print('# ending naive bayes')
|
|
# print()
|
|
|
|
|
|
# def make_naive_bayes_CV(dataset):
|
|
# '''alternative: uses CountVectorizer (faster)
|
|
# '''
|
|
# # alternative: use only articles' header => may give better results
|
|
# X = dataset['Title'] + '.' + dataset['Text'] + '.'
|
|
# y = dataset['Label']
|
|
|
|
# # use stratified k-fold cross-validation as split method
|
|
# skf = StratifiedKFold(n_splits = 10, shuffle=True)
|
|
|
|
# count_vector = CountVectorizer()
|
|
|
|
# classifier = GaussianNB()
|
|
|
|
# # lists for metrics predicted on test/train set
|
|
# f1_scores, f1_scores_train = []
|
|
|
|
# # for each fold (10 times)
|
|
# # fold number
|
|
# n = 0
|
|
# for train, test in skf.split(X,y):
|
|
|
|
# # fit the training data and then return the matrix
|
|
# training_data = count_vector.fit_transform(X[train], y[train]).toarray()
|
|
# # transform testing data and return the matrix
|
|
# testing_data = count_vector.transform(X[test]).toarray()
|
|
|
|
# # apply select percentile
|
|
# selector = SelectPercentile(percentile=25)
|
|
# selector.fit(training_data, y[train])
|
|
|
|
# training_data_r = selector.transform(training_data)
|
|
# testing_data_r = selector.transform(testing_data)
|
|
|
|
# #fit classifier
|
|
# classifier.fit(training_data_r, y[train])
|
|
|
|
# #predict class
|
|
# predictions_train = classifier.predict(training_data_r)
|
|
# predictions_test = classifier.predict(testing_data_r)
|
|
|
|
# #store metrics predicted on test set
|
|
# f1_scores.append(f1_score(y[test], predictions_test))
|
|
|
|
# #store metrics predicted on train set
|
|
# f1_scores_train.append(f1_score(y[train], predictions_train))
|
|
|
|
# #print metrics of test set
|
|
# print('--------------------')
|
|
# print('prediction of testing set:')
|
|
# print('F1 score: min = {}, max = {}, average = {}'
|
|
# .format(min(f1_scores), max(f1_scores),
|
|
# sum(f1_scores)/float(len(f1_scores))))
|
|
|
|
# print()
|
|
# print('prediction of training set:')
|
|
# print('F1 score: min = {}, max = {}, average = {}'
|
|
# .format(min(f1_scores_train), max(f1_scores_train),
|
|
# sum(f1_scores_train)/float(len(f1_scores_train))))
|
|
# print()
|
|
|
|
# def analyze_errors_cv(dataset):
|
|
# '''calculates resubstitution error
|
|
# shows indices of false classified articles
|
|
# uses Gaussian Bayes with train test split
|
|
# '''
|
|
|
|
# X_train_test = dataset['Text']
|
|
# y_train_test = dataset['Label']
|
|
|
|
# count_vector = CountVectorizer()
|
|
|
|
# # fit the training data and then return the matrix
|
|
# training_data = count_vector.fit_transform(X_train_test).toarray()
|
|
|
|
# # transform testing data and return the matrix
|
|
# testing_data = count_vector.transform(X_train_test).toarray()
|
|
|
|
# # Naive Bayes
|
|
# classifier = GaussianNB()
|
|
|
|
# # fit classifier
|
|
# classifier.fit(training_data, y_train_test)
|
|
|
|
# # Predict class
|
|
# predictions = classifier.predict(testing_data)
|
|
|
|
# print()
|
|
# print('errors at index:')
|
|
# n = 0
|
|
# for i in range(len(y_train_test)):
|
|
# if y_train_test[i] != predictions[i]:
|
|
# n += 1
|
|
# print('error no.{}'.format(n))
|
|
# print('prediction at index {} is: {}, but actual is: {}'
|
|
# .format(i, predictions[i], y_train_test[i]))
|
|
# print(X_train_test[i])
|
|
# print(y_train_test[i])
|
|
# print()
|
|
|
|
# print()
|
|
# #print metrics
|
|
# print('F1 score: ', format(f1_score(y_train_test, predictions))) |