thesis-anne/src/MNBInteractive.py
'''
Multinomial Naive Bayes Classifier for Interactive Labeling
===========================================================

Multinomial implementation of the naive Bayes classifier.
Prints the per-class probabilities needed for interactive labeling.
'''
from BagOfWords import BagOfWords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB

class MNBInteractive:

    '''NOTE: The multinomial distribution normally requires integer feature
    counts. However, in practice, fractional counts such as tf-idf may
    also work.
    '''
    @staticmethod
    def estimate_mnb(labeled_data, unlabeled_data, sklearn_cv=False):
        '''fits a multinomial naive Bayes model on the labeled data and
        returns class probability estimates for the unlabeled data
        '''
        print('# MNB: starting interactive multinomial naive Bayes...')
        print()

        # split labeled data into text and label set,
        # joining title and text
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        # unlabeled data: join title and text
        # (the labels of the unlabeled articles are not needed for fitting)
        U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']
        if sklearn_cv:
            cv = CountVectorizer()

        # fit_prior=False: a uniform prior will be used instead
        # of learning class prior probabilities
        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)
        if sklearn_cv:
            # use sklearn CountVectorizer
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X, y).toarray()
            # transform testing data and return the matrix
            testing_data = cv.transform(U).toarray()
        else:
            # use my own BagOfWords python implementation
            stemming = True
            rel_freq = False
            extracted_words = BagOfWords.extract_all_words(X)
            vocab = BagOfWords.make_vocab(extracted_words)

            # fit the training data and then return the matrix
            print('# MNB: fit training data and calculate matrix...')
            print()
            training_data = BagOfWords.make_matrix(extracted_words,
                                                   vocab, rel_freq, stemming)

            # transform testing data and return the matrix
            print('# MNB: transform testing data to matrix...')
            print()
            extracted_words = BagOfWords.extract_all_words(U)
            testing_data = BagOfWords.make_matrix(extracted_words,
                                                  vocab, rel_freq, stemming)
        # fit classifier
        classifier.fit(training_data, y)

        # probability estimates for the test vector (testing_data)
        class_probs = classifier.predict_proba(testing_data)

        # number of samples encountered for each class during fitting;
        # this value is weighted by the sample weight when provided
        class_count = classifier.class_count_

        # classes in the order used by the classifier
        classes = classifier.classes_

        print('# MNB: ending multinomial naive Bayes')

        # return the classes, their training counts and the matrix of
        # class probability estimates
        return classes, class_count, class_probs
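
    # Example usage (a sketch; 'labeled_df' and 'unlabeled_df' are assumed
    # DataFrames with 'Title', 'Text' and 'Label' columns, as expected by
    # the methods in this class):
    #
    #   classes, counts, probs = MNBInteractive.estimate_mnb(labeled_df,
    #                                                        unlabeled_df)
    #   # probs[i][j]: estimated probability that unlabeled article i
    #   # belongs to classes[j]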
    @staticmethod
    def measure_mnb(X, y, sklearn_cv=False, percentile=100):
        '''fits a multinomial naive Bayes model and measures precision,
        recall and F1 score using stratified k-fold cross-validation
        '''
        print('# fitting model')
        print('# ...')

        if sklearn_cv:
            cv = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=5)

        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)
        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        class_prob = []
        # counts of training samples observed in each class
        class_counts = []

        # for each fold
        n = 0
        for train, test in skf.split(X, y):
            n += 1
            print('# split no. ' + str(n))
            if sklearn_cv:
                # use sklearn CountVectorizer
                # fit the training data and then return the matrix
                training_data = cv.fit_transform(X[train], y[train]).toarray()
                # transform testing data and return the matrix
                testing_data = cv.transform(X[test]).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X[train])
                vocab = BagOfWords.make_vocab(extracted_words)

                # fit the training data and then return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq,
                                                       stemming)

                # transform testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq,
                                                      stemming)
            # apply SelectPercentile feature selection
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])

            # new, reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])

            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # print and store metrics
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)

            # F1 score: harmonic mean of precision and recall
            # (guard against division by zero when both are zero)
            if prec + rec > 0:
                f1_scores.append(2 * (prec * rec) / (prec + rec))
            else:
                f1_scores.append(0.0)

            #class_prob.append(classifier.class_prior_)
            #class_counts.append(classifier.class_count_)
        ##########################
        # print metrics of test set
        print('-------------------------')
        print('prediction of testing set:')
        print('Precision score: min = {}, max = {}, average = {}'
              .format(min(precision_scores),
                      max(precision_scores),
                      sum(precision_scores) / float(len(precision_scores))))
        print('Recall score: min = {}, max = {}, average = {}'
              .format(min(recall_scores),
                      max(recall_scores),
                      sum(recall_scores) / float(len(recall_scores))))
        print('F1 score: min = {}, max = {}, average = {}'
              .format(min(f1_scores),
                      max(f1_scores),
                      sum(f1_scores) / float(len(f1_scores))))
        # print()
        # # print probability of each class
        # print('probability of each class:')
        # print()
        # #print(class_prob)
        # print()
        # print('number of samples of each class:')
        # print()
        # #print(class_counts)
        # print()
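
# Minimal self-test sketch (an addition, not part of the original pipeline):
# builds toy DataFrames with the expected 'Title'/'Text'/'Label' columns and
# runs the sklearn CountVectorizer path, so it works without the BagOfWords
# module or any external data.
if __name__ == '__main__':
    labeled_df = pd.DataFrame(
        {'Title': ['Merger announced', 'Quarterly results',
                   'CEO resigns', 'Acquisition talks'],
         'Text': ['Company A acquires company B.',
                  'Revenue grew in the last quarter.',
                  'The chief executive stepped down.',
                  'Company C is in talks to buy company D.'],
         'Label': [1, 0, 0, 1]})
    unlabeled_df = pd.DataFrame(
        {'Title': ['Takeover rumors'],
         'Text': ['Analysts expect company E to be acquired soon.'],
         'Label': [None]})

    classes, counts, probs = MNBInteractive.estimate_mnb(
        labeled_df, unlabeled_df, sklearn_cv=True)

    # one probability row per unlabeled article, in the order of 'classes'
    for label, prob in zip(classes, probs[0]):
        print('P(class {}) = {}'.format(label, prob))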