'''
Multinomial Naive Bayes Classifier
==================================
'''

from BagOfWords import BagOfWords

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB

class MultinomialNaiveBayes:

    @staticmethod
    def make_mnb(dataset, sklearn_cv=True, percentile=100):
        '''fits the naive bayes model with stratified k-fold
        cross-validation and returns the metrics of each fold
        '''
        print('# starting classical multinomial naive bayes')
        print('# ...')

        # split the data into text and label set;
        # join title and text into one feature
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            cv = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)

        # metrics, collected over all folds
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        #class_prob = []
        # counts the number of training samples observed in each class
        #class_counts = []

        # for each fold
        n = 0
        for train, test in skf.split(X, y):

            n += 1
            print('# split no. ' + str(n))

            if sklearn_cv:
                # use the sklearn CountVectorizer:
                # fit the training data and return the document-term matrix
                training_data = cv.fit_transform(X.iloc[train]).toarray()
                # transform the testing data and return the matrix
                testing_data = cv.transform(X.iloc[test]).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X.iloc[train])
                vocab = BagOfWords.make_vocab(extracted_words)

                # fit the training data and return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq,
                                                       stemming)
                # transform the testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X.iloc[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq,
                                                      stemming)

            # apply SelectPercentile feature selection
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y.iloc[train])

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y.iloc[train])
            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # print and store metrics
            rec = recall_score(y.iloc[test], predictions_test,
                               average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y.iloc[test], predictions_test,
                                   average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # f1 score as the harmonic mean of precision and recall
            f1_scores.append(2 * (prec * rec) / (prec + rec))

            #class_prob.append(classifier.class_prior_)
            #class_counts.append(classifier.class_count_)

        ##########################
        # diagnostics taken from the last fold:

        # probability estimates for the test vectors (testing_data_r)
        class_probs = classifier.predict_proba(testing_data_r)

        # number of samples encountered for each class during fitting;
        # this value is weighted by the sample weight when provided
        class_count = classifier.class_count_

        # classes in the order used by the classifier
        classes = classifier.classes_

        # return the metrics collected over all folds
        return recall_scores, precision_scores, f1_scores

    ######## only needed for the resubstitution error ########
    @staticmethod
    def analyze_errors(training, testing):
        '''calculates the resubstitution error and
        shows the indices of misclassified articles;
        uses Multinomial Naive Bayes with a train/test split
        '''
        X_train = training['Title'] + ' ' + training['Text']
        y_train = training['Label']

        X_test = testing['Title'] + ' ' + testing['Text']
        y_test = testing['Label']

        count_vector = CountVectorizer()

        # fit the training data and return the document-term matrix
        training_data = count_vector.fit_transform(X_train).toarray()

        # transform the testing data and return the matrix
        testing_data = count_vector.transform(X_test).toarray()

        # Naive Bayes
        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)
        # fit classifier
        classifier.fit(training_data, y_train)

        # predict class
        predictions = classifier.predict(testing_data)

        print(type(y_test))
        print(len(y_test))
        print(type(predictions))
        print(len(predictions))

        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_test)):
            if y_test.iloc[i] != predictions[i]:
                n += 1
                print('error no. {}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_test.iloc[i]))
                print(X_test.iloc[i])
                print(y_test.iloc[i])
                print()

        # print metrics
        print('F1 score: {}'.format(f1_score(y_test, predictions,
                                             average='weighted')))
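

if __name__ == '__main__':
    # minimal usage sketch, not a definitive setup: assumes the articles
    # are stored in a CSV file with 'Title', 'Text' and 'Label' columns;
    # the file name 'articles.csv' is a placeholder
    dataset = pd.read_csv('articles.csv')

    recall, precision, f1 = MultinomialNaiveBayes.make_mnb(dataset,
                                                           sklearn_cv=True,
                                                           percentile=100)
    print('mean recall:    ' + str(sum(recall) / len(recall)))
    print('mean precision: ' + str(sum(precision) / len(precision)))
    print('mean f1 score:  ' + str(sum(f1) / len(f1)))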