added multinomial naive bayes

This commit is contained in:
Anne Lorenz 2018-12-10 13:57:39 +01:00
parent 5fb06ba811
commit afe0e96efd
6 changed files with 11283 additions and 11154 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -122,9 +122,11 @@ class BagOfWords:
             else:
                 # absolute word frequency
                 df_matrix.loc[i][v] += 1
-        # save df_matrix object
-        with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
-            pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)
+        # size too large :-(
+        # # save df_matrix object
+        # with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
+        #     pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)
         return df_matrix
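The pickling of the document-term matrix is disabled here because the dumped object grew too large. A possible workaround, not part of this commit and with purely illustrative helper name and paths, would be to store the matrix in sparse form rather than pickling the dense DataFrame:

# Hypothetical alternative to pickling the dense DataFrame (not in this commit):
# store the document-term matrix as a compressed sparse matrix plus its vocabulary.
import pickle
import scipy.sparse

def save_dtm_sparse(df_matrix, path_prefix='obj/document_term_matrix'):
    # convert the dense pandas DataFrame to CSR; zero entries are not stored
    sparse = scipy.sparse.csr_matrix(df_matrix.values)
    scipy.sparse.save_npz(path_prefix + '.npz', sparse)
    # keep the column vocabulary so the matrix can be rebuilt later
    with open(path_prefix + '_vocab.pkl', 'wb') as f:
        pickle.dump(list(df_matrix.columns), f, pickle.HIGHEST_PROTOCOL)

Since most entries of a document-term matrix are zero, the CSR representation typically shrinks the stored file by orders of magnitude.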
@@ -288,21 +290,4 @@ class BagOfWords:
         #print(BagOfWords.count_features(corpus))
         extracted_words = BagOfWords.extract_all_words(corpus, stemming)
         vocab = BagOfWords.make_vocab(extracted_words, stemming)
         print(len(vocab))
-        # for text in corpus:
-        #     print(text)
-        #     print()
-        # print()
-        # # from here on: ValueError at nrows=10000...
-        # matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-        # dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
-        # print(dict)

 if __name__ == '__main__':
-    # for word in sorted(BagOfWords.set_stop_words(False)):
-    #     print(word)
-    #     print()
-    #     print(PorterStemmer().stem(word))
-    #     print()
     BagOfWords.test()

src/MNBInteractive.py (new file, 91 lines)

@@ -0,0 +1,91 @@
'''
Multinomial Naive Bayes Classifier for Interactive Labeling
===========================================================

Multinomial implementation of naive bayes.
Prints out the class probabilities needed for interactive labeling.
'''
from BagOfWords import BagOfWords

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

class MNBInteractive:

    '''NOTE: The multinomial distribution normally requires integer feature
    counts. However, in practice, fractional counts such as tf-idf may also
    work.
    '''

    def make_nb(labeled_data, unlabeled_data):
        '''fits the naive bayes model
        '''
        # choose the BagOfWords implementation (own implementation if False)
        sklearn_cv = False

        print('# starting multinomial naive bayes...')
        print()

        # split labeled data into text and label set,
        # joining title and text
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        # join title and text of the unlabeled data
        # (unlabeled data has no label set)
        U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']

        if sklearn_cv:
            cv = CountVectorizer()

        # fit_prior=False: a uniform prior will be used instead
        # of learning class prior probabilities
        classifier = MultinomialNB(alpha=0.5,
                                   fit_prior=False,
                                   class_prior=None)

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        class_probs = []
        # number of training samples observed in each class
        class_counts = []

        if sklearn_cv:
            # use sklearn CountVectorizer
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X, y).toarray()
            # transform testing data and return the matrix
            testing_data = cv.transform(U).toarray()
        else:
            # use my own BagOfWords python implementation
            stemming = True
            rel_freq = True
            extracted_words = BagOfWords.extract_all_words(X)
            vocab = BagOfWords.make_vocab(extracted_words)

            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(extracted_words,
                                                   vocab, rel_freq, stemming)
            # transform testing data and return the matrix
            extracted_words = BagOfWords.extract_all_words(U)
            testing_data = BagOfWords.make_matrix(extracted_words,
                                                  vocab, rel_freq, stemming)

        # fit classifier
        classifier.fit(training_data, y)

        # probability estimates for the test vector (testing_data)
        class_probs = classifier.predict_proba(testing_data)

        # number of samples encountered for each class during fitting;
        # this value is weighted by the sample weight when provided
        class_count = classifier.class_count_

        # return class counts and the vector of class probability estimates
        return class_count, class_probs
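A minimal usage sketch for the new module, assuming the repository's BagOfWords module is importable; the toy articles and DataFrames below are made up for illustration and are not part of the commit:

# Hypothetical driver for MNBInteractive.make_nb (illustrative data only).
import pandas as pd
from MNBInteractive import MNBInteractive

# tiny DataFrames with the columns make_nb expects
labeled = pd.DataFrame({'Title': ['Merger ahead', 'Weather report'],
                        'Text': ['Company A acquires company B.',
                                 'Sunny skies expected.'],
                        'Label': [1, 0]})
unlabeled = pd.DataFrame({'Title': ['Takeover rumours'],
                          'Text': ['Analysts expect a bid for company C.']})

class_count, class_probs = MNBInteractive.make_nb(labeled, unlabeled)
# class_probs[i] holds the estimated P(class | article i); articles with
# probabilities near 0.5 are the most informative candidates for manual labeling
print(class_count)
print(class_probs)

Note that with rel_freq=True the BagOfWords path feeds relative frequencies, i.e. fractional counts, to MultinomialNB; as the class docstring says, the multinomial model formally expects integer counts but fractional counts often work in practice.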


@@ -1,202 +0,0 @@
'''
Naive Bayes Classifier
======================

Basic implementation of naive bayes.
Prints out the class probabilities needed for interactive labeling.
'''
from BagOfWords import BagOfWords

import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB

class NaiveBayesInteractive:

    def make_naive_bayes(dataset, sklearn_cv=False, percentile=100):
        '''fits the naive bayes model
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set,
        # joining title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            cv = CountVectorizer()

        # stratified k-fold cross-validation as split method
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        classifier = GaussianNB()

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        class_prob = []
        # counts the number of training samples observed in each class
        class_counts = []

        # for each fold
        n = 0
        for train, test in kf.split(X, y):
            n += 1
            print('# split no. ' + str(n))

            if sklearn_cv:
                # use sklearn CountVectorizer
                # fit the training data and then return the matrix
                training_data = cv.fit_transform(X[train], y[train]).toarray()
                # transform testing data and return the matrix
                testing_data = cv.transform(X[test]).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X[train])
                vocab = BagOfWords.make_vocab(extracted_words)

                # fit the training data and then return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq, stemming)
                # transform testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq, stemming)

            # apply select percentile
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])
            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # print and store metrics
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec) / (prec + rec))

            class_prob.append(classifier.class_prior_)
            class_counts.append(classifier.class_count_)

        ##########################
        # print metrics of test set
        print('-------------------------')
        print('prediction of testing set:')
        print('Precision score: min = {}, max = {}, average = {}'
              .format(min(precision_scores),
                      max(precision_scores),
                      sum(precision_scores)/float(len(precision_scores))))
        print('Recall score: min = {}, max = {}, average = {}'
              .format(min(recall_scores),
                      max(recall_scores),
                      sum(recall_scores)/float(len(recall_scores))))
        print('F1 score: min = {}, max = {}, average = {}'
              .format(min(f1_scores),
                      max(f1_scores),
                      sum(f1_scores)/float(len(f1_scores))))
        print()

        # print probability of each class
        print('probability of each class:')
        print()
        print(class_prob)
        print()
        print('number of samples of each class:')
        print()
        print(class_counts)
        print()

        ##### only for overfit testing ###########
        #print('overfit testing: prediction of training set')
        #print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
        #      format(min(f1_scores_train), max(f1_scores_train),
        #             sum(f1_scores_train)/float(len(f1_scores_train))))
        #print()

    ######## only needed for the resubstitution error ########
    def analyze_errors(dataset):
        '''calculates the resubstitution error,
        shows the indices of misclassified articles,
        uses Gaussian naive bayes with a train-test split
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()
        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()
        # fit classifier
        classifier.fit(training_data, y_train_test)

        # predict class
        predictions = classifier.predict(testing_data)
        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_train_test)):
            if y_train_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_train_test[i]))
                print(X_train_test[i])
                print(y_train_test[i])
                print()
        # print metrics
        print('F1 score: ', format(f1_score(y_train_test, predictions)))

if __name__ == '__main__':
    print('# starting naive bayes')
    print('# ...')

    file = '..\\data\\classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')
    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)

    # training options
    use_count_vectorizer = False
    select_percentile = 100
    NaiveBayesInteractive.make_naive_bayes(data, use_count_vectorizer,
                                           select_percentile)

    print('#')
    print('# ending naive bayes')
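The deleted GaussianNB version applied SelectPercentile feature selection before fitting, a step the new multinomial module drops. For reference, a self-contained sketch of that selection step on synthetic data; the shapes and values below are illustrative only:

# Minimal sketch of the SelectPercentile step the deleted file used
# (synthetic data; illustrative only).
import numpy as np
from sklearn.feature_selection import SelectPercentile, f_classif

# 20 documents with 100 term features and random binary labels
X = np.random.rand(20, 100)
y = np.random.randint(0, 2, size=20)

# keep only the top 25% of features ranked by the ANOVA F-score
selector = SelectPercentile(score_func=f_classif, percentile=25)
X_reduced = selector.fit_transform(X, y)
print(X_reduced.shape)  # (20, 25)

With percentile=100, the default the deleted file used, the selector passes all features through unchanged.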