added multinomial naive bayes
parent 5fb06ba811
commit afe0e96efd
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -122,9 +122,11 @@ class BagOfWords:
            else:
                # absolute word frequency
                df_matrix.loc[i][v] += 1

        # save df_matrix object
        with open('obj/' + 'document_term_matrix' + '.pkl', 'wb') as f:
            pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)

        # size too large :-(
        # # save df_matrix object
        # with open('obj/' + 'document_term_matrix' + '.pkl', 'wb') as f:
        #     pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)

        return df_matrix
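For reference, a matrix pickled this way can be restored with the matching load call. A minimal sketch, not part of this commit, assuming the obj/ directory and the pickle file written above exist:

import pickle

# reload the document-term matrix saved by the snippet above
with open('obj/' + 'document_term_matrix' + '.pkl', 'rb') as f:
    df_matrix = pickle.load(f)

# df_matrix is a pandas DataFrame, so this prints (n_documents, n_vocab)
print(df_matrix.shape)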
@@ -289,20 +291,3 @@ class BagOfWords:
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        print(len(vocab))

        # for text in corpus:
        #     print(text)
        #     print()
        #     print()
        # # from here on: ValueError with nrows=10000...
        # matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
        # dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
        # print(dict)

if __name__ == '__main__':
    # for word in sorted(BagOfWords.set_stop_words(False)):
    #     print(word)
    #     print()
    #     print(PorterStemmer().stem(word))
    #     print()
    BagOfWords.test()
@@ -0,0 +1,91 @@
'''
Multinomial Naive Bayes Classifier for Interactive Labeling
===========================================================

multinomial implementation of naive bayes.
prints out probabilities for classes needed for interactive labeling.
'''
from BagOfWords import BagOfWords

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

class MNBInteractive:

    '''NOTE: The multinomial distribution normally requires integer feature counts.
    However, in practice, fractional counts such as tf-idf may also work.
    '''

    def make_nb(labeled_data, unlabeled_data):
        '''fits naive bayes model
        '''
        # choose BagOfWords implementation (own implementation if False)
        sklearn_cv = False

        print('# starting multinomial naive bayes...')
        print()

        # split labeled data into text and label set
        # join title and text
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        # unlabeled data has no labels
        # join title and text
        U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']

        if sklearn_cv:
            cv = CountVectorizer()

        # fit_prior=False: a uniform prior will be used instead
        # of learning class prior probabilities
        classifier = MultinomialNB(alpha=0.5,
                                   fit_prior=False,
                                   class_prior=None)

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        class_probs = []

        # number of training samples observed in each class
        class_counts = []

        if sklearn_cv:
            # use sklearn CountVectorizer
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X, y).toarray()
            # transform testing data and return the matrix
            testing_data = cv.transform(U).toarray()
        else:
            # use my own BagOfWords python implementation
            stemming = True
            rel_freq = True
            extracted_words = BagOfWords.extract_all_words(X)
            vocab = BagOfWords.make_vocab(extracted_words)

            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(extracted_words,
                                                   vocab, rel_freq, stemming)

            # transform testing data and return the matrix
            extracted_words = BagOfWords.extract_all_words(U)
            testing_data = BagOfWords.make_matrix(extracted_words,
                                                  vocab, rel_freq, stemming)

        # fit classifier
        classifier.fit(training_data, y)

        # probability estimates for the test vector (testing_data)
        class_probs = classifier.predict_proba(testing_data)

        # number of samples encountered for each class during fitting
        # this value is weighted by the sample weight when provided
        class_count = classifier.class_count_

        # return class counts and probability estimates
        return class_count, class_probs
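A minimal usage sketch for the new class, not part of this commit: it assumes two pandas DataFrames with the 'Title', 'Text' and (for the labeled set) 'Label' columns used above; the toy rows are made up, and with sklearn_cv = False the call goes through the project's own BagOfWords matrix, whose relative frequencies are exactly the fractional counts the NOTE above refers to:

import pandas as pd
from MNBInteractive import MNBInteractive

# hypothetical toy input; the real data is a labeled/unlabeled split of news articles
labeled = pd.DataFrame({'Title': ['Merger announced', 'Quarterly loss'],
                        'Text': ['Company A acquires company B.', 'Revenue fell.'],
                        'Label': [1, 0]})
unlabeled = pd.DataFrame({'Title': ['Takeover rumour'],
                          'Text': ['Company C may buy company D.']})

class_count, class_probs = MNBInteractive.make_nb(labeled, unlabeled)

# one row of class probabilities per unlabeled article
print(class_count)
print(class_probs)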
@@ -1,202 +0,0 @@
'''
Naive Bayes Classifier
======================

basic implementation of naive bayes.
prints out probabilities for classes needed for interactive labeling.
'''
from BagOfWords import BagOfWords

import csv

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB

class NaiveBayesInteractive:

    def make_naive_bayes(dataset, sklearn_cv=False, percentile=100):
        '''fits naive bayes model
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            cv = CountVectorizer()

        # stratified k-fold cross-validation as split method
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        classifier = GaussianNB()

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        class_prob = []
        # counts number of training samples observed in each class
        class_counts = []

        # for each fold
        n = 0
        for train, test in kf.split(X, y):

            n += 1
            print('# split no. ' + str(n))

            if sklearn_cv:
                # use sklearn CountVectorizer
                # fit the training data and then return the matrix
                training_data = cv.fit_transform(X[train], y[train]).toarray()
                # transform testing data and return the matrix
                testing_data = cv.transform(X[test]).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X[train])
                vocab = BagOfWords.make_vocab(extracted_words)

                # fit the training data and then return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq, stemming)
                # transform testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq, stemming)

            # apply select percentile
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])
            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # print and store metrics
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec) / (prec + rec))

            class_prob.append(classifier.class_prior_)
            class_counts.append(classifier.class_count_)

        ##########################
        # print metrics of test set
        print('-------------------------')
        print('prediction of testing set:')
        print('Precision score: min = {}, max = {}, average = {}'
              .format(min(precision_scores),
                      max(precision_scores),
                      sum(precision_scores)/float(len(precision_scores))))
        print('Recall score: min = {}, max = {}, average = {}'
              .format(min(recall_scores),
                      max(recall_scores),
                      sum(recall_scores)/float(len(recall_scores))))
        print('F1 score: min = {}, max = {}, average = {}'
              .format(min(f1_scores),
                      max(f1_scores),
                      sum(f1_scores)/float(len(f1_scores))))
        print()
        # print probability of each class
        print('probability of each class:')
        print()
        print(class_prob)
        print()
        print('number of samples of each class:')
        print()
        print(class_counts)
        print()

        ##### only for overfit testing ###########
        #print('overfit testing: prediction of training set')
        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
        #format(min(f1_scores_train), max(f1_scores_train),
        #sum(f1_scores_train)/float(len(f1_scores_train))))
        #print()
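As a quick sanity check of the inline F1 formula above, not part of this commit, here is a toy binary example with sklearn's f1_score as reference:

from sklearn.metrics import f1_score

y_true = [1, 1, 0, 0, 1]
y_pred = [1, 0, 0, 1, 1]

# TP=2, FP=1, FN=1 -> precision = recall = 2/3
prec, rec = 2/3, 2/3
print(2 * (prec * rec) / (prec + rec))  # 0.666...
print(f1_score(y_true, y_pred))         # 0.666...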

    ######## only needed for resubstitution error ########
    def analyze_errors(dataset):
        '''calculates resubstitution error
        shows indices of falsely classified articles
        uses Gaussian Bayes with train test split
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()
        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()
        # fit classifier
        classifier.fit(training_data, y_train_test)

        # predict class
        predictions = classifier.predict(testing_data)
        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_train_test)):
            if y_train_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_train_test[i]))
                print(X_train_test[i])
                print(y_train_test[i])
                print()
        # print metrics
        print('F1 score: ', format(f1_score(y_train_test, predictions)))

if __name__ == '__main__':

    print('# starting naive bayes')
    print('# ...')

    file = '..\\data\\classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')

    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)

    # training options
    use_count_vectorizer = False
    select_percentile = 100

    NaiveBayesInteractive.make_naive_bayes(data, use_count_vectorizer,
                                           select_percentile)

    print('#')
    print('# ending naive bayes')
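The class probabilities that the new make_nb returns are the hook for the interactive labeling loop. A hedged sketch of one possible use, not part of this commit: ranking unlabeled articles by uncertainty so the least confident predictions get labeled first (the selection strategy is an assumption):

import numpy as np

# class_probs as returned by predict_proba: shape (n_unlabeled, n_classes)
class_probs = np.array([[0.55, 0.45],
                        [0.98, 0.02],
                        [0.51, 0.49]])

# uncertainty = 1 - probability of the most likely class
uncertainty = 1.0 - class_probs.max(axis=1)

# article indices, most uncertain first
order = np.argsort(uncertainty)[::-1]
print(order)  # [2 0 1]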