added multinomial naive bayes

commit afe0e96efd
parent 5fb06ba811
1186	src/2018-12-01-al-interactive-labeling.ipynb	Normal file
File diff suppressed because one or more lines are too long

src/BagOfWords.py
@@ -122,9 +122,11 @@ class BagOfWords:
 				else:
 					# absolute word frequency
 					df_matrix.loc[i][v] += 1
-		# save df_matrix object
-		with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
-			pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)
+
+		# size too large :-(
+		# # save df_matrix object
+		# with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
+		# 	pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)
 
 		return df_matrix
 
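The hunk above disables pickling of df_matrix because the resulting file is too large (`# size too large :-(`). Not part of this commit, but a sparse on-disk format would likely avoid the problem, since a document-term matrix is mostly zeros. A minimal sketch, assuming SciPy is available; `save_matrix_sparse`, `load_matrix_sparse`, and the `.npz` path are hypothetical names:

```python
from scipy import sparse

def save_matrix_sparse(df_matrix, path='obj/document_term_matrix.npz'):
    # a document-term matrix is mostly zeros, so compressed sparse row
    # storage is usually far smaller than a pickled dense DataFrame
    sparse.save_npz(path, sparse.csr_matrix(df_matrix.values))

def load_matrix_sparse(path='obj/document_term_matrix.npz'):
    # returns a scipy.sparse.csr_matrix; the row index and vocabulary
    # column labels would have to be stored separately
    return sparse.load_npz(path)
```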
@@ -288,21 +290,4 @@ class BagOfWords:
 		#print(BagOfWords.count_features(corpus))
 		extracted_words = BagOfWords.extract_all_words(corpus, stemming)
 		vocab = BagOfWords.make_vocab(extracted_words, stemming)
 		print(len(vocab))
-
-		# for text in corpus:
-		# 	print(text)
-		# 	print()
-		# print()
-		# # ValueError from here on with nrows=10000...
-		# matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-		# dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
-		# print(dict)
-
-if __name__ == '__main__':
-	# for word in sorted(BagOfWords.set_stop_words(False)):
-	# 	print(word)
-	# 	print()
-	# 	print(PorterStemmer().stem(word))
-	# 	print()
-	BagOfWords.test()
91	src/MNBInteractive.py	Normal file
@@ -0,0 +1,91 @@
+'''
+Multinomial Naive Bayes Classifier for Interactive Labeling
+===========================================================
+
+multinomial implementation of naive bayes.
+prints out probabilities for classes needed for interactive labeling.
+'''
+from BagOfWords import BagOfWords
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.naive_bayes import MultinomialNB
+
+class MNBInteractive:
+
+	'''NOTE: The multinomial distribution normally requires integer feature counts.
+	However, in practice, fractional counts such as tf-idf may also work.
+	'''
+
+	def make_nb(labeled_data, unlabeled_data):
+		'''fits naive bayes model
+		'''
+		# choose BagOfWords implementation (own implementation if False)
+		sklearn_cv = False
+
+		print('# starting multinomial naive bayes...')
+		print()
+
+		# split labeled data into text and label set
+		# join title and text
+		X = labeled_data['Title'] + '. ' + labeled_data['Text']
+		y = labeled_data['Label']
+
+		# split unlabeled data into text and label set
+		# join title and text
+		U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']
+
+		if sklearn_cv:
+			cv = CountVectorizer()
+
+		# fit_prior=False: a uniform prior will be used instead
+		# of learning class prior probabilities
+		classifier = MultinomialNB(alpha=0.5,
+					fit_prior=False,
+					class_prior=None)
+
+		# metrics
+		recall_scores = []
+		precision_scores = []
+		f1_scores = []
+
+		# probabilities of each class (of each fold)
+		class_probs = []
+
+		# number of training samples observed in each class
+		class_counts = []
+
+		if sklearn_cv:
+			# use sklearn CountVectorizer
+			# fit the training data and then return the matrix
+			training_data = cv.fit_transform(X, y).toarray()
+			# transform testing data and return the matrix
+			testing_data = cv.transform(U).toarray()
+		else:
+			# use my own BagOfWords python implementation
+			stemming = True
+			rel_freq = True
+			extracted_words = BagOfWords.extract_all_words(X)
+			vocab = BagOfWords.make_vocab(extracted_words)
+
+			# fit the training data and then return the matrix
+			training_data = BagOfWords.make_matrix(extracted_words,
+					vocab, rel_freq, stemming)
+
+			# transform testing data and return the matrix
+			extracted_words = BagOfWords.extract_all_words(U)
+			testing_data = BagOfWords.make_matrix(extracted_words,
+					vocab, rel_freq, stemming)
+
+		# fit classifier
+		classifier.fit(training_data, y)
+
+		# probability estimates for the test vector (testing_data)
+		class_probs = classifier.predict_proba(testing_data)
+
+		# number of samples encountered for each class during fitting
+		# this value is weighted by the sample weight when provided
+		class_count = classifier.class_count_
+
+		# return classes and vector of class estimates
+		return class_count, class_probs
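For context on how the new module is meant to be used: make_nb takes a labeled and an unlabeled DataFrame, each with Title and Text columns (plus Label on the labeled one), and returns the per-class training counts together with predict_proba estimates for the unlabeled rows. A minimal usage sketch under those assumptions; the toy data and the uncertainty ranking at the end are illustrative, not part of the commit:

```python
import pandas as pd

from MNBInteractive import MNBInteractive

# toy stand-ins for the real labeled/unlabeled article data
labeled = pd.DataFrame({
    'Title': ['Merger announced', 'Quarterly figures'],
    'Text':  ['Company A acquires company B.', 'Revenue grew slightly.'],
    'Label': [1, 0]})
unlabeled = pd.DataFrame({
    'Title': ['Takeover rumour'],
    'Text':  ['Analysts expect a bid next week.']})

class_count, class_probs = MNBInteractive.make_nb(labeled, unlabeled)

# for interactive labeling, the least confident predictions are the
# most informative articles to hand to the annotator next
uncertainty = 1 - class_probs.max(axis=1)
print(unlabeled.assign(uncertainty=uncertainty)
               .sort_values('uncertainty', ascending=False))
```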
src/NaiveBayesInteractive.py
@@ -1,202 +0,0 @@
-'''
-Naive Bayes Classifier
-======================
-
-basic implementation of naive bayes.
-prints out probabilities for classes needed for interactive labeling.
-'''
-from BagOfWords import BagOfWords
-
-import csv
-
-import pandas as pd
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.feature_selection import SelectPercentile
-from sklearn.metrics import recall_score, precision_score
-from sklearn.model_selection import StratifiedKFold
-from sklearn.naive_bayes import GaussianNB
-
-class NaiveBayesInteractive:
-
-	def make_naive_bayes(dataset, sklearn_cv=False, percentile=100):
-		'''fits naive bayes model
-		'''
-		print('# fitting model')
-		print('# ...')
-
-		# split data into text and label set
-		# join title and text
-		X = dataset['Title'] + '. ' + dataset['Text']
-		y = dataset['Label']
-
-		if sklearn_cv:
-			cv = CountVectorizer()
-
-		# stratified k-fold cross-validation as split method
-		kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)
-
-		classifier = GaussianNB()
-
-		# metrics
-		recall_scores = []
-		precision_scores = []
-		f1_scores = []
-
-		# probabilities of each class (of each fold)
-		class_prob = []
-		# counts number of training samples observed in each class
-		class_counts = []
-
-		# for each fold
-		n = 0
-		for train, test in kf.split(X,y):
-
-			n += 1
-			print('# split no. ' + str(n))
-
-			if sklearn_cv:
-				# use sklearn CountVectorizer
-				# fit the training data and then return the matrix
-				training_data = cv.fit_transform(X[train], y[train]).toarray()
-				# transform testing data and return the matrix
-				testing_data = cv.transform(X[test]).toarray()
-			else:
-				# use my own BagOfWords python implementation
-				stemming = True
-				rel_freq = True
-				extracted_words = BagOfWords.extract_all_words(X[train])
-				vocab = BagOfWords.make_vocab(extracted_words)
-
-				# fit the training data and then return the matrix
-				training_data = BagOfWords.make_matrix(extracted_words,
-						vocab, rel_freq, stemming)
-				# transform testing data and return the matrix
-				extracted_words = BagOfWords.extract_all_words(X[test])
-				testing_data = BagOfWords.make_matrix(extracted_words,
-						vocab, rel_freq, stemming)
-
-			# apply select percentile
-			selector = SelectPercentile(percentile=percentile)
-			selector.fit(training_data, y[train])
-
-			# new reduced data sets
-			training_data_r = selector.transform(training_data)
-			testing_data_r = selector.transform(testing_data)
-
-			#fit classifier
-			classifier.fit(training_data_r, y[train])
-			#predict class
-			predictions_train = classifier.predict(training_data_r)
-			predictions_test = classifier.predict(testing_data_r)
-
-			#print and store metrics
-			rec = recall_score(y[test], predictions_test)
-			print('rec: ' + str(rec))
-			recall_scores.append(rec)
-			prec = precision_score(y[test], predictions_test)
-			print('prec: ' + str(prec))
-			print('#')
-			precision_scores.append(prec)
-			# equation for f1 score
-			f1_scores.append(2 * (prec * rec)/(prec + rec))
-
-			class_prob.append(classifier.class_prior_)
-			class_counts.append(classifier.class_count_)
-
-		##########################
-		#print metrics of test set
-		print('-------------------------')
-		print('prediction of testing set:')
-		print('Precision score: min = {}, max = {}, average = {}'
-				.format(min(precision_scores),
-					max(precision_scores),
-					sum(precision_scores)/float(len(precision_scores))))
-		print('Recall score: min = {}, max = {}, average = {}'
-				.format(min(recall_scores),
-					max(recall_scores),
-					sum(recall_scores)/float(len(recall_scores))))
-		print('F1 score: min = {}, max = {}, average = {}'
-				.format(min(f1_scores),
-					max(f1_scores),
-					sum(f1_scores)/float(len(f1_scores))))
-		print()
-		# print probability of each class
-		print('probability of each class:')
-		print()
-		print(class_prob)
-		print()
-		print('number of samples of each class:')
-		print()
-		print(class_counts)
-		print()
-
-		##### only for overfit testing ###########
-		#print('overfit testing: prediction of training set')
-		#print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
-		#format(min(f1_scores_train), max(f1_scores_train),
-		#sum(f1_scores_train)/float(len(f1_scores_train))))
-		#print()
-
-	######## only needed for resubstitution error ########
-	def analyze_errors(dataset):
-		'''calculates resubstitution error
-		shows indices of false classified articles
-		uses Gaussian Bayes with train test split
-		'''
-		X_train_test = dataset['Title'] + ' ' + dataset['Text']
-		y_train_test = dataset['Label']
-
-		count_vector = CountVectorizer()
-		# fit the training data and then return the matrix
-		training_data = count_vector.fit_transform(X_train_test).toarray()
-		# transform testing data and return the matrix
-		testing_data = count_vector.transform(X_train_test).toarray()
-
-		# Naive Bayes
-		classifier = GaussianNB()
-		# fit classifier
-		classifier.fit(training_data, y_train_test)
-
-		# Predict class
-		predictions = classifier.predict(testing_data)
-		print('Errors at index:')
-		print()
-		n = 0
-		for i in range(len(y_train_test)):
-			if y_train_test[i] != predictions[i]:
-				n += 1
-				print('error no.{}'.format(n))
-				print('prediction at index {} is: {}, but actual is: {}'
-						.format(i, predictions[i], y_train_test[i]))
-				print(X_train_test[i])
-				print(y_train_test[i])
-				print()
-		#print metrics
-		print('F1 score: ', format(f1_score(y_train_test, predictions)))
-
-	if __name__ == '__main__':
-
-		print('# starting naive bayes')
-		print('# ...')
-
-		file = '..\data\\classification_labelled_corrected.csv'
-
-		# read csv file
-		print('# reading dataset')
-		print('# ...')
-
-		data = pd.read_csv(file,
-				sep='|',
-				engine='python',
-				decimal='.',
-				quotechar='\'',
-				quoting=csv.QUOTE_NONE)
-
-		# training options
-		use_count_vectorizer = False
-		select_percentile = 100
-
-		make_naive_bayes(data, use_count_vectorizer, select_percentile)
-
-		print('#')
-		print('# ending naive bayes')
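Two remarks on the deleted evaluation code: analyze_errors called f1_score without importing it (only recall_score and precision_score were imported), so that branch would have raised a NameError; and the per-fold F1 was derived by hand as the harmonic mean 2 * (prec * rec) / (prec + rec), which is exactly what sklearn.metrics.f1_score computes. A small sketch with made-up binary labels:

```python
from sklearn.metrics import f1_score, precision_score, recall_score

y_true = [1, 0, 1, 1, 0]
y_pred = [1, 0, 0, 1, 1]

prec = precision_score(y_true, y_pred)  # 2/3
rec = recall_score(y_true, y_pred)      # 2/3

# harmonic mean, as in the deleted per-fold computation
manual_f1 = 2 * (prec * rec) / (prec + rec)

assert abs(manual_f1 - f1_score(y_true, y_pred)) < 1e-12
```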