thesis-anne/NaiveBayes.py

195 lines
7.0 KiB
Python
Raw Normal View History

2018-09-05 12:08:13 +00:00
'''
Naive Bayes Classifier
======================

Naive Bayes is a probabilistic classifier that is able to predict a
probability distribution over a set of classes, rather than only
outputting the most likely class that the observation should belong to.
'Naive' means that it assumes that the value of a particular feature
(word in an article) is independent of the value of any other feature,
given the label. It considers each of these features to contribute
independently to the probability that it belongs to its category,
regardless of any possible correlations between these features.
'''
2018-09-05 12:08:13 +00:00
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB

from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
2018-09-05 12:08:13 +00:00
2018-09-10 08:38:24 +00:00
class NaiveBayes:
    '''Gaussian Naive Bayes experiments on a labelled article dataset.

    The class only serves as a namespace: all methods are static and
    are called as NaiveBayes.<method>(dataset).
    '''

    @staticmethod
    def _f1_score(precision, recall):
        '''returns the F1 score (harmonic mean) of precision and recall,
        or 0.0 if both are zero, where the formula would divide by zero
        '''
        denominator = precision + recall
        if denominator == 0:
            return 0.0
        return 2 * (precision * recall) / denominator

    @staticmethod
    def _format_summary(name, scores):
        '''returns one line with min, max and average of the given scores'''
        return ('{} score: min = {}, max = {}, average = {}'
                .format(name,
                        min(scores),
                        max(scores),
                        sum(scores)/float(len(scores))))

    @staticmethod
    def make_naive_bayes(dataset):
        '''fits a Gaussian naive bayes model with StratifiedKFold
        (10 shuffled splits) on a CountVectorizer bag of words and
        prints recall, precision and F1 score of each fold's test set,
        followed by min / max / average over all folds

        dataset: table with the columns 'Title', 'Text' and 'Label'
        returns: None, all results are printed

        NOTE(review): recall_score / precision_score are called with their
        default arguments, which presumably requires binary labels - confirm.
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        cv = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)
        classifier = GaussianNB()

        # metrics (one entry per fold)
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        class_prob = []
        # counts number of training samples observed in each class
        class_counts = []

        # for each fold
        # NOTE(review): X[train] / y[test] index with the positions returned
        # by skf.split; this assumes positions and index labels coincide
        # (e.g. a default integer index) - confirm against CsvHandler.read_csv
        for n, (train, test) in enumerate(skf.split(X, y), start=1):
            print('# split no. ' + str(n))

            # CountVectorizer is used on purpose: the own BagOfWords
            # implementation gave worse results in earlier experiments.
            # fit on the training data only, so that no vocabulary of the
            # test fold leaks into the model
            training_data = cv.fit_transform(X[train]).toarray()
            # transform testing data with the vocabulary of the training data
            testing_data = cv.transform(X[test]).toarray()

            # fit classifier
            classifier.fit(training_data, y[train])
            # predict class of the held-out fold
            predictions_test = classifier.predict(testing_data)

            # print and store metrics
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            # precision has to be measured on the same (test) predictions
            # as recall, otherwise the F1 score mixes training and test set
            prec = precision_score(y[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            f1_scores.append(NaiveBayes._f1_score(prec, rec))

            class_prob.append(classifier.class_prior_)
            class_counts.append(classifier.class_count_)

        ##########################
        # print metrics of test set
        print('-------------------------')
        print('prediction of testing set:')
        print(NaiveBayes._format_summary('Precision', precision_scores))
        print(NaiveBayes._format_summary('Recall', recall_scores))
        print(NaiveBayes._format_summary('F1', f1_scores))
        print()
        # print probability of each class
        print('probability of each class:')
        print()
        print(class_prob)
        print()
        print('number of samples of each class:')
        print()
        print(class_counts)
        print()

    ######## only needed for the resubstitution error ########
    @staticmethod
    def analyze_errors(dataset):
        '''calculates the resubstitution error:
        fits Gaussian naive bayes on the complete dataset, predicts the
        very same data again, prints every wrongly classified article
        with its position and finally the F1 score

        dataset: table with the columns 'Title', 'Text' and 'Label'
        returns: None, all results are printed
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()
        # training and testing data are deliberately identical here,
        # so a single document-term matrix is sufficient
        data = count_vector.fit_transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()
        # fit classifier
        classifier.fit(data, y_train_test)

        # predict class of the data the model was fitted on
        predictions = classifier.predict(data)

        print('Errors at index:')
        print()
        n = 0
        # iterate in lockstep, so that texts, labels and predictions are
        # matched by position
        samples = zip(X_train_test, y_train_test, predictions)
        for i, (text, actual, predicted) in enumerate(samples):
            if actual != predicted:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predicted, actual))
                print(text)
                print(actual)
                print()

        # print metrics
        print('F1 score: ', f1_score(y_train_test, predictions))
2018-09-20 08:37:18 +00:00
if __name__ == '__main__':
    # script entry point: reads the labelled csv file and runs the
    # cross-validated naive bayes evaluation on it

    print('# starting naive bayes')
    print('# ...')
    csv_file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')

    dataset = CsvHandler.read_csv(csv_file)

    # make_naive_bayes is defined inside the NaiveBayes class, so it has
    # to be reached through the class; the bare name is not defined at
    # module level
    NaiveBayes.make_naive_bayes(dataset)

    print('#')
    print('# ending naive bayes')