'''
Naive Bayes Classifier
======================

Naive Bayes is a probabilistic classifier that predicts a probability
distribution over a set of classes, rather than only outputting the
most likely class the observation belongs to.

'Naive' means that it assumes the value of a particular feature (a word
in an article) is independent of the value of any other feature, given
the label. Each feature is taken to contribute independently to the
probability that the observation belongs to its category, regardless of
any possible correlations between the features.
'''
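
# A minimal sketch of the factorization behind the classifier
# (illustration only, not executed anywhere below): for an article with
# words w_1, ..., w_n,
#
#   P(label | w_1, ..., w_n)  is proportional to
#   P(label) * P(w_1 | label) * ... * P(w_n | label)
#
# i.e. every word contributes an independent factor to the class score,
# and the class with the highest product is the most likely one.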
# The multinomial Naive Bayes classifier is suitable for classification
# with discrete features (e.g., word counts for text classification).
# The multinomial distribution normally requires integer feature counts.
# However, in practice, fractional counts such as tf-idf may also work.
# => only relevant for our own BagOfWords implementation
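
# A hedged sketch of the tf-idf variant mentioned above (commented out
# on purpose, since this module sticks to raw counts; corpus and labels
# are placeholder names):
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   X_tfidf = TfidfVectorizer().fit_transform(corpus)
#   MultinomialNB().fit(X_tfidf, labels)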
from BagOfWords import BagOfWords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
# MultinomialNB used instead of GaussianNB => OK?
# GaussianNB is still needed by analyze_errors() below
from sklearn.naive_bayes import GaussianNB


class NaiveBayes:

    @staticmethod
    def make_naive_bayes(dataset):
        '''fits naive bayes model
        '''
        print('# starting naive bayes')
        print('#')

        # split data into text and label set
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        # Bag of Words
        print('# calculating bag of words')
        print('#')
        # fit the training data and then return the matrix
        # TODO: why such different (worse) scores with my own BagOfWords?
        #X = BagOfWords.fit_transform(X, False)
        X = CountVectorizer().fit_transform(X).toarray()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # use only the most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])

        grid = GridSearchCV(pipeline,
                            {'perc__percentile': [25, 50, 75, 100],
                             'NB__alpha': [1e-8, 1e-7, 1e-6, 1e-5,
                                           1e-4, 1e-3, 1e-2, 1e-1]},
                            cv=skf,
                            scoring=make_scorer(f1_score))
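
        # GridSearchCV tries every combination of feature percentile and
        # smoothing parameter alpha, scores each candidate with
        # cross-validated F1, and refits the best combination on all data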
        print('# fit classifier')
        print('#')
        grid.fit(X, y)

        # dict of cross-validation results
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('#')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('#')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])
              / len(df_results['mean_test_score']))
        print('#')
        print('best score:')
        print(grid.best_score_)
        print('#')
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print('#')

        print('# ending naive bayes')
        print('#')

    @staticmethod
    def analyze_errors(dataset):
        '''calculates the resubstitution error:
        shows indices of misclassified articles;
        uses Gaussian Naive Bayes, trained and evaluated
        on the same (full) data set
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()

        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()

        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()

        # fit classifier
        classifier.fit(training_data, y_train_test)

        # predict class
        predictions = classifier.predict(testing_data)

        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_train_test)):
            if y_train_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_train_test[i]))
                print(X_train_test[i])
                print(y_train_test[i])
                print()

        # print metrics
        print('F1 score: {}'.format(f1_score(y_train_test, predictions)))
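

if __name__ == '__main__':
    # minimal usage sketch, assuming the labeled articles live in a CSV
    # file with 'Title', 'Text' and 'Label' columns; 'articles.csv' is a
    # placeholder file name, not part of this repository
    import pandas as pd

    df = pd.read_csv('articles.csv')
    NaiveBayes.make_naive_bayes(df)
    NaiveBayes.analyze_errors(df)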