'''
Naive Bayes Classifier
======================

Naive Bayes is a probabilistic classifier that predicts a probability
distribution over a set of classes, rather than only outputting the most
likely class for an observation. 'Naive' means that the value of a
particular feature (a word in an article) is assumed to be independent of
the value of any other feature, given the label. Each feature therefore
contributes independently to the probability that the observation belongs
to a class, regardless of any possible correlations between the features.
'''
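
# Illustrative sketch (added for clarity, not used by this module): under
# the naive independence assumption, the unnormalized score of a class is
# its prior times the product of the per-feature likelihoods,
# P(c | w_1, ..., w_n) ~ P(c) * prod_i P(w_i | c).
# The helper name below is hypothetical.
def _naive_bayes_score(prior, feature_likelihoods):
    '''toy unnormalized class score for a single observation'''
    score = prior
    for likelihood in feature_likelihoods:
        score *= likelihood
    return score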

#!!
# The multinomial Naive Bayes classifier is suitable for classification
# with discrete features (e.g., word counts for text classification).
# The multinomial distribution normally requires integer feature counts.
# However, in practice, fractional counts such as tf-idf may also work.

# => only taken into account when using the custom BagOfWords implementation
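
# Minimal sketch of the note above (added for clarity, not part of this
# module's pipeline): MultinomialNB can also be fit on fractional tf-idf
# features instead of raw counts. The function name is hypothetical.
def _tfidf_naive_bayes_example(texts, labels):
    '''fits MultinomialNB on tf-idf features instead of raw word counts'''
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    features = TfidfVectorizer().fit_transform(texts)
    return MultinomialNB().fit(features, labels)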

from BagOfWords import BagOfWords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# MultinomialNB used instead of GaussianNB => OK?
# GaussianNB is still imported because analyze_errors() below uses it
from sklearn.naive_bayes import GaussianNB


class NaiveBayes:

    def make_naive_bayes(dataset):
        '''fits a multinomial naive Bayes model, using grid search over
        the feature selection percentile and the smoothing parameter alpha
        '''
        print('# starting naive bayes')
        print('#')

        # split data into text and label set
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        # Bag of Words
        print('# calculating bag of words')
        print('#')

        # fit the training data and then return the matrix
        # TODO: why such different (poor) results with the custom BOW?
        #X = BagOfWords.fit_transform(X, False)
        X = CountVectorizer().fit_transform(X).toarray()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # use only the most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])

        grid = GridSearchCV(pipeline,
                            {'perc__percentile': [25, 50, 75, 100],
                             'NB__alpha': [1e-8, 1e-7, 1e-6, 1e-5,
                                           1e-4, 1e-3, 1e-2, 1e-1]},
                            cv=skf,
                            scoring=make_scorer(f1_score))

        print('# fit classifier')
        print('#')

        grid.fit(X, y)

        # cross-validation results (dict of arrays)
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('#')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('#')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
        print('#')
        print('best score:')
        print(grid.best_score_)
        print('#')
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print('#')

        print('# ending naive bayes')
        print('#')

    def analyze_errors(dataset):
        '''calculates the resubstitution error
        shows the indices of misclassified articles
        uses Gaussian naive Bayes, trained and evaluated on the same data
        (no train/test split)
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()
        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()
        # fit classifier
        classifier.fit(training_data, y_train_test)

        # predict classes
        predictions = classifier.predict(testing_data)
        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_train_test)):
            if y_train_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_train_test[i]))
                print(X_train_test[i])
                print(y_train_test[i])
                print()

        # print metrics
        print('F1 score: ', f1_score(y_train_test, predictions))