172 lines
5.7 KiB
Python
172 lines
5.7 KiB
Python
'''
Naive Bayes Classifier
======================

Basic implementation of a Gaussian naive Bayes classifier.

Prints the per-class probabilities needed for interactive labeling.
'''
|
|
|
|
import csv

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
|
|
|
|
class NaiveBayes_Interactive:
    '''Gaussian naive Bayes classification of labelled articles.

    Prints per-fold metrics and the class probabilities needed for
    interactive labeling.
    '''

    def make_naive_bayes(dataset):
        '''Fit and evaluate a Gaussian naive Bayes model.

        Uses a bag-of-words representation of 'Title' + 'Text' and
        10-fold stratified cross-validation.  Prints precision, recall
        and F1 per fold, followed by the class priors and per-class
        sample counts observed in each fold.

        :param dataset: DataFrame with 'Title', 'Text' and 'Label' columns
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set:
        # join title and text into one document per article
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        cv = CountVectorizer()

        # stratified k-fold cross-validation as split method
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        classifier = GaussianNB()

        # metrics collected over all folds
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # class priors of each fold
        class_prob = []
        # number of training samples observed in each class (per fold)
        class_counts = []

        # kf.split yields positional indices, hence .iloc below
        for n, (train, test) in enumerate(kf.split(X, y), start=1):
            print('# split no. ' + str(n))

            # fit the vocabulary on the training fold only, then
            # vectorize; GaussianNB requires a dense array
            training_data = cv.fit_transform(X.iloc[train]).toarray()
            testing_data = cv.transform(X.iloc[test]).toarray()

            # fit classifier and predict the test fold
            classifier.fit(training_data, y.iloc[train])
            predictions_test = classifier.predict(testing_data)

            # print and store metrics
            rec = recall_score(y.iloc[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y.iloc[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # harmonic mean of precision and recall; guard the
            # degenerate case prec == rec == 0
            if prec + rec > 0:
                f1_scores.append(2 * (prec * rec) / (prec + rec))
            else:
                f1_scores.append(0.0)

            class_prob.append(classifier.class_prior_)
            class_counts.append(classifier.class_count_)

        ##########################
        # print metrics of test set
        print('-------------------------')
        print('prediction of testing set:')
        print('Precision score: min = {}, max = {}, average = {}'
              .format(min(precision_scores),
                      max(precision_scores),
                      sum(precision_scores)/float(len(precision_scores))))
        print('Recall score: min = {}, max = {}, average = {}'
              .format(min(recall_scores),
                      max(recall_scores),
                      sum(recall_scores)/float(len(recall_scores))))
        print('F1 score: min = {}, max = {}, average = {}'
              .format(min(f1_scores),
                      max(f1_scores),
                      sum(f1_scores)/float(len(f1_scores))))
        print()

        # print probability of each class
        print('probability of each class:')
        print()
        print(class_prob)
        print()
        print('number of samples of each class:')
        print()
        print(class_counts)
        print()

    ######## only needed for resubstitution error ########
    def analyze_errors(dataset):
        '''Calculate the resubstitution error.

        Fits Gaussian naive Bayes on the whole dataset, predicts the
        same data and prints the indices of misclassified articles
        plus the resulting F1 score.

        :param dataset: DataFrame with 'Title', 'Text' and 'Label' columns
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()
        # resubstitution: the 'test' set is the training set itself
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()
        # fit classifier
        classifier.fit(training_data, y_train_test)

        # predict class
        predictions = classifier.predict(testing_data)
        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_train_test)):
            # positional loop -> positional access via .iloc
            if y_train_test.iloc[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_train_test.iloc[i]))
                print(X_train_test.iloc[i])
                print(y_train_test.iloc[i])
                print()

        # print metrics (f1_score comes from sklearn.metrics)
        print('F1 score: ', format(f1_score(y_train_test, predictions)))
|
if __name__ == '__main__':

    print('# starting naive bayes')
    print('# ...')

    file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')

    # articles are '|'-separated; QUOTE_NONE because the raw text
    # contains unbalanced quote characters
    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)

    # make_naive_bayes is a class attribute (no self), so it must be
    # called through the class; the original unqualified call would
    # raise NameError at module level
    NaiveBayes_Interactive.make_naive_bayes(data)

    print('#')
    print('# ending naive bayes')