'''
Multinomial Naive Bayes Classifier
==================================
'''

from BagOfWords import BagOfWords

import csv

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

class MultinomialNaiveBayes_Word2Vec:

    @staticmethod
    def make_mnb(dataset):
        '''fits a classifier (LinearSVC here; MultinomialNB variant commented out)
        on Doc2Vec document vectors, evaluated with StratifiedKFold
        '''

        def read_corpus(data, tokens_only=False):
            '''tokenizes texts with BagOfWords and wraps them as TaggedDocuments
            unless tokens_only is set
            '''
            list_of_lists = []
            for i, text in enumerate(data):
                if tokens_only:
                    list_of_lists.append(BagOfWords.extract_words(text))
                else:
                    # for training data, add a tag (the document index)
                    list_of_lists.append(TaggedDocument(BagOfWords.extract_words(text), [i]))
            return list_of_lists

        def normalize_vector(two_dim_array, min_val, max_val):
            '''min-max scales all entries to [0, 1] (modifies the array in place)'''
            norm_array = two_dim_array
            for (x, y), value in np.ndenumerate(two_dim_array):
                norm_array[x][y] = (value - min_val) / (max_val - min_val)
            return norm_array
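
        # note: the helper above is equivalent to the vectorized expression
        # (two_dim_array - min_val) / (max_val - min_val) via NumPy broadcasting
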
        print('# starting multinomial naive bayes with Word2Vec')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        # classifier = MultinomialNB(alpha=1.0e-10,
        #                            fit_prior=False,
        #                            class_prior=None)

        # classifier = SVC(probability=True,
        #                  gamma='auto')
        classifier = LinearSVC()
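        # note: despite the class name, LinearSVC is the classifier actually fitted below;
        # the commented-out MultinomialNB variant relies on the min-max scaling to [0, 1]
        # further down, since MultinomialNB requires non-negative features
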
        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # for each fold
        n = 0
        for train, test in skf.split(X, y):

            n += 1
            print('# split no. ' + str(n))

            # tokenize and tag the documents of this fold with gensim
            tagged_train_data = read_corpus(X[train], tokens_only=False)
            # test documents are tagged as well so that doc.words can be accessed uniformly below
            tagged_test_data = read_corpus(X[test], tokens_only=False)

            # instantiate a Doc2Vec object
            model = Doc2Vec(vector_size=100,
                            min_count=20,
                            epochs=40,
                            negative=0,
                            workers=1,
                            seed=5,
                            hs=1)
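            # hs=1 with negative=0 trains with hierarchical softmax instead of negative sampling;
            # workers=1 and the fixed seed are presumably intended to keep training reproducible
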
            model.build_vocab(tagged_train_data)

            model.train(tagged_train_data,
                        total_examples=model.corpus_count,
                        epochs=model.epochs)

            # infer a fixed-length vector for every document
            X_train = [model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
            X_test = [model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]

            # convert list of vectors to matrix
            X_train = np.vstack(X_train)
            X_test = np.vstack(X_test)
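            # note: the training documents are also re-embedded with infer_vector rather than
            # reusing the vectors learned during training, presumably so that train and test
            # vectors come from the same inference procedure; steps= is the pre-4.0 gensim
            # keyword, renamed to epochs in gensim >= 4
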
            # min max for normalization
            minimum = min(X_train.min(), X_test.min())
            maximum = max(X_train.max(), X_test.max())

            X_test_norm = normalize_vector(X_test, minimum, maximum)
            X_train_norm = normalize_vector(X_train, minimum, maximum)

            # fit classifier
            classifier.fit(X_train_norm, y[train])
            # predict class
            predictions_train = classifier.predict(X_train_norm)
            predictions_test = classifier.predict(X_test_norm)

            # print and store metrics
            rec = recall_score(y[test], predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test, average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec) / (prec + rec))
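            # note: this is the harmonic mean of the weighted-averaged precision and recall,
            # which approximates but is not identical to sklearn's f1_score(average='weighted')
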
        ##########################
        # probability estimates for the test vector (testing_data)
        # class_probs = classifier.predict_proba(X_test_norm)

        # number of samples encountered for each class during fitting
        # (this value is weighted by the sample weight when provided)
        # class_count = classifier.class_count_

        # classes in order used
        # classes = classifier.classes_

        print('Recall (Min): ' + str(min(recall_scores)))
        print('Recall (Max): ' + str(max(recall_scores)))
        print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
        print()
        print('Precision (Min): ' + str(min(precision_scores)))
        print('Precision (Max): ' + str(max(precision_scores)))
        print('Precision (Average): ' + str(sum(precision_scores)/len(precision_scores)))

        # return the per-fold scores (and class_probs if predict_proba is re-enabled above)
        return recall_scores, precision_scores, f1_scores  #, class_probs

if __name__ == '__main__':

    # read csv file
    print('# reading dataset')
    print('# ...')

    # read current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1, 13),  # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # select only labeled articles (rows with Label == -1 are unlabeled)
    MultinomialNaiveBayes_Word2Vec.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))