SVM.py, NaiveBayes.py: built in grid-search, pipeline

This commit is contained in:
Anne Lorenz 2018-09-12 14:21:50 +02:00
parent 1195a161d6
commit 52146158e2
5 changed files with 230 additions and 249 deletions

View File

@ -14,6 +14,12 @@ from nltk.stem.porter import PorterStemmer
class BagOfWords:
@staticmethod
def fit_transform(X, relative_word_frequencies=True):
    '''Builds the vocabulary of X and returns the bag-of-words matrix of X
    (similar to CountVectorizer's fit_transform method).

    X: collection of article texts; it is handed unchanged to
       BagOfWords.make_vocab and BagOfWords.make_matrix
       (presumably a pandas Series -- TODO confirm).
    relative_word_frequencies: passed on to make_matrix; True selects
       relative word frequencies, False absolute word counts.
    returns: whatever BagOfWords.make_matrix returns for X and the
       freshly built vocabulary.
    '''
    # declared static because the method takes no instance and is called
    # through the class; the decorator also makes instance calls work
    vocab = BagOfWords.make_vocab(X)
    return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)
def extract_words(text):
'''takes article as argument, removes numbers,
returns list of single words, recurrences included.
@ -37,17 +43,17 @@ class BagOfWords:
return words_cleaned
def reduce_word_to_stem(word):
    '''takes a normal word as input, returns the word's stem
    as computed by a PorterStemmer
    '''
    # a fresh stemmer is created per call, exactly as before
    return PorterStemmer().stem(word)
def make_matrix(series, vocab):
def make_matrix(series, vocab, relative_word_frequencies):
'''calculates word stem frequencies in input articles.
returns matrix (DataFrame) with relative word frequencies
(0 <= values < 1)
(0 <= values < 1) or absolute word frequencies (int).
(rows: different articles, columns: different words in vocab)
'''
# create list of tuples
@ -64,8 +70,13 @@ class BagOfWords:
vector.append(0)
for w in words:
if w == v:
# add relative word frequency
vector[i] += 1/word_count
if relative_word_frequencies:
# relative word frequency
vector[i] += 1/word_count
else:
# absolute word frequency
vector[i] += 1
# add single vector as tuple
vectors.append(tuple(vector))
df_vectors = pd.DataFrame.from_records(vectors,
@ -89,7 +100,7 @@ class BagOfWords:
def set_stop_words():
'''creates list of all words that will be ignored
'''
# standard stopwords from nltk.corpus stopwords('english')
# stopwords
stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
'aren\'t', 'as', 'at', 'be', 'because', 'been',
@ -119,13 +130,12 @@ class BagOfWords:
'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
'yourselves']
# add specific words
stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
'wednesday', 'thursday', 'friday'])
# => does this make sense?:
# remove the word 'not' from stop words
##=> does this make sense?:
#add specific words
#stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
# 'wednesday', 'thursday', 'friday'])
#remove the word 'not' from stop words
#stop_words.remove('not')
for i in range(len(stop_words)):

View File

@ -9,8 +9,7 @@ holding the class labels for the training samples.
'''
import operator
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
from BagOfWords import BagOfWords
import graphviz
import numpy as np
@ -25,9 +24,8 @@ class DecisionTree:
def make_tree(dataset):
print('# starting decision tree')
print()
print('#')
# note: better results with only title, but other important words
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
@ -94,7 +92,6 @@ class DecisionTree:
#print(sorted_i_w)[:20]
i_w = [x[0] for x in sorted_i_w]
print(i_w[:20])
print()
#print metrics of test set
@ -109,4 +106,4 @@ class DecisionTree:
# print()
print('# ending decision tree')
print()
print('#')

View File

@ -11,246 +11,129 @@ given the label. It considers each of these features to contribute
independently to the probability that it belongs to its category,
regardless of any possible correlations between these features.
'''
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
#!!
# The multinomial Naive Bayes classifier is suitable
#for classification with discrete features (e.g.,
#word counts for text classification).
#The multinomial distribution normally requires
#integer feature counts. However, in practice,
#fractional counts such as tf-idf may also work.
# => only taken into account when using the custom BOW
from BagOfWords import BagOfWords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
# uses MultinomialNB instead of GaussianNB => OK?
#from sklearn.naive_bayes import GaussianNB
class NaiveBayes:
def make_naive_bayes(dataset):
'''fits naive bayes model with StratifiedKFold,
uses my BOW
'''fits naive bayes model
'''
print('# starting naive bayes')
print()
print('#')
# join title and text
# split data into text and label set
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
# Bag of Words
print('# calculating bag of words')
print('#')
# fit the training data and then return the matrix
# TODO: why are the scores so different (worse) with my own BOW?
#X = BagOfWords.fit_transform(X, False)
X = CountVectorizer().fit_transform(X).toarray()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
classifier = GaussianNB()
# lists for metrics
recall_scores = []
precision_scores = []
f1_scores = []
# for each fold
n = 0
for train, test in skf.split(X,y):
# BOW
vocab = BagOfWords.make_vocab(X[train])
# fit the training data and then return the matrix
training_data = BagOfWords.make_matrix(X[train], vocab)
# transform testing data and return the matrix
testing_data = BagOfWords.make_matrix(X[test], vocab)
#fit classifier
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
#store metrics
rec = recall_score(y[test], predictions_test)
recall_scores.append(rec)
prec = precision_score(y[train], predictions_train)
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
#print metrics of test set
print('prediction of testing set:')
print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
.format(min(f1_scores), max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
#print('overfit testing: prediction of training set')
#print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
#format(min(f1_scores_train), max(f1_scores_train),
#sum(f1_scores_train)/float(len(f1_scores_train))))
#print()
skf = StratifiedKFold(n_splits = 10, shuffle=True)
# use only most important features
selector = SelectPercentile()
pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])
grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
'NB__alpha': [0.00000001, 0.0000001,
0.000001, 0.00001,
0.0001, 0.001, 0.01,
0.1]},
cv=skf,
scoring=make_scorer(f1_score))
print('# fit classifier')
print('#')
grid.fit(X,y)
# cross-validation results (dict of arrays, convertible to a DataFrame)
df_results = grid.cv_results_
# print results
######################
print('RESULTS:')
print('#')
print('mean_test_score:')
print(df_results['mean_test_score'])
print('#')
print('mean of means:')
print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
print('#')
print('best score:')
print(grid.best_score_)
print('#')
print('best parameters set found on development set:')
print(grid.best_params_)
print('#')
print('# ending naive bayes')
print('#')
def analyze_errors(dataset):
'''calculates resubstitution error
shows indices of misclassified articles
uses Gaussian Naive Bayes, fitted and evaluated on the same data
'''
X_train_test = dataset['Title'] + ' ' + dataset['Text']
y_train_test = dataset['Label']
count_vector = CountVectorizer()
# fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train_test).toarray()
# transform testing data and return the matrix
testing_data = count_vector.transform(X_train_test).toarray()
# Naive Bayes
classifier = GaussianNB()
# fit classifier
classifier.fit(training_data, y_train_test)
# Predict class
predictions = classifier.predict(testing_data)
print('Errors at index:')
print()
# def make_naive_bayes_selectpercentile(dataset):
# '''fits naive bayes model with StratifiedKFold, uses my BOW
# feature selection: select 0.25-percentile
# '''
# print('# starting naive bayes')
# print()
# # alternative: use only articles' header => may give better results
# X = dataset['Title'] + ' ' + dataset['Text']
# y = dataset['Label']
# # use stratified k-fold cross-validation as split method
# skf = StratifiedKFold(n_splits = 10, shuffle=True)
# classifier = GaussianNB()
# # lists for metrics
# recall_scores = []
# precision_scores = []
# f1_scores = []
# # for each fold
# n = 0
# for train, test in skf.split(X,y):
# # BOW
# vocab = BagOfWords.make_vocab(X[train])
# # fit the training data and then return the matrix
# training_data = BagOfWords.make_matrix(X[train], vocab)
# # transform testing data and return the matrix
# testing_data = BagOfWords.make_matrix(X[test], vocab)
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
# #fit classifier
# classifier.fit(training_data_r, y[train])
# #predict class
# predictions_train = classifier.predict(training_data_r)
# predictions_test = classifier.predict(testing_data_r)
# #store metrics
# rec = recall_score(y[test], predictions_test)
# recall_scores.append(rec)
# prec = precision_score(y[train], predictions_train)
# precision_scores.append(prec)
# # equation for f1 score
# f1_scores.append(2 * (prec * rec)/(prec + rec))
# #print metrics of test set
# print('prediction of testing set:')
# print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
# .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
# print()
# #print('overfit testing: prediction of training set')
# #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
# #format(min(f1_scores_train), max(f1_scores_train),
# sum(f1_scores_train)/float(len(f1_scores_train))))
# #print()
# print('# ending naive bayes')
# print()
# def make_naive_bayes_CV(dataset):
# '''alternative: uses CountVectorizer (faster)
# '''
# # alternative: use only articles' header => may give better results
# X = dataset['Title'] + '.' + dataset['Text'] + '.'
# y = dataset['Label']
# # use stratified k-fold cross-validation as split method
# skf = StratifiedKFold(n_splits = 10, shuffle=True)
# count_vector = CountVectorizer()
# classifier = GaussianNB()
# # lists for metrics predicted on test/train set
# f1_scores, f1_scores_train = []
# # for each fold (10 times)
# # fold number
# n = 0
# for train, test in skf.split(X,y):
# # fit the training data and then return the matrix
# training_data = count_vector.fit_transform(X[train], y[train]).toarray()
# # transform testing data and return the matrix
# testing_data = count_vector.transform(X[test]).toarray()
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
# #fit classifier
# classifier.fit(training_data_r, y[train])
# #predict class
# predictions_train = classifier.predict(training_data_r)
# predictions_test = classifier.predict(testing_data_r)
# #store metrics predicted on test set
# f1_scores.append(f1_score(y[test], predictions_test))
# #store metrics predicted on train set
# f1_scores_train.append(f1_score(y[train], predictions_train))
# #print metrics of test set
# print('--------------------')
# print('prediction of testing set:')
# print('F1 score: min = {}, max = {}, average = {}'
# .format(min(f1_scores), max(f1_scores),
# sum(f1_scores)/float(len(f1_scores))))
# print()
# print('prediction of training set:')
# print('F1 score: min = {}, max = {}, average = {}'
# .format(min(f1_scores_train), max(f1_scores_train),
# sum(f1_scores_train)/float(len(f1_scores_train))))
# print()
# def analyze_errors_cv(dataset):
# '''calculates resubstitution error
# shows indices of false classified articles
# uses Gaussian Bayes with train test split
# '''
# X_train_test = dataset['Text']
# y_train_test = dataset['Label']
# count_vector = CountVectorizer()
# # fit the training data and then return the matrix
# training_data = count_vector.fit_transform(X_train_test).toarray()
# # transform testing data and return the matrix
# testing_data = count_vector.transform(X_train_test).toarray()
# # Naive Bayes
# classifier = GaussianNB()
# # fit classifier
# classifier.fit(training_data, y_train_test)
# # Predict class
# predictions = classifier.predict(testing_data)
# print()
# print('errors at index:')
# n = 0
# for i in range(len(y_train_test)):
# if y_train_test[i] != predictions[i]:
# n += 1
# print('error no.{}'.format(n))
# print('prediction at index {} is: {}, but actual is: {}'
# .format(i, predictions[i], y_train_test[i]))
# print(X_train_test[i])
# print(y_train_test[i])
# print()
# print()
# #print metrics
# print('F1 score: ', format(f1_score(y_train_test, predictions)))
n = 0
for i in range(len(y_train_test)):
if y_train_test[i] != predictions[i]:
n += 1
print('error no.{}'.format(n))
print('prediction at index {} is: {}, but actual is: {}'
.format(i, predictions[i], y_train_test[i]))
print(X_train_test[i])
print(y_train_test[i])
print()
#print metrics
print('F1 score: ', format(f1_score(y_train_test, predictions)))

87
SVM.py Normal file
View File

@ -0,0 +1,87 @@
'''
Support Vector Machines (SVM) Classifier
========================================
The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
from BagOfWords import BagOfWords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
class SVM:

    @staticmethod
    def make_svm(dataset):
        '''Grid-searches a SelectPercentile -> SVC pipeline on the labelled
        articles and prints the cross-validation results.

        Title and text of every article are joined and converted into a
        dense bag-of-words count matrix with CountVectorizer. GridSearchCV
        then evaluates every combination of the parameter grid below with
        10-fold shuffled, stratified cross-validation, scored with
        sklearn's f1_score.

        dataset: object indexable by the columns 'Title', 'Text' and
                 'Label' (presumably the pandas DataFrame returned by
                 CsvHandler.read_csv -- TODO confirm).
        returns: None, all results are printed to stdout.
        '''
        print('# starting SVM')
        print('#')

        # split data into text and label set
        # articles' text (title + text)
        X = dataset['Title'] + ' ' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        # Bag of Words
        print('# calculating bag of words')
        print('#')
        # fit the whole data set and return the dense count matrix
        # NOTE(review): the vocabulary is learned on all articles before
        # the folds are split; moving CountVectorizer into the pipeline
        # would keep the test folds out of the vocabulary.
        #X = BagOfWords.fit_transform(X)
        X = CountVectorizer().fit_transform(X).toarray()

        # use stratified k-fold cross-validation as split method
        # (no random_state is set, so the folds differ between runs)
        skf = StratifiedKFold(n_splits = 10, shuffle=True)

        # use only most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

        # NOTE(review): SVC documents gamma as the coefficient for the
        # 'rbf', 'poly' and 'sigmoid' kernels, so the gamma values are
        # redundant for 'linear' -- consider a list of grids.
        param_grid = {'perc__percentile': [25, 50, 75, 100],
                      'SVC__kernel': ['linear','poly','rbf','sigmoid'],
                      'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1],
                      'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]}

        grid = GridSearchCV(pipeline, param_grid,
                            cv=skf,
                            scoring=make_scorer(f1_score))

        print('# fit classifier')
        print('#')

        grid.fit(X,y)

        # cross-validation results (a dict, not a DataFrame)
        df_results = grid.cv_results_
        mean_test_scores = df_results['mean_test_score']

        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(mean_test_scores)
        print('')
        print('mean of means:')
        print(sum(mean_test_scores)/len(mean_test_scores))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()

        print('# ending SVM')
        print('#')

View File

@ -10,19 +10,23 @@ from CsvHandler import CsvHandler
from DecisionTree import DecisionTree
from NaiveBayes import NaiveBayes
#from Requester import Requester
#from SVM import SVM
from SVM import SVM
print('# starting program')
print()
print('#')
# only if new unlabeled(!) data set is required:
# Requester.save_articles_from_webhoseio()
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('#')
dataset = CsvHandler.read_csv(file)
# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)
SVM.make_svm(dataset)
print('# ending program')