SVM.py, NaiveBayes.py: built-in grid search, pipeline

This commit is contained in:
Anne Lorenz 2018-09-12 14:21:50 +02:00
parent 1195a161d6
commit 52146158e2
5 changed files with 230 additions and 249 deletions

View File

@@ -14,6 +14,12 @@ from nltk.stem.porter import PorterStemmer
 class BagOfWords:
+    def fit_transform(X, relative_word_frequencies=True):
+        '''similar to CountVectorizer's fit_transform method
+        '''
+        vocab = BagOfWords.make_vocab(X)
+        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)
+
     def extract_words(text):
         '''takes article as argument, removes numbers,
         returns list of single words, recurrences included.
@@ -37,17 +43,17 @@ class BagOfWords:
         return words_cleaned

     def reduce_word_to_stem(word):
-        '''takes normal word as input, returns the word's word stem
+        '''takes normal word as input, returns the word's stem
         '''
         stemmer = PorterStemmer()
         # replace word by its stem
         word = stemmer.stem(word)
         return word

-    def make_matrix(series, vocab):
+    def make_matrix(series, vocab, relative_word_frequencies):
         '''calculates word stem frequencies in input articles.
         returns matrix (DataFrame) with relative word frequencies
-        (0 <= values < 1)
+        (0 <= values < 1) or absolute word frequencies (int).
         (rows: different articles, columns: different words in vocab)
         '''
         # create list of tuples
@@ -64,8 +70,13 @@ class BagOfWords:
                 vector.append(0)
                 for w in words:
                     if w == v:
-                        # add relative word frequency
-                        vector[i] += 1/word_count
+                        if relative_word_frequencies:
+                            # relative word frequency
+                            vector[i] += 1/word_count
+                        else:
+                            # absolute word frequency
+                            vector[i] += 1
             # add single vector as tuple
             vectors.append(tuple(vector))
         df_vectors = pd.DataFrame.from_records(vectors,
@@ -89,7 +100,7 @@ class BagOfWords:
     def set_stop_words():
         '''creates list of all words that will be ignored
         '''
-        # standard stopwords from nltk.corpus stopwords('english')
+        # stopwords
         stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                       'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                       'aren\'t', 'as', 'at', 'be', 'because', 'been',
@@ -120,11 +131,10 @@ class BagOfWords:
                       'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                       'yourselves']

-        ##=> does this make sense?:
         #add specific words
-        stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
-                           'wednesday', 'thursday', 'friday'])
+        #stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
+        #                   'wednesday', 'thursday', 'friday'])
+
+        # => does this make sense?:
         #remove the word 'not' from stop words
         #stop_words.remove('not')
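
The new fit_transform entry point mirrors CountVectorizer's API while exposing the choice between relative and absolute term counts. A minimal usage sketch, assuming BagOfWords.py is importable and make_vocab/make_matrix behave as in this commit; the sample articles are invented for illustration:

    import pandas as pd
    from BagOfWords import BagOfWords

    articles = pd.Series(['Shares rise after merger talks.',
                          'Merger talks stall as shares fall.'])

    # absolute counts: integer term frequencies per article
    df_abs = BagOfWords.fit_transform(articles, relative_word_frequencies=False)

    # relative counts: count / total words in the article, so 0 <= value < 1
    df_rel = BagOfWords.fit_transform(articles, relative_word_frequencies=True)

    print(df_abs.shape, df_rel.shape)  # same vocabulary, different scaling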

View File

@@ -10,7 +10,6 @@ holding the class labels for the training samples.
 import operator

 from BagOfWords import BagOfWords
-from CsvHandler import CsvHandler

 import graphviz
 import numpy as np
@@ -25,9 +24,8 @@ class DecisionTree:
     def make_tree(dataset):

         print('# starting decision tree')
-        print()
+        print('#')

-        # note: better results with only title, but other important words
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
@@ -94,7 +92,6 @@ class DecisionTree:
         #print(sorted_i_w)[:20]
         i_w = [x[0] for x in sorted_i_w]
         print(i_w[:20])
-        print()

         #print metrics of test set
@@ -109,4 +106,4 @@ class DecisionTree:
         # print()

         print('# ending decision tree')
-        print()
+        print('#')

View File

@@ -11,246 +11,129 @@ given the label. It considers each of these features to contribute
 independently to the probability that it belongs to its category,
 regardless of any possible correlations between these features.
 '''

+# The multinomial Naive Bayes classifier is suitable
+#for classification with discrete features (e.g.,
+#word counts for text classification).
+#The multinomial distribution normally requires
+#integer feature counts. However, in practice,
+#fractional counts such as tf-idf may also work.
+# => only relevant when using my own BOW
+
 from BagOfWords import BagOfWords
-from CsvHandler import CsvHandler

-#from sklearn.feature_extraction.text import CountVectorizer #!!
-#from sklearn.feature_selection import SelectPercentile
-from sklearn.metrics import recall_score, precision_score
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectPercentile
+from sklearn.metrics import f1_score, make_scorer
 from sklearn.model_selection import StratifiedKFold
-#from sklearn.model_selection import train_test_split
-from sklearn.naive_bayes import GaussianNB
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
+from sklearn.naive_bayes import MultinomialNB
+
+# MultinomialNB used instead of GaussianNB => OK?
+#from sklearn.naive_bayes import GaussianNB

 class NaiveBayes:

     def make_naive_bayes(dataset):
-        '''fits naive bayes model with StratifiedKFold,
-        uses my BOW
+        '''fits naive bayes model
         '''
         print('# starting naive bayes')
-        print()
+        print('#')

-        # join title and text
+        # split data into text and label set
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']

+        # Bag of Words
+        print('# calculating bag of words')
+        print('#')
+        # fit the training data and then return the matrix
+        # toDo: why such different (bad) values with my own BOW?
+        #X = BagOfWords.fit_transform(X, False)
+        X = CountVectorizer().fit_transform(X).toarray()
+
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)

-        classifier = GaussianNB()
-
-        # lists for metrics
-        recall_scores = []
-        precision_scores = []
-        f1_scores = []
-
-        # for each fold
-        n = 0
-        for train, test in skf.split(X,y):
-            # BOW
-            vocab = BagOfWords.make_vocab(X[train])
-            # fit the training data and then return the matrix
-            training_data = BagOfWords.make_matrix(X[train], vocab)
-            # transform testing data and return the matrix
-            testing_data = BagOfWords.make_matrix(X[test], vocab)
-
-            #fit classifier
-            classifier.fit(training_data, y[train])
-            #predict class
-            predictions_train = classifier.predict(training_data)
-            predictions_test = classifier.predict(testing_data)
-
-            #store metrics
-            rec = recall_score(y[test], predictions_test)
-            recall_scores.append(rec)
-            prec = precision_score(y[train], predictions_train)
-            precision_scores.append(prec)
-            # equation for f1 score
-            f1_scores.append(2 * (prec * rec)/(prec + rec))
-
-        #print metrics of test set
-        print('prediction of testing set:')
-        print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
-              .format(min(f1_scores), max(f1_scores),
-                      sum(f1_scores)/float(len(f1_scores))))
-        print()
-        #print('overfit testing: prediction of training set')
-        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
-        #format(min(f1_scores_train), max(f1_scores_train),
-        #sum(f1_scores_train)/float(len(f1_scores_train))))
-        #print()
+        # use only most important features
+        selector = SelectPercentile()
+
+        pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])
+
+        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
+                                       'NB__alpha': [0.00000001, 0.0000001,
+                                                     0.000001, 0.00001,
+                                                     0.0001, 0.001, 0.01,
+                                                     0.1]},
+                            cv=skf,
+                            scoring=make_scorer(f1_score))
+
+        print('# fit classifier')
+        print('#')
+
+        grid.fit(X,y)
+
+        # DataFrame of results
+        df_results = grid.cv_results_
+
+        # print results
+        ######################
+        print('RESULTS:')
+        print('#')
+        print('mean_test_score:')
+        print(df_results['mean_test_score'])
+        print('#')
+        print('mean of means:')
+        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
+        print('#')
+        print('best score:')
+        print(grid.best_score_)
+        print('#')
+        print('best parameters set found on development set:')
+        print(grid.best_params_)
+        print('#')

         print('# ending naive bayes')
-        print()
+        print('#')
+
+    def analyze_errors(dataset):
+        '''calculates resubstitution error
+        shows indices of false classified articles
+        uses Gaussian Bayes with train test split
+        '''
+        X_train_test = dataset['Title'] + ' ' + dataset['Text']
+        y_train_test = dataset['Label']
+
+        count_vector = CountVectorizer()
+        # fit the training data and then return the matrix
+        training_data = count_vector.fit_transform(X_train_test).toarray()
+        # transform testing data and return the matrix
+        testing_data = count_vector.transform(X_train_test).toarray()
+
+        # Naive Bayes
+        classifier = GaussianNB()
+        # fit classifier
+        classifier.fit(training_data, y_train_test)
+
+        # Predict class
+        predictions = classifier.predict(testing_data)
+
+        print('Errors at index:')
+        print()
+        n = 0
+        for i in range(len(y_train_test)):
+            if y_train_test[i] != predictions[i]:
+                n += 1
+                print('error no.{}'.format(n))
+                print('prediction at index {} is: {}, but actual is: {}'
+                      .format(i, predictions[i], y_train_test[i]))
+                print(X_train_test[i])
+                print(y_train_test[i])
+                print()
+
+        #print metrics
+        print('F1 score: ', format(f1_score(y_train_test, predictions)))

-    # def make_naive_bayes_selectpercentile(dataset):
-    #     '''fits naive bayes model with StratifiedKFold, uses my BOW
-    #     feature selection: select 0.25-percentile
-    #     '''
-    #     print('# starting naive bayes')
-    #     print()
-    #     # alternative: use only articles' header => may give better results
-    #     X = dataset['Title'] + ' ' + dataset['Text']
-    #     y = dataset['Label']
-    #     # use stratified k-fold cross-validation as split method
-    #     skf = StratifiedKFold(n_splits = 10, shuffle=True)
-    #     classifier = GaussianNB()
-    #     # lists for metrics
-    #     recall_scores = []
-    #     precision_scores = []
-    #     f1_scores = []
-    #     # for each fold
-    #     n = 0
-    #     for train, test in skf.split(X,y):
-    #         # BOW
-    #         vocab = BagOfWords.make_vocab(X[train])
-    #         # fit the training data and then return the matrix
-    #         training_data = BagOfWords.make_matrix(X[train], vocab)
-    #         # transform testing data and return the matrix
-    #         testing_data = BagOfWords.make_matrix(X[test], vocab)
-    #         # apply select percentile
-    #         selector = SelectPercentile(percentile=25)
-    #         selector.fit(training_data, y[train])
-    #         training_data_r = selector.transform(training_data)
-    #         testing_data_r = selector.transform(testing_data)
-    #         #fit classifier
-    #         classifier.fit(training_data_r, y[train])
-    #         #predict class
-    #         predictions_train = classifier.predict(training_data_r)
-    #         predictions_test = classifier.predict(testing_data_r)
-    #         #store metrics
-    #         rec = recall_score(y[test], predictions_test)
-    #         recall_scores.append(rec)
-    #         prec = precision_score(y[train], predictions_train)
-    #         precision_scores.append(prec)
-    #         # equation for f1 score
-    #         f1_scores.append(2 * (prec * rec)/(prec + rec))
-    #     #print metrics of test set
-    #     print('prediction of testing set:')
-    #     print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
-    #           .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
-    #     print()
-    #     #print('overfit testing: prediction of training set')
-    #     #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
-    #     #format(min(f1_scores_train), max(f1_scores_train),
-    #     #sum(f1_scores_train)/float(len(f1_scores_train))))
-    #     #print()
-    #     print('# ending naive bayes')
-    #     print()
-
-    # def make_naive_bayes_CV(dataset):
-    #     '''alternative: uses CountVectorizer (faster)
-    #     '''
-    #     # alternative: use only articles' header => may give better results
-    #     X = dataset['Title'] + '.' + dataset['Text'] + '.'
-    #     y = dataset['Label']
-    #     # use stratified k-fold cross-validation as split method
-    #     skf = StratifiedKFold(n_splits = 10, shuffle=True)
-    #     count_vector = CountVectorizer()
-    #     classifier = GaussianNB()
-    #     # lists for metrics predicted on test/train set
-    #     f1_scores, f1_scores_train = []
-    #     # for each fold (10 times)
-    #     # fold number
-    #     n = 0
-    #     for train, test in skf.split(X,y):
-    #         # fit the training data and then return the matrix
-    #         training_data = count_vector.fit_transform(X[train], y[train]).toarray()
-    #         # transform testing data and return the matrix
-    #         testing_data = count_vector.transform(X[test]).toarray()
-    #         # apply select percentile
-    #         selector = SelectPercentile(percentile=25)
-    #         selector.fit(training_data, y[train])
-    #         training_data_r = selector.transform(training_data)
-    #         testing_data_r = selector.transform(testing_data)
-    #         #fit classifier
-    #         classifier.fit(training_data_r, y[train])
-    #         #predict class
-    #         predictions_train = classifier.predict(training_data_r)
-    #         predictions_test = classifier.predict(testing_data_r)
-    #         #store metrics predicted on test set
-    #         f1_scores.append(f1_score(y[test], predictions_test))
-    #         #store metrics predicted on train set
-    #         f1_scores_train.append(f1_score(y[train], predictions_train))
-    #     #print metrics of test set
-    #     print('--------------------')
-    #     print('prediction of testing set:')
-    #     print('F1 score: min = {}, max = {}, average = {}'
-    #           .format(min(f1_scores), max(f1_scores),
-    #                   sum(f1_scores)/float(len(f1_scores))))
-    #     print()
-    #     print('prediction of training set:')
-    #     print('F1 score: min = {}, max = {}, average = {}'
-    #           .format(min(f1_scores_train), max(f1_scores_train),
-    #                   sum(f1_scores_train)/float(len(f1_scores_train))))
-    #     print()
-
-    # def analyze_errors_cv(dataset):
-    #     '''calculates resubstitution error
-    #     shows indices of false classified articles
-    #     uses Gaussian Bayes with train test split
-    #     '''
-    #     X_train_test = dataset['Text']
-    #     y_train_test = dataset['Label']
-    #     count_vector = CountVectorizer()
-    #     # fit the training data and then return the matrix
-    #     training_data = count_vector.fit_transform(X_train_test).toarray()
-    #     # transform testing data and return the matrix
-    #     testing_data = count_vector.transform(X_train_test).toarray()
-    #     # Naive Bayes
-    #     classifier = GaussianNB()
-    #     # fit classifier
-    #     classifier.fit(training_data, y_train_test)
-    #     # Predict class
-    #     predictions = classifier.predict(testing_data)
-    #     print()
-    #     print('errors at index:')
-    #     n = 0
-    #     for i in range(len(y_train_test)):
-    #         if y_train_test[i] != predictions[i]:
-    #             n += 1
-    #             print('error no.{}'.format(n))
-    #             print('prediction at index {} is: {}, but actual is: {}'
-    #                   .format(i, predictions[i], y_train_test[i]))
-    #             print(X_train_test[i])
-    #             print(y_train_test[i])
-    #             print()
-    #     print()
-    #     #print metrics
-    #     print('F1 score: ', format(f1_score(y_train_test, predictions)))
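
Despite the df_results name, grid.cv_results_ is a plain dict of parallel arrays, not a DataFrame; the MultinomialNB note above also applies here, since the relative frequencies from the custom BOW are fractional counts that MultinomialNB tolerates in practice. A hedged inspection sketch, assuming the grid object fitted above; the column selection is illustrative:

    import pandas as pd

    # cv_results_ is a dict of parallel arrays; wrapping it in a DataFrame
    # makes the per-candidate grid-search scores easy to sort and inspect
    results = pd.DataFrame(grid.cv_results_)
    print(results[['param_perc__percentile', 'param_NB__alpha',
                   'mean_test_score', 'rank_test_score']]
          .sort_values('rank_test_score')
          .head())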

SVM.py Normal file
View File

@@ -0,0 +1,87 @@
+'''
+Support Vector Machines (SVM) Classifier
+========================================
+
+The SVM training algorithm builds a model from the training data that assigns
+the test samples to one category ('merger' or 'not merger'),
+making it a non-probabilistic binary linear classifier.
+An SVM model is a representation of the samples as points in space,
+mapped so that the examples of the separate categories are divided
+by a clear gap that is as wide as possible.
+New samples are then mapped into that same space and predicted
+to belong to a category based on which side of the gap they fall.
+'''
+
+from BagOfWords import BagOfWords
+
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectPercentile
+from sklearn.metrics import f1_score, make_scorer
+from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
+from sklearn.svm import SVC
+
+class SVM:
+
+    def make_svm(dataset):
+
+        print('# starting SVM')
+        print('#')
+
+        # split data into text and label set
+
+        # articles' text (title + text)
+        X = dataset['Title'] + ' ' + dataset['Text']
+
+        # articles' labels
+        y = dataset['Label']
+
+        # Bag of Words
+        print('# calculating bag of words')
+        print('#')
+        # fit the training data and then return the matrix
+        #X = BagOfWords.fit_transform(X)
+        X = CountVectorizer().fit_transform(X).toarray()
+
+        # use stratified k-fold cross-validation as split method
+        skf = StratifiedKFold(n_splits = 10, shuffle=True)
+
+        # use only most important features
+        selector = SelectPercentile()
+
+        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
+
+        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
+                                       'SVC__kernel': ['linear','poly','rbf','sigmoid'],
+                                       'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1],
+                                       'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]},
+                            cv=skf,
+                            scoring=make_scorer(f1_score))
+
+        print('# fit classifier')
+        print('#')
+
+        grid.fit(X,y)
+
+        # DataFrame of results
+        df_results = grid.cv_results_
+
+        # print results
+        ######################
+        print('RESULTS:')
+        print('')
+        print('mean_test_score:')
+        print(df_results['mean_test_score'])
+        print('')
+        print('mean of means:')
+        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
+        print('')
+        print('best score:')
+        print(grid.best_score_)
+        print()
+        print('best parameters set found on development set:')
+        print(grid.best_params_)
+        print()
+
+        print('# ending SVM')
+        print('#')
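
Note that make_svm vectorizes with a throwaway CountVectorizer, so the fitted grid cannot classify unseen articles afterwards. One possible follow-up sketch, assuming the dataset and grid objects above were kept around; new_text is invented for illustration:

    from sklearn.feature_extraction.text import CountVectorizer

    # keep the fitted vectorizer so unseen articles can be mapped into the
    # same feature space (the code above discards it after fit_transform)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(dataset['Title'] + ' ' + dataset['Text']).toarray()
    grid.fit(X, dataset['Label'])

    # GridSearchCV refits the best pipeline on the full data (refit=True),
    # so the grid itself can predict; transform, don't refit, on new text
    new_text = ['Company A agrees to acquire Company B.']
    print(grid.predict(vectorizer.transform(new_text).toarray()))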

View File

@@ -10,19 +10,23 @@ from CsvHandler import CsvHandler
 from DecisionTree import DecisionTree
 from NaiveBayes import NaiveBayes
 #from Requester import Requester
-#from SVM import SVM
+from SVM import SVM

 print('# starting program')
-print()
+print('#')

+# only if new unlabeled(!) data set is required:
 # Requester.save_articles_from_webhoseio()

 file = 'classification_labelled_corrected.csv'

 # read csv file
+print('# reading dataset')
+print('#')
 dataset = CsvHandler.read_csv(file)

 # DecisionTree.make_tree(dataset)
 NaiveBayes.make_naive_bayes(dataset)
-# SVM.make_svm(dataset)
+SVM.make_svm(dataset)

 print('# ending program')