SVM.py, NaiveBayes.py: built in grid-search, pipeline
parent 1195a161d6
commit 52146158e2
BagOfWords.py

@@ -14,6 +14,12 @@ from nltk.stem.porter import PorterStemmer

class BagOfWords:

    def fit_transform(X, relative_word_frequencies=True):
        ''' similar to CountVectorizer's fit_transform method
        '''
        vocab = BagOfWords.make_vocab(X)
        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)

    def extract_words(text):
        '''takes article as argument, removes numbers,
        returns list of single words, recurrences included.
@@ -37,17 +43,17 @@ class BagOfWords:

        return words_cleaned

    def reduce_word_to_stem(word):
        '''takes normal word as input, returns the word's word stem
        '''takes normal word as input, returns the word's stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
        word = stemmer.stem(word)
        return word

    def make_matrix(series, vocab):
    def make_matrix(series, vocab, relative_word_frequencies):
        '''calculates word stem frequencies in input articles.
        returns matrix (DataFrame) with relative word frequencies
        (0 <= values < 1)
        (0 <= values < 1) or absolute word frequencies (int).
        (rows: different articles, columns: different words in vocab)
        '''
        # create list of tuples
@@ -64,8 +70,13 @@ class BagOfWords:

                vector.append(0)
                for w in words:
                    if w == v:
                        # add relative word frequency
                        vector[i] += 1/word_count
                        if relative_word_frequencies:
                            # relative word frequency
                            vector[i] += 1/word_count
                        else:
                            # absolute word frequency
                            vector[i] += 1

            # add single vector as tuple
            vectors.append(tuple(vector))
        df_vectors = pd.DataFrame.from_records(vectors,
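The if/else above is the core of the new relative_word_frequencies switch: a vocabulary entry is counted either as a fraction of the article's word count or as a raw count. Below is a minimal standalone sketch of that counting logic; it is not the repository's BagOfWords class, and a plain whitespace split is assumed instead of extract_words.

# Illustrative only: mimics the relative/absolute counting toggle,
# assuming a plain str.split() tokenizer instead of BagOfWords.extract_words.
def count_frequencies(article, vocab, relative_word_frequencies=True):
    words = article.lower().split()
    word_count = len(words)
    vector = [0] * len(vocab)
    for i, v in enumerate(vocab):
        for w in words:
            if w == v:
                if relative_word_frequencies:
                    # relative word frequency (0 <= value < 1)
                    vector[i] += 1 / word_count
                else:
                    # absolute word frequency (int)
                    vector[i] += 1
    return vector

# example: 'big' appears twice and 'merger' once among five tokens
print(count_frequencies('big merger boosts big profits', ['big', 'merger'], True))   # [0.4, 0.2]
print(count_frequencies('big merger boosts big profits', ['big', 'merger'], False))  # [2, 1]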
@@ -89,7 +100,7 @@ class BagOfWords:

    def set_stop_words():
        '''creates list of all words that will be ignored
        '''
        # standard stopwords from nltk.corpus stopwords('english')
        # stopwords
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
@@ -119,13 +130,12 @@ class BagOfWords:

                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                      'yourselves']

        # add specific words
        stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
                           'wednesday', 'thursday', 'friday'])

        # => does this make sense?:
        # remove the word 'not' from stop words

        ##=> does this make sense?:
        #add specific words
        #stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
        #                   'wednesday', 'thursday', 'friday'])
        #remove the word 'not' from stop words
        #stop_words.remove('not')

        for i in range(len(stop_words)):
DecisionTree.py

@@ -9,8 +9,7 @@ holding the class labels for the training samples.
'''
import operator

from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
from BagOfWords import BagOfWords

import graphviz
import numpy as np

@@ -25,9 +24,8 @@ class DecisionTree:

    def make_tree(dataset):

        print('# starting decision tree')
        print()
        print('#')

        # note: better results with only title, but other important words
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

@@ -94,7 +92,6 @@ class DecisionTree:

        #print(sorted_i_w)[:20]
        i_w = [x[0] for x in sorted_i_w]
        print(i_w[:20])

        print()

        #print metrics of test set

@@ -109,4 +106,4 @@ class DecisionTree:

        # print()

        print('# ending decision tree')
        print()
        print('#')
NaiveBayes.py
@@ -11,246 +11,129 @@ given the label. It considers each of these features to contribute
independently to the probability that it belongs to its category,
regardless of any possible correlations between these features.
'''
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler

#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score

#!!
# The multinomial Naive Bayes classifier is suitable
#for classification with discrete features (e.g.,
#word counts for text classification).
#The multinomial distribution normally requires
#integer feature counts. However, in practice,
#fractional counts such as tf-idf may also work.

# => only considered when using my own BOW

from BagOfWords import BagOfWords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# MultinomialNB used instead of GaussianNB => OK?
#from sklearn.naive_bayes import GaussianNB

class NaiveBayes:

    def make_naive_bayes(dataset):
        '''fits naive bayes model with StratifiedKFold,
        uses my BOW
        '''fits naive bayes model
        '''
        print('# starting naive bayes')
        print()
        print('#')

        # join title and text
        # split data into text and label set
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        # Bag of Words
        print('# calculating bag of words')
        print('#')

        # fit the training data and then return the matrix

        # TODO: why such different (poor) values with my BOW?
        #X = BagOfWords.fit_transform(X, False)

        X = CountVectorizer().fit_transform(X).toarray()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True)

        classifier = GaussianNB()

        # lists for metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # for each fold
        n = 0
        for train, test in skf.split(X,y):
            # BOW
            vocab = BagOfWords.make_vocab(X[train])
            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(X[train], vocab)
            # transform testing data and return the matrix
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            #fit classifier
            classifier.fit(training_data, y[train])
            #predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            #store metrics
            rec = recall_score(y[test], predictions_test)
            recall_scores.append(rec)
            prec = precision_score(y[train], predictions_train)
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))

        #print metrics of test set
        print('prediction of testing set:')
        print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
              .format(min(f1_scores), max(f1_scores),
                      sum(f1_scores)/float(len(f1_scores))))
        print()
        #print('overfit testing: prediction of training set')
        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
        #format(min(f1_scores_train), max(f1_scores_train),
        #sum(f1_scores_train)/float(len(f1_scores_train))))
        #print()

        skf = StratifiedKFold(n_splits = 10, shuffle=True)

        # use only most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])

        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
                                       'NB__alpha': [0.00000001, 0.0000001,
                                                     0.000001, 0.00001,
                                                     0.0001, 0.001, 0.01,
                                                     0.1]},
                            cv=skf,
                            scoring=make_scorer(f1_score))

        print('# fit classifier')
        print('#')

        grid.fit(X,y)

        # DataFrame of results
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('#')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('#')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
        print('#')
        print('best score:')
        print(grid.best_score_)
        print('#')
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print('#')

        print('# ending naive bayes')
        print('#')
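For reference, the new pipeline and grid-search combination can be exercised on its own, outside the repository. The following is a minimal, self-contained sketch of the same pattern (SelectPercentile feature selection feeding MultinomialNB, tuned with GridSearchCV over a StratifiedKFold split); the toy corpus, labels and the reduced parameter grid are made up for illustration and are not part of this commit.

# Minimal, self-contained sketch of the pipeline + grid-search pattern used above.
# Toy corpus and labels are invented; the parameter grid is shortened.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

corpus = ['company a acquires company b',
          'merger talks announced on monday',
          'quarterly earnings beat estimates',
          'new product launch planned'] * 5
labels = [1, 1, 0, 0] * 5

# bag of words, as in make_naive_bayes
X = CountVectorizer().fit_transform(corpus).toarray()

pipeline = Pipeline([('perc', SelectPercentile()),
                     ('NB', MultinomialNB())])

grid = GridSearchCV(pipeline,
                    {'perc__percentile': [50, 100],
                     'NB__alpha': [0.01, 0.1, 1.0]},
                    cv=StratifiedKFold(n_splits=2, shuffle=True),
                    scoring=make_scorer(f1_score))

grid.fit(X, labels)
print(grid.best_params_)   # e.g. {'NB__alpha': 0.1, 'perc__percentile': 100}
print(grid.best_score_)    # mean F1 over the folds for the best setting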
    def analyze_errors(dataset):
        '''calculates resubstitution error
        shows indices of falsely classified articles
        uses Gaussian Bayes with train test split
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()
        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()
        # fit classifier
        classifier.fit(training_data, y_train_test)

        # Predict class
        predictions = classifier.predict(testing_data)
        print('Errors at index:')
        print()

# def make_naive_bayes_selectpercentile(dataset):
#     '''fits naive bayes model with StratifiedKFold, uses my BOW
#     feature selection: select 0.25-percentile
#     '''

#     print('# starting naive bayes')
#     print()

#     # alternative: use only articles' header => may give better results
#     X = dataset['Title'] + ' ' + dataset['Text']
#     y = dataset['Label']

#     # use stratified k-fold cross-validation as split method
#     skf = StratifiedKFold(n_splits = 10, shuffle=True)

#     classifier = GaussianNB()

#     # lists for metrics
#     recall_scores = []
#     precision_scores = []
#     f1_scores = []

#     # for each fold
#     n = 0
#     for train, test in skf.split(X,y):
#         # BOW
#         vocab = BagOfWords.make_vocab(X[train])
#         # fit the training data and then return the matrix
#         training_data = BagOfWords.make_matrix(X[train], vocab)
#         # transform testing data and return the matrix
#         testing_data = BagOfWords.make_matrix(X[test], vocab)

#         # apply select percentile
#         selector = SelectPercentile(percentile=25)
#         selector.fit(training_data, y[train])

#         training_data_r = selector.transform(training_data)
#         testing_data_r = selector.transform(testing_data)

#         #fit classifier
#         classifier.fit(training_data_r, y[train])
#         #predict class
#         predictions_train = classifier.predict(training_data_r)
#         predictions_test = classifier.predict(testing_data_r)

#         #store metrics
#         rec = recall_score(y[test], predictions_test)
#         recall_scores.append(rec)
#         prec = precision_score(y[train], predictions_train)
#         precision_scores.append(prec)
#         # equation for f1 score
#         f1_scores.append(2 * (prec * rec)/(prec + rec))

#     #print metrics of test set
#     print('prediction of testing set:')
#     print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
#           .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
#     print()
#     #print('overfit testing: prediction of training set')
#     #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
#     #format(min(f1_scores_train), max(f1_scores_train),
#     #       sum(f1_scores_train)/float(len(f1_scores_train))))
#     #print()

#     print('# ending naive bayes')
#     print()

# def make_naive_bayes_CV(dataset):
#     '''alternative: uses CountVectorizer (faster)
#     '''
#     # alternative: use only articles' header => may give better results
#     X = dataset['Title'] + '.' + dataset['Text'] + '.'
#     y = dataset['Label']

#     # use stratified k-fold cross-validation as split method
#     skf = StratifiedKFold(n_splits = 10, shuffle=True)

#     count_vector = CountVectorizer()

#     classifier = GaussianNB()

#     # lists for metrics predicted on test/train set
#     f1_scores, f1_scores_train = []

#     # for each fold (10 times)
#     # fold number
#     n = 0
#     for train, test in skf.split(X,y):

#         # fit the training data and then return the matrix
#         training_data = count_vector.fit_transform(X[train], y[train]).toarray()
#         # transform testing data and return the matrix
#         testing_data = count_vector.transform(X[test]).toarray()

#         # apply select percentile
#         selector = SelectPercentile(percentile=25)
#         selector.fit(training_data, y[train])

#         training_data_r = selector.transform(training_data)
#         testing_data_r = selector.transform(testing_data)

#         #fit classifier
#         classifier.fit(training_data_r, y[train])

#         #predict class
#         predictions_train = classifier.predict(training_data_r)
#         predictions_test = classifier.predict(testing_data_r)

#         #store metrics predicted on test set
#         f1_scores.append(f1_score(y[test], predictions_test))

#         #store metrics predicted on train set
#         f1_scores_train.append(f1_score(y[train], predictions_train))

#     #print metrics of test set
#     print('--------------------')
#     print('prediction of testing set:')
#     print('F1 score: min = {}, max = {}, average = {}'
#           .format(min(f1_scores), max(f1_scores),
#                   sum(f1_scores)/float(len(f1_scores))))

#     print()
#     print('prediction of training set:')
#     print('F1 score: min = {}, max = {}, average = {}'
#           .format(min(f1_scores_train), max(f1_scores_train),
#                   sum(f1_scores_train)/float(len(f1_scores_train))))
#     print()

# def analyze_errors_cv(dataset):
#     '''calculates resubstitution error
#     shows indices of falsely classified articles
#     uses Gaussian Bayes with train test split
#     '''

#     X_train_test = dataset['Text']
#     y_train_test = dataset['Label']

#     count_vector = CountVectorizer()

#     # fit the training data and then return the matrix
#     training_data = count_vector.fit_transform(X_train_test).toarray()

#     # transform testing data and return the matrix
#     testing_data = count_vector.transform(X_train_test).toarray()

#     # Naive Bayes
#     classifier = GaussianNB()

#     # fit classifier
#     classifier.fit(training_data, y_train_test)

#     # Predict class
#     predictions = classifier.predict(testing_data)

#     print()
#     print('errors at index:')
#     n = 0
#     for i in range(len(y_train_test)):
#         if y_train_test[i] != predictions[i]:
#             n += 1
#             print('error no.{}'.format(n))
#             print('prediction at index {} is: {}, but actual is: {}'
#                   .format(i, predictions[i], y_train_test[i]))
#             print(X_train_test[i])
#             print(y_train_test[i])
#             print()

#     print()
#     #print metrics
#     print('F1 score: ', format(f1_score(y_train_test, predictions)))

        n = 0
        for i in range(len(y_train_test)):
            if y_train_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_train_test[i]))
                print(X_train_test[i])
                print(y_train_test[i])
                print()

        #print metrics
        print('F1 score: ', format(f1_score(y_train_test, predictions)))
SVM.py (new file)

@@ -0,0 +1,87 @@
'''
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''

from BagOfWords import BagOfWords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

class SVM:

    def make_svm(dataset):

        print('# starting SVM')
        print('#')

        # split data into text and label set

        # articles' text (title + text)
        X = dataset['Title'] + ' ' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        # Bag of Words
        print('# calculating bag of words')
        print('#')
        # fit the training data and then return the matrix
        #X = BagOfWords.fit_transform(X)
        X = CountVectorizer().fit_transform(X).toarray()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True)

        # use only most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
                                       'SVC__kernel': ['linear','poly','rbf','sigmoid'],
                                       'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1],
                                       'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]},
                            cv=skf,
                            scoring=make_scorer(f1_score))

        print('# fit classifier')
        print('#')

        grid.fit(X,y)

        # DataFrame of results
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()

        print('# ending SVM')
        print('#')
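Note that make_svm vectorizes the articles with a CountVectorizer created inline and not kept, so the fitted grid cannot be applied to unseen text afterwards. A possible follow-up, sketched below and not part of this commit, keeps the fitted vectorizer so that the best pipeline found by the grid search can classify new articles; the function name and the shortened parameter grid are illustrative, and a dataset with 'Title', 'Text' and 'Label' columns is assumed, as elsewhere in the repository.

# Sketch of a possible follow-up step (not in this commit): keep the fitted
# CountVectorizer so the best pipeline found by GridSearchCV can classify
# unseen articles. Parameter grid shortened for brevity.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

def fit_and_predict(dataset, new_texts):
    X_raw = dataset['Title'] + ' ' + dataset['Text']
    y = dataset['Label']

    # keep the vectorizer so its vocabulary can be reused later
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(X_raw).toarray()

    pipeline = Pipeline([('perc', SelectPercentile()), ('SVC', SVC())])
    grid = GridSearchCV(pipeline,
                        {'perc__percentile': [50, 100],
                         'SVC__kernel': ['linear', 'rbf'],
                         'SVC__C': [0.1, 1]},
                        cv=StratifiedKFold(n_splits=10, shuffle=True),
                        scoring=make_scorer(f1_score))
    grid.fit(X, y)

    # unseen articles are mapped into the same feature space, then classified
    X_new = vectorizer.transform(new_texts).toarray()
    return grid.best_estimator_.predict(X_new)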
Starter.py

@@ -10,19 +10,23 @@ from CsvHandler import CsvHandler
from DecisionTree import DecisionTree
from NaiveBayes import NaiveBayes
#from Requester import Requester
#from SVM import SVM
from SVM import SVM

print('# starting program')
print()
print('#')

# only if new unlabeled(!) data set is required:
# Requester.save_articles_from_webhoseio()

file = 'classification_labelled_corrected.csv'

# read csv file
print('# reading dataset')
print('#')
dataset = CsvHandler.read_csv(file)

# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)
SVM.make_svm(dataset)

print('# ending program')