SVM.py, NaiveBayes.py: built in grid search, pipeline

commit 52146158e2
parent 1195a161d6
BagOfWords.py

@@ -14,6 +14,12 @@ from nltk.stem.porter import PorterStemmer
 
 class BagOfWords:
 
+    def fit_transform(X, relative_word_frequencies=True):
+        ''' similar to CountVectorizer's fit_transform method
+        '''
+        vocab = BagOfWords.make_vocab(X)
+        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)
+
     def extract_words(text):
         '''takes article as argument, removes numbers,
         returns list of single words, recurrences included.
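A minimal usage sketch of the new fit_transform toggle; the `articles` Series below is hypothetical and not part of the commit:

    import pandas as pd
    from BagOfWords import BagOfWords

    articles = pd.Series(['Daimler buys tech firm.', 'Weather stays mild.'])

    # default: relative word frequencies per article
    df_rel = BagOfWords.fit_transform(articles)

    # absolute integer counts, e.g. for MultinomialNB's count assumption
    df_abs = BagOfWords.fit_transform(articles, relative_word_frequencies=False)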
@@ -37,17 +43,17 @@ class BagOfWords:
         return words_cleaned
 
     def reduce_word_to_stem(word):
-        '''takes normal word as input, returns the word's word stem
+        '''takes normal word as input, returns the word's stem
         '''
         stemmer = PorterStemmer()
         # replace word by its stem
         word = stemmer.stem(word)
         return word
 
-    def make_matrix(series, vocab):
+    def make_matrix(series, vocab, relative_word_frequencies):
         '''calculates word stem frequencies in input articles.
         returns matrix (DataFrame) with relative word frequencies
-        (0 <= values < 1)
+        (0 <= values < 1) or absolute word frequencies (int).
         (rows: different articles, columns: different words in vocab)
         '''
         # create list of tuples
@@ -64,8 +70,13 @@ class BagOfWords:
                 vector.append(0)
                 for w in words:
                     if w == v:
-                        # add relative word frequency
-                        vector[i] += 1/word_count
+                        if relative_word_frequencies:
+                            # relative word frequency
+                            vector[i] += 1/word_count
+                        else:
+                            # absolute word frequency
+                            vector[i] += 1
 
             # add single vector as tuple
             vectors.append(tuple(vector))
         df_vectors = pd.DataFrame.from_records(vectors,
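A small worked check of the two counting modes implemented above, with a hypothetical word list:

    words = ['merger', 'deal', 'merger', 'fails']   # word_count = 4
    vocab = sorted(set(words))                      # ['deal', 'fails', 'merger']

    absolute = {v: words.count(v) for v in vocab}               # 'merger' -> 2
    relative = {v: words.count(v) / len(words) for v in vocab}  # 'merger' -> 0.5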
@@ -89,7 +100,7 @@ class BagOfWords:
     def set_stop_words():
         '''creates list of all words that will be ignored
         '''
-        # standard stopwords from nltk.corpus stopwords('english')
+        # stopwords
         stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                       'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                       'aren\'t', 'as', 'at', 'be', 'because', 'been',
@@ -119,13 +130,12 @@ class BagOfWords:
                       'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                       'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                       'yourselves']
 
-        # add specific words
-        stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
-                           'wednesday', 'thursday', 'friday'])
-        # => does this make sense?:
-        # remove the word 'not' from stop words
+        ##=> does this make sense?:
+        #add specific words
+        #stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
+        #                   'wednesday', 'thursday', 'friday'])
+        #remove the word 'not' from stop words
         #stop_words.remove('not')
 
         for i in range(len(stop_words)):
DecisionTree.py

@@ -9,8 +9,7 @@ holding the class labels for the training samples.
 '''
 import operator
 
 from BagOfWords import BagOfWords
-from CsvHandler import CsvHandler
 
 import graphviz
 import numpy as np

@@ -25,9 +24,8 @@ class DecisionTree:
     def make_tree(dataset):
 
         print('# starting decision tree')
-        print()
+        print('#')
 
-        # note: better results with only title, but other important words
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
 

@@ -94,7 +92,6 @@ class DecisionTree:
         #print(sorted_i_w)[:20]
         i_w = [x[0] for x in sorted_i_w]
         print(i_w[:20])
 
         print()
-
         #print metrics of test set

@@ -109,4 +106,4 @@ class DecisionTree:
         # print()
 
         print('# ending decision tree')
-        print()
+        print('#')
NaiveBayes.py
@@ -11,246 +11,129 @@ given the label. It considers each of these features to contribute
 independently to the probability that it belongs to its category,
 regardless of any possible correlations between these features.
 '''
-from BagOfWords import BagOfWords
-from CsvHandler import CsvHandler
-
-#from sklearn.feature_extraction.text import CountVectorizer
-#from sklearn.feature_selection import SelectPercentile
-from sklearn.metrics import recall_score, precision_score
+#!!
+# The multinomial Naive Bayes classifier is suitable
+#for classification with discrete features (e.g.,
+#word counts for text classification).
+#The multinomial distribution normally requires
+#integer feature counts. However, in practice,
+#fractional counts such as tf-idf may also work.
+
+# => only considered when using my own BOW
+
+from BagOfWords import BagOfWords
+
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectPercentile
+from sklearn.metrics import f1_score, make_scorer
 from sklearn.model_selection import StratifiedKFold
-#from sklearn.model_selection import train_test_split
-from sklearn.naive_bayes import GaussianNB
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
+from sklearn.naive_bayes import MultinomialNB
+
+# MultinomialNB used instead of GaussianNB => OK?
+#from sklearn.naive_bayes import GaussianNB
 
 class NaiveBayes:
 
     def make_naive_bayes(dataset):
-        '''fits naive bayes model with StratifiedKFold,
-        uses my BOW
+        '''fits naive bayes model
         '''
         print('# starting naive bayes')
-        print()
+        print('#')
 
-        # join title and text
+        # split data into text and label set
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
 
+        # Bag of Words
+        print('# calculating bag of words')
+        print('#')
+
+        # fit the training data and then return the matrix
+
+        # TODO: why such different (bad) values with my BOW?
+        #X = BagOfWords.fit_transform(X, False)
+
+        X = CountVectorizer().fit_transform(X).toarray()
+
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)
 
-        classifier = GaussianNB()
-
-        # lists for metrics
-        recall_scores = []
-        precision_scores = []
-        f1_scores = []
-
-        # for each fold
-        n = 0
-        for train, test in skf.split(X,y):
-            # BOW
-            vocab = BagOfWords.make_vocab(X[train])
-            # fit the training data and then return the matrix
-            training_data = BagOfWords.make_matrix(X[train], vocab)
-            # transform testing data and return the matrix
-            testing_data = BagOfWords.make_matrix(X[test], vocab)
-
-            #fit classifier
-            classifier.fit(training_data, y[train])
-            #predict class
-            predictions_train = classifier.predict(training_data)
-            predictions_test = classifier.predict(testing_data)
-
-            #store metrics
-            rec = recall_score(y[test], predictions_test)
-            recall_scores.append(rec)
-            prec = precision_score(y[train], predictions_train)
-            precision_scores.append(prec)
-            # equation for f1 score
-            f1_scores.append(2 * (prec * rec)/(prec + rec))
-
-        #print metrics of test set
-        print('prediction of testing set:')
-        print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
-              .format(min(f1_scores), max(f1_scores),
-                      sum(f1_scores)/float(len(f1_scores))))
-        print()
-        #print('overfit testing: prediction of training set')
-        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
-        #format(min(f1_scores_train), max(f1_scores_train),
-        #sum(f1_scores_train)/float(len(f1_scores_train))))
-        #print()
+        # use only most important features
+        selector = SelectPercentile()
+
+        pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])
+
+        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
+                                       'NB__alpha': [0.00000001, 0.0000001,
+                                                     0.000001, 0.00001,
+                                                     0.0001, 0.001, 0.01,
+                                                     0.1]},
+                            cv=skf,
+                            scoring=make_scorer(f1_score))
+
+        print('# fit classifier')
+        print('#')
+
+        grid.fit(X,y)
+
+        # DataFrame of results
+        df_results = grid.cv_results_
+
+        # print results
+        ######################
+        print('RESULTS:')
+        print('#')
+        print('mean_test_score:')
+        print(df_results['mean_test_score'])
+        print('#')
+        print('mean of means:')
+        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
+        print('#')
+        print('best score:')
+        print(grid.best_score_)
+        print('#')
+        print('best parameters set found on development set:')
+        print(grid.best_params_)
+        print('#')
 
         print('# ending naive bayes')
-        print()
-
-# def make_naive_bayes_selectpercentile(dataset):
-#     '''fits naive bayes model with StratifiedKFold, uses my BOW
-#     feature selection: select 0.25-percentile
-#     '''
-
-#     print('# starting naive bayes')
-#     print()
-
-#     # alternative: use only articles' header => may give better results
-#     X = dataset['Title'] + ' ' + dataset['Text']
-#     y = dataset['Label']
-
-#     # use stratified k-fold cross-validation as split method
-#     skf = StratifiedKFold(n_splits = 10, shuffle=True)
-
-#     classifier = GaussianNB()
-
-#     # lists for metrics
-#     recall_scores = []
-#     precision_scores = []
-#     f1_scores = []
-
-#     # for each fold
-#     n = 0
-#     for train, test in skf.split(X,y):
-#         # BOW
-#         vocab = BagOfWords.make_vocab(X[train])
-#         # fit the training data and then return the matrix
-#         training_data = BagOfWords.make_matrix(X[train], vocab)
-#         # transform testing data and return the matrix
-#         testing_data = BagOfWords.make_matrix(X[test], vocab)
-
-#         # apply select percentile
-#         selector = SelectPercentile(percentile=25)
-#         selector.fit(training_data, y[train])
-
-#         training_data_r = selector.transform(training_data)
-#         testing_data_r = selector.transform(testing_data)
-
-#         #fit classifier
-#         classifier.fit(training_data_r, y[train])
-#         #predict class
-#         predictions_train = classifier.predict(training_data_r)
-#         predictions_test = classifier.predict(testing_data_r)
-
-#         #store metrics
-#         rec = recall_score(y[test], predictions_test)
-#         recall_scores.append(rec)
-#         prec = precision_score(y[train], predictions_train)
-#         precision_scores.append(prec)
-#         # equation for f1 score
-#         f1_scores.append(2 * (prec * rec)/(prec + rec))
-
-#     #print metrics of test set
-#     print('prediction of testing set:')
-#     print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
-#           .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
-#     print()
-#     #print('overfit testing: prediction of training set')
-#     #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
-#     #format(min(f1_scores_train), max(f1_scores_train),
-#     #sum(f1_scores_train)/float(len(f1_scores_train))))
-#     #print()
-
-#     print('# ending naive bayes')
-#     print()
-
-# def make_naive_bayes_CV(dataset):
-#     '''alternative: uses CountVectorizer (faster)
-#     '''
-#     # alternative: use only articles' header => may give better results
-#     X = dataset['Title'] + '.' + dataset['Text'] + '.'
-#     y = dataset['Label']
-
-#     # use stratified k-fold cross-validation as split method
-#     skf = StratifiedKFold(n_splits = 10, shuffle=True)
-
-#     count_vector = CountVectorizer()
-
-#     classifier = GaussianNB()
-
-#     # lists for metrics predicted on test/train set
-#     f1_scores, f1_scores_train = []
-
-#     # for each fold (10 times)
-#     # fold number
-#     n = 0
-#     for train, test in skf.split(X,y):
-
-#         # fit the training data and then return the matrix
-#         training_data = count_vector.fit_transform(X[train], y[train]).toarray()
-#         # transform testing data and return the matrix
-#         testing_data = count_vector.transform(X[test]).toarray()
-
-#         # apply select percentile
-#         selector = SelectPercentile(percentile=25)
-#         selector.fit(training_data, y[train])
-
-#         training_data_r = selector.transform(training_data)
-#         testing_data_r = selector.transform(testing_data)
-
-#         #fit classifier
-#         classifier.fit(training_data_r, y[train])
-
-#         #predict class
-#         predictions_train = classifier.predict(training_data_r)
-#         predictions_test = classifier.predict(testing_data_r)
-
-#         #store metrics predicted on test set
-#         f1_scores.append(f1_score(y[test], predictions_test))
-
-#         #store metrics predicted on train set
-#         f1_scores_train.append(f1_score(y[train], predictions_train))
-
-#     #print metrics of test set
-#     print('--------------------')
-#     print('prediction of testing set:')
-#     print('F1 score: min = {}, max = {}, average = {}'
-#           .format(min(f1_scores), max(f1_scores),
-#                   sum(f1_scores)/float(len(f1_scores))))
-
-#     print()
-#     print('prediction of training set:')
-#     print('F1 score: min = {}, max = {}, average = {}'
-#           .format(min(f1_scores_train), max(f1_scores_train),
-#                   sum(f1_scores_train)/float(len(f1_scores_train))))
-#     print()
-
-# def analyze_errors_cv(dataset):
-#     '''calculates resubstitution error
-#     shows indices of false classified articles
-#     uses Gaussian Bayes with train test split
-#     '''
-
-#     X_train_test = dataset['Text']
-#     y_train_test = dataset['Label']
-
-#     count_vector = CountVectorizer()
-
-#     # fit the training data and then return the matrix
-#     training_data = count_vector.fit_transform(X_train_test).toarray()
-
-#     # transform testing data and return the matrix
-#     testing_data = count_vector.transform(X_train_test).toarray()
-
-#     # Naive Bayes
-#     classifier = GaussianNB()
-
-#     # fit classifier
-#     classifier.fit(training_data, y_train_test)
-
-#     # Predict class
-#     predictions = classifier.predict(testing_data)
-
-#     print()
-#     print('errors at index:')
-#     n = 0
-#     for i in range(len(y_train_test)):
-#         if y_train_test[i] != predictions[i]:
-#             n += 1
-#             print('error no.{}'.format(n))
-#             print('prediction at index {} is: {}, but actual is: {}'
-#                   .format(i, predictions[i], y_train_test[i]))
-#             print(X_train_test[i])
-#             print(y_train_test[i])
-#             print()
-
-#     print()
-#     #print metrics
-#     print('F1 score: ', format(f1_score(y_train_test, predictions)))
+        print('#')
+
+    def analyze_errors(dataset):
+        '''calculates resubstitution error
+        shows indices of false classified articles
+        uses Gaussian Bayes with train test split
+        '''
+        X_train_test = dataset['Title'] + ' ' + dataset['Text']
+        y_train_test = dataset['Label']
+
+        count_vector = CountVectorizer()
+        # fit the training data and then return the matrix
+        training_data = count_vector.fit_transform(X_train_test).toarray()
+        # transform testing data and return the matrix
+        testing_data = count_vector.transform(X_train_test).toarray()
+
+        # Naive Bayes
+        classifier = GaussianNB()
+        # fit classifier
+        classifier.fit(training_data, y_train_test)
+
+        # Predict class
+        predictions = classifier.predict(testing_data)
+        print('Errors at index:')
+        print()
+        n = 0
+        for i in range(len(y_train_test)):
+            if y_train_test[i] != predictions[i]:
+                n += 1
+                print('error no.{}'.format(n))
+                print('prediction at index {} is: {}, but actual is: {}'
+                      .format(i, predictions[i], y_train_test[i]))
+                print(X_train_test[i])
+                print(y_train_test[i])
+                print()
+        #print metrics
+        print('F1 score: ', format(f1_score(y_train_test, predictions)))
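A caveat on the `# DataFrame of results` comment above: GridSearchCV's cv_results_ attribute is a plain dict of arrays, not a DataFrame. A sketch of wrapping it for inspection, reusing the fitted `grid` and the step names ('perc', 'NB') from this commit:

    import pandas as pd

    df = pd.DataFrame(grid.cv_results_)
    # per-candidate scores, keyed by the pipeline parameter names used above
    print(df[['param_perc__percentile', 'param_NB__alpha',
              'mean_test_score', 'rank_test_score']])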
SVM.py

@@ -0,0 +1,87 @@
+'''
+Support Vector Machines (SVM) Classifier
+========================================
+
+The SVM training algorithm builds a model from the training data that assigns
+the test samples to one category ('merger' or 'not merger'),
+making it a non-probabilistic binary linear classifier.
+An SVM model is a representation of the samples as points in space,
+mapped so that the examples of the separate categories are divided
+by a clear gap that is as wide as possible.
+New samples are then mapped into that same space and predicted
+to belong to a category based on which side of the gap they fall.
+'''
+
+from BagOfWords import BagOfWords
+
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectPercentile
+from sklearn.metrics import f1_score, make_scorer
+from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
+from sklearn.svm import SVC
+
+class SVM:
+
+    def make_svm(dataset):
+
+        print('# starting SVM')
+        print('#')
+
+        # split data into text and label set
+
+        # articles' text (title + text)
+        X = dataset['Title'] + ' ' + dataset['Text']
+        # articles' labels
+        y = dataset['Label']
+
+        # Bag of Words
+        print('# calculating bag of words')
+        print('#')
+        # fit the training data and then return the matrix
+        #X = BagOfWords.fit_transform(X)
+        X = CountVectorizer().fit_transform(X).toarray()
+
+        # use stratified k-fold cross-validation as split method
+        skf = StratifiedKFold(n_splits = 10, shuffle=True)
+
+        # use only most important features
+        selector = SelectPercentile()
+
+        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
+
+        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
+                                       'SVC__kernel': ['linear','poly','rbf','sigmoid'],
+                                       'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1],
+                                       'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]},
+                            cv=skf,
+                            scoring=make_scorer(f1_score))
+
+        print('# fit classifier')
+        print('#')
+
+        grid.fit(X,y)
+
+        # DataFrame of results
+        df_results = grid.cv_results_
+
+        # print results
+        ######################
+        print('RESULTS:')
+        print('')
+        print('mean_test_score:')
+        print(df_results['mean_test_score'])
+        print('')
+        print('mean of means:')
+        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
+        print('')
+        print('best score:')
+        print(grid.best_score_)
+        print()
+        print('best parameters set found on development set:')
+        print(grid.best_params_)
+        print()
+
+        print('# ending SVM')
+        print('#')
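The full grid above (4 kernels x 5 gammas x 5 Cs x 4 percentiles, each cross-validated over 10 folds) is expensive to fit. A reduced, hypothetical smoke-test sketch under the same imports, reusing the X and y built in make_svm; note that gamma is ignored by the linear kernel, and scoring='f1' is equivalent to make_scorer(f1_score):

    small_grid = GridSearchCV(
        Pipeline([('perc', SelectPercentile()), ('SVC', SVC())]),
        {'perc__percentile': [50],
         'SVC__kernel': ['linear'],   # gamma is ignored for 'linear'
         'SVC__C': [0.1, 1]},
        cv=StratifiedKFold(n_splits=3, shuffle=True),
        scoring='f1')
    small_grid.fit(X, y)
    print(small_grid.best_params_, small_grid.best_score_)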
Starter.py
@@ -10,19 +10,23 @@ from CsvHandler import CsvHandler
 from DecisionTree import DecisionTree
 from NaiveBayes import NaiveBayes
 #from Requester import Requester
-#from SVM import SVM
+from SVM import SVM
 
 print('# starting program')
-print()
+print('#')
 
 # only if new unlabeled(!) data set is required:
 # Requester.save_articles_from_webhoseio()
 
 file = 'classification_labelled_corrected.csv'
 
 # read csv file
+print('# reading dataset')
+print('#')
 dataset = CsvHandler.read_csv(file)
 
 # DecisionTree.make_tree(dataset)
 NaiveBayes.make_naive_bayes(dataset)
-# SVM.make_svm(dataset)
+SVM.make_svm(dataset)
 
 print('# ending program')