# thesis-anne/SVM.py
'''
Support Vector Machines (SVM) Classifier
========================================
The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
from BagOfWords import BagOfWords
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
class SVM:
    '''Grid-searched Support Vector Machine classifier for news articles.

    Exposes a single entry point, :meth:`make_svm`, which vectorizes the
    article texts, runs a stratified 10-fold grid search over an
    SVC pipeline, and prints the cross-validation results.
    '''

    @staticmethod
    def make_svm(dataset):
        '''Fit and evaluate an SVM on a labelled article dataset.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Must contain the columns 'Title', 'Text' (article strings)
            and 'Label' (binary class labels).

        Returns
        -------
        None
            Results are printed to stdout only.
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set:
        # articles' text (title + text concatenated into one string)
        X = dataset['Title'] + ' ' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        # Bag of Words
        print('# calculating bag of words')
        print('# ...')
        # fit the training data and then return the document-term matrix
        #X = BagOfWords.fit_transform(X)
        X = CountVectorizer().fit_transform(X).toarray()

        # use stratified k-fold cross-validation as split method
        # (shuffle=True => folds differ between runs; no fixed random_state)
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # keep only the most informative features before the SVC
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

        # exhaustive search over feature percentile and SVC hyperparameters,
        # scored with the F1 measure (binary classification)
        grid = GridSearchCV(pipeline,
                            {'perc__percentile': [50, 75],
                             'SVC__kernel': ['linear'],
                             'SVC__gamma': [0.00001, 0.0001],
                             'SVC__C': [0.1, 1]},
                            cv=skf,
                            scoring=make_scorer(f1_score))

        print('# fit classifier')
        print('# ...')
        grid.fit(X, y)

        # cv_results_ is a dict of arrays (one entry per parameter combination)
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('')
        print('mean of means:')
        print(sum(df_results['mean_test_score']) / len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()
if __name__ == '__main__':
    print('# starting svm')
    print('# ...')
    file = 'classification_labelled_corrected.csv'
    # read csv file ('|'-separated, quotes kept verbatim via QUOTE_NONE)
    print('# reading dataset')
    print('# ...')
    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)
    # BUG FIX: make_svm is defined inside class SVM, not at module level —
    # the bare call `make_svm(data)` raised NameError. Call it on the class.
    SVM.make_svm(data)
    print('# ending svm')