# thesis-anne/src/SVM_multiclass_grid.py
# (123 lines, 3.4 KiB, Python)

'''
Support Vector Machines (SVM) Classifier
========================================
The SVM training algorithm builds a model from the training data that assigns
each test sample to one of the label categories,
making it a non-probabilistic linear classifier (this variant supports
more than the two original classes 'merger' / 'not merger').
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
from BagOfWords import BagOfWords
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
class SVM_multiclass_grid:
    '''Grid-search wrapper around an SVM text classifier.

    Builds a document-term matrix from news articles (title + text),
    then tunes an SVC — preceded by percentile-based feature selection —
    with a stratified 10-fold cross-validated grid search and prints the
    cross-validation scores and the best parameter set.
    '''

    @staticmethod
    def make_svm(dataset, sklearn_cv=True):
        '''Fit the grid-searched SVM and print its evaluation results.

        :param dataset: DataFrame with 'Title', 'Text' and 'Label' columns
        :param sklearn_cv: if True use sklearn's CountVectorizer for the
                           bag-of-words matrix, otherwise the project's
                           own BagOfWords implementation
        '''
        print('# fitting model')
        print('# ...')
        # split data into text and label set:
        # articles' text (title + text) ...
        X = dataset['Title'] + '. ' + dataset['Text']
        # ... and articles' labels
        y = dataset['Label']
        # fit the training data and build the document-term matrix
        if sklearn_cv:
            # use sklearn CountVectorizer; exclude generic stop words
            # AND company names from the vocabulary
            company_names_list = BagOfWords.load_company_names()
            # BUG FIX: list.extend() returns None, so the previous
            # 'list(...).extend(company_names_list)' assigned None to
            # stopwords and silently disabled all stop-word filtering.
            # Concatenate the two lists instead.
            stopwords = list(BagOfWords.set_stop_words()) + company_names_list
            matrix = CountVectorizer(stop_words=stopwords).fit_transform(X).toarray()
        else:
            # use own BOW implementation
            matrix = BagOfWords.fit_transform(X)
        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)
        # keep only the most important features (percentile tuned below)
        selector = SelectPercentile()
        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
        grid = GridSearchCV(pipeline,
                            {'perc__percentile': [50, 75, 100],
                             'SVC__kernel': ['linear'],
                             'SVC__gamma': [0.000001, 0.00001],
                             'SVC__C': [0.01, 0.1]},
                            cv=skf,
                            scoring=make_scorer(recall_score, average='micro'))
        print('# fit classifier')
        print('# ...')
        grid.fit(matrix, y)
        # per-candidate cross-validation results (dict of arrays)
        df_results = grid.cv_results_
        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('')
        print('mean of means:')
        print(sum(df_results['mean_test_score']) / len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()
if __name__ == '__main__':
    print('# starting svm')
    print('# ...')
    # read current data set from csv
    print('# reading dataset')
    print('# ...')
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1, 13),  # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')
    # keep only rows that were already labeled (Label != -1)
    data = df.loc[df['Label'] != -1].reset_index(drop=True)
    use_count_vectorizer = True
    # BUG FIX: make_svm is defined inside the SVM_multiclass_grid class,
    # so the previous bare call 'make_svm(...)' raised a NameError at
    # runtime; qualify the call with the class name.
    SVM_multiclass_grid.make_svm(data, use_count_vectorizer)
    print('# ending svm')