# thesis-anne/src/SVM_multiclass_grid.py
# (123 lines, 3.4 KiB, Python)

'''
Support Vector Machines (SVM) Classifier
========================================
The SVM training algorithm builds a model from the training data that assigns
each test sample to one of the label categories,
making it a non-probabilistic linear classifier (this variant supports
more than the two original classes 'merger' / 'not merger').
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
from BagOfWords import BagOfWords
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
class SVM_multiclass_grid:
    '''Grid-search wrapper around an SVM text classifier.

    Builds a document-term matrix from news articles (title + text),
    then tunes an SVC — preceded by percentile-based feature selection —
    with a stratified 10-fold cross-validated grid search and prints the
    cross-validation scores and the best parameter set.
    '''

    @staticmethod
    def make_svm(dataset, sklearn_cv=True):
        '''Fit the grid-searched SVM and print its evaluation results.

        :param dataset: DataFrame with 'Title', 'Text' and 'Label' columns
        :param sklearn_cv: if True use sklearn's CountVectorizer for the
                           bag-of-words matrix, otherwise the project's
                           own BagOfWords implementation
        '''
        print('# fitting model')
        print('# ...')
        # split data into text and label set:
        # articles' text (title + text) ...
        X = dataset['Title'] + '. ' + dataset['Text']
        # ... and articles' labels
        y = dataset['Label']
        # fit the training data and build the document-term matrix
        if sklearn_cv:
            # use sklearn CountVectorizer; exclude generic stop words
            # AND company names from the vocabulary
            company_names_list = BagOfWords.load_company_names()
            # BUG FIX: list.extend() returns None, so the previous
            # 'list(...).extend(company_names_list)' assigned None to
            # stopwords and silently disabled all stop-word filtering.
            # Concatenate the two lists instead.
            stopwords = list(BagOfWords.set_stop_words()) + company_names_list
            matrix = CountVectorizer(stop_words=stopwords).fit_transform(X).toarray()
        else:
            # use own BOW implementation
            matrix = BagOfWords.fit_transform(X)
        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)
        # keep only the most important features (percentile tuned below)
        selector = SelectPercentile()
        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
        grid = GridSearchCV(pipeline,
                            {'perc__percentile': [50, 75, 100],
                             'SVC__kernel': ['linear'],
                             'SVC__gamma': [0.000001, 0.00001],
                             'SVC__C': [0.01, 0.1]},
                            cv=skf,
                            scoring=make_scorer(recall_score, average='micro'))
        print('# fit classifier')
        print('# ...')
        grid.fit(matrix, y)
        # per-candidate cross-validation results (dict of arrays)
        df_results = grid.cv_results_
        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('')
        print('mean of means:')
        print(sum(df_results['mean_test_score']) / len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()
if __name__ == '__main__':
    print('# starting svm')
    print('# ...')
    # read current data set from csv
    print('# reading dataset')
    print('# ...')
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1, 13),  # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')
    # keep only rows that were already labeled (Label != -1)
    data = df.loc[df['Label'] != -1].reset_index(drop=True)
    use_count_vectorizer = True
    # BUG FIX: make_svm is defined inside the SVM_multiclass_grid class,
    # so the previous bare call 'make_svm(...)' raised a NameError at
    # runtime; qualify the call with the class name.
    SVM_multiclass_grid.make_svm(data, use_count_vectorizer)
    print('# ending svm')