'''
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
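
# Illustrative sketch only (kept as comments, not executed, and not part of the
# pipeline below): with a fitted linear SVC, the sign of the distance to the
# separating hyperplane decides the category. X_train, y_train and X_new are
# placeholder names, not variables defined in this module.
#
#   clf = SVC(kernel='linear').fit(X_train, y_train)  # learn the widest-gap hyperplane
#   side = clf.decision_function(X_new)               # signed distance to the hyperplane
#   label = clf.predict(X_new)                        # category depends on the sign of 'side'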

from BagOfWords import BagOfWords

import csv

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


class SVM:

    @staticmethod
    def make_svm(dataset):

        print('# fitting model')
        print('# ...')

        # split data into text and label set

        # articles' text (title + text)
        X = dataset['Title'] + ' ' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        # Bag of Words
        print('# calculating bag of words')
        print('# ...')
        # fit the training data and then return the matrix
        #X = BagOfWords.fit_transform(X)
        X = CountVectorizer().fit_transform(X).toarray()
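        # note: CountVectorizer builds a document-term matrix of raw token
        # counts; .toarray() densifies the sparse result, which is convenient
        # here but memory-hungry for large corpora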

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)
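        # note: stratification keeps the 'merger' / 'not merger' proportions
        # roughly equal in every fold; a random_state can be passed for
        # reproducible shuffling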

        # use only most important features
        selector = SelectPercentile()
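        # note: SelectPercentile scores features with ANOVA F-values (f_classif)
        # by default and keeps only the top percentile, tuned in the grid below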

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
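        # note: chaining selector and classifier in a Pipeline means the
        # feature selection is re-fit on each training fold during
        # cross-validation, so no information leaks from the validation folds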

        grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75],
                                       'SVC__kernel': ['linear'],
                                       'SVC__gamma': [0.00001, 0.0001],
                                       'SVC__C': [0.1, 1]},
                            cv=skf,
                            scoring=make_scorer(f1_score))
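        # note: gamma has no effect with the linear kernel, so the two gamma
        # values yield identical models; the grid effectively explores
        # 2 percentiles x 2 values of C, each evaluated on the 10 folds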

        print('# fit classifier')
        print('# ...')

        grid.fit(X, y)

        # cross-validation results (a dict of arrays; can be wrapped in a DataFrame)
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()


if __name__ == '__main__':

    print('# starting svm')
    print('# ...')

    file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')

    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)
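    # note: the corpus is assumed to be pipe-separated; QUOTE_NONE leaves any
    # quote characters inside the article text untouched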

    SVM.make_svm(data)

    print('# ending svm')