'''
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'), making it a
non-probabilistic binary linear classifier. An SVM model is a representation
of the samples as points in space, mapped so that the examples of the separate
categories are divided by a clear gap that is as wide as possible. New samples
are then mapped into that same space and predicted to belong to a category
based on which side of the gap they fall.
'''

import csv
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from BagOfWords import BagOfWords


class SVM:
    '''Namespace for the SVM grid-search experiment on labelled articles.'''

    @staticmethod
    def make_svm(dataset, sklearn_cv=True):
        '''Grid-search an SVM over a bag-of-words matrix and print the scores.

        The title and text of every article are concatenated, turned into a
        document-term matrix, and fed to a ``SelectPercentile`` -> ``SVC``
        pipeline that is tuned with ``GridSearchCV`` using shuffled,
        stratified 10-fold cross-validation and the F1 score.

        Args:
            dataset: table-like object providing the columns 'Title', 'Text'
                and 'Label' (the script below passes a pandas DataFrame).
            sklearn_cv: if True, build the matrix with scikit-learn's
                ``CountVectorizer``; otherwise use the project's own
                ``BagOfWords.fit_transform``.

        Returns:
            None. All results are written to stdout.
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set
        # articles' text (title + text)
        X = dataset['Title'] + '. ' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        # build the document-term matrix from the full corpus
        if sklearn_cv:
            # use sklearn CountVectorizer (densified with toarray())
            matrix = CountVectorizer().fit_transform(X).toarray()
        else:
            # use own BOW implementation
            matrix = BagOfWords.fit_transform(X)

        # use stratified k-fold cross-validation as split method;
        # no random_state is set, so the folds differ between runs
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # use only most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

        # NOTE(review): per the scikit-learn docs gamma is the coefficient of
        # the 'rbf', 'poly' and 'sigmoid' kernels, so with kernel='linear' the
        # two gamma values presumably just duplicate candidates. The grid is
        # kept unchanged so reported results stay comparable.
        param_grid = {
            'perc__percentile': [50, 75],
            'SVC__kernel': ['linear'],
            'SVC__gamma': [0.00001, 0.0001],
            'SVC__C': [0.1, 1],
        }
        grid = GridSearchCV(pipeline,
                            param_grid,
                            cv=skf,
                            scoring=make_scorer(f1_score))

        print('# fit classifier')
        print('# ...')

        grid.fit(matrix, y)

        # cv_results_ is a dict of per-candidate result arrays
        results = grid.cv_results_
        mean_scores = results['mean_test_score']

        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(mean_scores)
        print('')
        print('mean of means:')
        print(sum(mean_scores) / len(mean_scores))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()


if __name__ == '__main__':

    print('# starting svm')
    print('# ...')

    # assemble the path from its parts so it resolves on POSIX and Windows
    dataset_path = Path('..') / 'data' / 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')

    data = pd.read_csv(dataset_path,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)

    use_count_vectorizer = True
    # make_svm lives on the SVM class, so it must be called through it
    SVM.make_svm(data, use_count_vectorizer)

    print('# ending svm')