'''
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that
assigns the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier. An SVM model is
a representation of the samples as points in space, mapped so that the
examples of the separate categories are divided by a clear gap that is
as wide as possible. New samples are then mapped into that same space
and predicted to belong to a category based on which side of the gap
they fall.
'''
from BagOfWords import BagOfWords

import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


class SVM_multiclass_grid:

    def make_svm(dataset, sklearn_cv=True):
        '''Fit a grid-searched SVM on the labeled articles and print CV results.

        Vectorizes title + text into a bag-of-words matrix, then runs a
        GridSearchCV (linear SVC, feature-percentile selection) with
        stratified 10-fold CV and prints the mean/best scores and the
        best parameter set.

        :param dataset:    DataFrame with 'Title', 'Text' and 'Label' columns
        :param sklearn_cv: if True use sklearn's CountVectorizer,
                           otherwise the project's own BagOfWords implementation
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set:
        # articles' text (title + text)
        X = dataset['Title'] + '. \n' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        matrix = pd.DataFrame()

        # fit the training data and then return the matrix
        if sklearn_cv:
            # use sklearn CountVectorizer
            company_names_list = BagOfWords.load_company_names()
            # BUG FIX: list.extend() mutates in place and returns None, so
            # 'list(...).extend(...)' always left stopwords as None and the
            # company names were never excluded; concatenate the lists instead
            stopwords = list(BagOfWords.set_stop_words()) + company_names_list
            matrix = CountVectorizer(stop_words=stopwords)\
                .fit_transform(X).toarray()
        else:
            # use own BOW implementation
            matrix = BagOfWords.fit_transform(X)

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # use only most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

        grid = GridSearchCV(pipeline,
                            {'perc__percentile': [50, 75, 100],
                             'SVC__kernel': ['linear'],
                             'SVC__gamma': [0.000001, 0.00001],
                             'SVC__C': [0.01, 0.1]},
                            cv=skf,
                            scoring=make_scorer(recall_score,
                                                average='micro'))

        print('# fit classifier')
        print('# ...')

        grid.fit(matrix, y)

        # DataFrame of results
        df_results = grid.cv_results_

        # print results ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])
              / len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()


if __name__ == '__main__':
    print('# starting svm')
    print('# ...')

    # read current data set from csv
    print('# reading dataset')
    print('# ...')
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1, 13),  # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # keep only the samples that have already been labeled
    data = df.loc[df['Label'] != -1].reset_index(drop=True)

    use_count_vectorizer = True

    # BUG FIX: make_svm lives in the class namespace, so the bare name
    # raised NameError here; call it through the class instead
    SVM_multiclass_grid.make_svm(data, use_count_vectorizer)

    print('# ending svm')