'''
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
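
# Illustrative sketch only (kept as comments, not executed, and not part of the
# pipeline below): with a fitted linear SVC, the sign of the distance to the
# separating hyperplane decides the category. X_train, y_train and X_new are
# placeholder names, not variables defined in this module.
#
#   clf = SVC(kernel='linear').fit(X_train, y_train)  # learn the widest-gap hyperplane
#   side = clf.decision_function(X_new)               # signed distance to the hyperplane
#   label = clf.predict(X_new)                        # category depends on the sign of 'side'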

from BagOfWords import BagOfWords

import csv

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


class SVM:

    @staticmethod
    def make_svm(dataset):

        print('# fitting model')
        print('# ...')

        # split data into text and label set

        # articles' text (title + text)
        X = dataset['Title'] + ' ' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        # Bag of Words
        print('# calculating bag of words')
        print('# ...')
        # fit the training data and then return the matrix
        #X = BagOfWords.fit_transform(X)
        X = CountVectorizer().fit_transform(X).toarray()
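        # note: CountVectorizer builds a document-term matrix of raw token
        # counts; .toarray() densifies the sparse result, which is convenient
        # here but memory-hungry for large corpora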

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)
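        # note: stratification keeps the 'merger' / 'not merger' proportions
        # roughly equal in every fold; a random_state can be passed for
        # reproducible shuffling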

        # use only most important features
        selector = SelectPercentile()
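        # note: SelectPercentile scores features with ANOVA F-values (f_classif)
        # by default and keeps only the top percentile, tuned in the grid below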

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
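        # note: chaining selector and classifier in a Pipeline means the
        # feature selection is re-fit on each training fold during
        # cross-validation, so no information leaks from the validation folds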

        grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75],
                                       'SVC__kernel': ['linear'],
                                       'SVC__gamma': [0.00001, 0.0001],
                                       'SVC__C': [0.1, 1]},
                            cv=skf,
                            scoring=make_scorer(f1_score))
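        # note: gamma has no effect with the linear kernel, so the two gamma
        # values yield identical models; the grid effectively explores
        # 2 percentiles x 2 values of C, each evaluated on the 10 folds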

        print('# fit classifier')
        print('# ...')

        grid.fit(X, y)

        # cross-validation results (a dict of arrays; can be wrapped in a DataFrame)
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()


if __name__ == '__main__':

    print('# starting svm')
    print('# ...')

    file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')

    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)
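    # note: the corpus is assumed to be pipe-separated; QUOTE_NONE leaves any
    # quote characters inside the article text untouched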

    SVM.make_svm(data)

    print('# ending svm')