161 lines
5.2 KiB
Python
161 lines
5.2 KiB
Python
'''
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
|
|
|
|
from BagOfWords import BagOfWords
|
|
|
|
import csv
|
|
|
|
import pandas as pd
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
from sklearn.feature_selection import SelectPercentile
|
|
from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer, accuracy_score
|
|
from sklearn.model_selection import StratifiedKFold
|
|
from sklearn.model_selection import GridSearchCV
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.svm import LinearSVC
|
|
from sklearn.svm import SVC
|
|
from sklearn.svm import NuSVC
|
|
|
|
class SVM_multiclass:
    '''Linear SVM text classifier evaluated with stratified 10-fold
    cross-validation.
    '''

    @staticmethod
    def make_svm(dataset, sklearn_cv=True, percentile=100):
        '''Train and evaluate a LinearSVC on every cross-validation fold.

        Title and text of each article are joined into one document,
        turned into a bag-of-words matrix, reduced with SelectPercentile
        and classified. Per-fold metrics are printed, followed by the
        min / max / average over all folds.

        Parameters:
            dataset: pandas DataFrame providing the columns 'Title',
                'Text' and 'Label'.
            sklearn_cv: if True, vectorize with sklearn's CountVectorizer
                (stop words and company names are ignored); otherwise use
                the project's own BagOfWords implementation.
            percentile: percentage of features kept by SelectPercentile
                (100 keeps all features).

        Returns:
            Tuple (recall_scores, precision_scores): two lists holding the
            weighted recall / weighted precision of each fold.
        '''
        print('# starting multinomial svm')
        print('# ...')

        # split data into text and label set; title and text are joined
        # into one document per article
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            # ignore company names.
            # list.extend() works in place and returns None, so the list
            # has to be built first and extended in a separate statement;
            # chaining both handed stop_words=None to the vectorizer.
            company_names_list = BagOfWords.load_company_names()
            stopwords = list(BagOfWords.set_stop_words())
            stopwords.extend(company_names_list)
            cv = CountVectorizer(stop_words=stopwords)

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        # LinearSVC offers no predict_proba; SVC(probability=True,
        # gamma='auto') would be needed for class probabilities
        classifier = LinearSVC()

        # metrics, one entry per fold
        recall_scores = []
        precision_scores = []
        accuracy_scores = []
        f1_scores = []

        for n, (train, test) in enumerate(skf.split(X, y), start=1):
            print('# split no. ' + str(n))

            # skf.split() yields positional indices, so select by position
            # (.iloc); this is correct even if the index was not reset
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y.iloc[train], y.iloc[test]

            if sklearn_cv:
                # use sklearn CountVectorizer:
                # fit on the training data only, then reuse the learned
                # vocabulary to transform the testing data
                training_data = cv.fit_transform(X_train, y_train).toarray()
                testing_data = cv.transform(X_test).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X_train)
                vocab = BagOfWords.make_vocab(extracted_words)

                # training matrix over the training vocabulary
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq,
                                                       stemming)
                # testing matrix over the same (training) vocabulary
                extracted_words = BagOfWords.extract_all_words(X_test)
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq,
                                                      stemming)

            # apply select percentile, fitted on the training fold only
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y_train)

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier and predict the held-out fold
            classifier.fit(training_data_r, y_train)
            predictions_test = classifier.predict(testing_data_r)

            # print and store metrics
            rec = recall_score(y_test, predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y_test, predictions_test,
                                   average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # the original called recall_score here; weighted recall is
            # numerically equal to accuracy, accuracy_score states intent
            acc = accuracy_score(y_test, predictions_test)
            accuracy_scores.append(acc)
            print('acc: ' + str(acc))
            print('#')
            # harmonic mean of precision and recall; defined as 0.0 when
            # both are zero to avoid a ZeroDivisionError
            denominator = prec + rec
            if denominator:
                f1_scores.append(2 * (prec * rec) / denominator)
            else:
                f1_scores.append(0.0)

        ##########################
        print('Recall (Min): ' + str(min(recall_scores)))
        print('Recall (Max): ' + str(max(recall_scores)))
        print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
        print()
        print('Precision (Min): ' + str(min(precision_scores)))
        print('Precision (Max): ' + str(max(precision_scores)))
        print('Precision (Average) :' + str(sum(precision_scores)/len(precision_scores)))
        print()
        print('Accuracy (Min): ' + str(min(accuracy_scores)))
        print('Accuracy (Max): ' + str(max(accuracy_scores)))
        print('Accuracy (Average) :' + str(sum(accuracy_scores)/len(accuracy_scores)))

        # return the per-fold recall and precision scores
        return recall_scores, precision_scores
|
|
|
|
if __name__ == '__main__':
    # Script entry point: load the labeled articles and run the SVM
    # evaluation with the sklearn vectorizer.

    print('# reading dataset')
    print('# ...')

    # parsing options for the current data set (pipe-separated csv)
    read_options = dict(
        sep='|',
        usecols=range(1,13), # drop first column 'unnamed'
        encoding='utf-8',
        quoting=csv.QUOTE_NONNUMERIC,
        quotechar='\'',
    )
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     **read_options)

    # select only labeled articles (rows whose label is not -1) and
    # renumber the rows from 0
    is_labeled = df['Label'] != -1
    labeled = df.loc[is_labeled].reset_index(drop=True)

    SVM_multiclass.make_svm(labeled, sklearn_cv=True)