'''
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that
assigns test samples to one of two categories ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier. An SVM model
represents the samples as points in space, mapped so that the examples of
the separate categories are divided by a clear gap that is as wide as
possible. New samples are then mapped into the same space and predicted to
belong to a category based on which side of the gap they fall on.
'''
from BagOfWords import BagOfWords

import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score, f1_score,\
    make_scorer, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
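
# ---------------------------------------------------------------------------
# Illustration only: a minimal, self-contained sketch of the 'widest gap'
# idea described in the docstring above. It is not part of the pipeline and
# is never called; the toy points and labels are invented for demonstration.
# ---------------------------------------------------------------------------
def _svm_gap_demo():
    # two linearly separable clusters of 2-d points
    X_toy = [[0.0, 0.0], [0.0, 1.0], [3.0, 3.0], [3.0, 4.0]]
    y_toy = ['not merger', 'not merger', 'merger', 'merger']
    clf = LinearSVC()
    # the fitted hyperplane separates the two clusters with a maximal margin
    clf.fit(X_toy, y_toy)
    # a new sample is assigned to the class on its side of the gap;
    # here it should fall on the 'merger' side
    print(clf.predict([[2.8, 3.5]]))
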

class SVM_multiclass:

    @staticmethod
    def make_svm(dataset, sklearn_cv=True, percentile=100):

        print('# starting multinomial svm')
        print('# ...')

        # split the data set into text and label column,
        # joining title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            # ignore company names
            company_names_list = BagOfWords.load_company_names()
            stopwords = list(BagOfWords.set_stop_words()) + company_names_list
            cv = CountVectorizer(stop_words=stopwords)

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        classifier = LinearSVC()
        # for predict_proba:
        # classifier = SVC(probability=True,
        #                  gamma='auto')

        # metrics
        recall_scores = []
        precision_scores = []
        accuracy_scores = []
        f1_scores = []

        # for each fold
        n = 0
        for train, test in skf.split(X, y):
            n += 1
            print('# split no. ' + str(n))

            if sklearn_cv:
                # use sklearn CountVectorizer:
                # fit the training data and return the matrix
                training_data = cv.fit_transform(X[train]).toarray()
                # transform the testing data and return the matrix
                testing_data = cv.transform(X[test]).toarray()
            else:
                # use the project's own BagOfWords implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X[train])
                vocab = BagOfWords.make_vocab(extracted_words)
                # fit the training data and return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq,
                                                       stemming)
                # transform the testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq,
                                                      stemming)

            # apply feature selection by percentile
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])

            # reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit the classifier
            classifier.fit(training_data_r, y[train])

            # predict the classes
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # print and store metrics
            rec = recall_score(y[test], predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test,
                                   average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            acc = accuracy_score(y[test], predictions_test)
            accuracy_scores.append(acc)
            print('acc: ' + str(acc))
            print('#')
            # equation for the f1 score
            f1_scores.append(2 * (prec * rec) / (prec + rec))
            # not available with LinearSVC (class_prior_ and class_count_
            # come from the Naive Bayes classifiers, predict_proba from
            # SVC(probability=True)):
            # class_prob.append(classifier.class_prior_)
            # class_counts.append(classifier.class_count_)
            # print(classifier.predict_proba(testing_data_r))

        ##########################

        # classes in the order used by the classifier
        classes = classifier.classes_

        print('Recall (Min): ' + str(min(recall_scores)))
        print('Recall (Max): ' + str(max(recall_scores)))
        print('Recall (Average): '
              + str(sum(recall_scores) / len(recall_scores)))
        print()
        print('Precision (Min): ' + str(min(precision_scores)))
        print('Precision (Max): ' + str(max(precision_scores)))
        print('Precision (Average): '
              + str(sum(precision_scores) / len(precision_scores)))
        print()
        print('Accuracy (Min): ' + str(min(accuracy_scores)))
        print('Accuracy (Max): ' + str(max(accuracy_scores)))
        print('Accuracy (Average): '
              + str(sum(accuracy_scores) / len(accuracy_scores)))

        # return the recall and precision scores of each fold
        return recall_scores, precision_scores


if __name__ == '__main__':

    # read csv file
    print('# reading dataset')
    print('# ...')

    # read the current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1, 13),  # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # train and evaluate on the labeled articles only
    SVM_multiclass.make_svm(df.loc[df['Label'] != -1].reset_index(drop=True),
                            sklearn_cv=True)
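
# ---------------------------------------------------------------------------
# Hedged sketch: Pipeline and GridSearchCV are imported above but never used.
# The function below shows one way they could be combined to tune the
# regularization strength C of LinearSVC on the same data. The function name,
# the parameter grid and the weighted-F1 scoring are assumptions made for
# illustration, not part of the original pipeline; nothing calls it.
# ---------------------------------------------------------------------------
def grid_search_svm(dataset):
    X = dataset['Title'] + '. ' + dataset['Text']
    y = dataset['Label']
    pipeline = Pipeline([('vectorizer', CountVectorizer()),
                         ('classifier', LinearSVC())])
    grid = GridSearchCV(pipeline,
                        param_grid={'classifier__C': [0.01, 0.1, 1.0, 10.0]},
                        scoring=make_scorer(f1_score, average='weighted'),
                        cv=StratifiedKFold(n_splits=10, shuffle=True,
                                           random_state=5))
    grid.fit(X, y)
    print('best parameters: ' + str(grid.best_params_))
    return grid.best_estimator_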