83 lines
2.5 KiB
Python
83 lines
2.5 KiB
Python
'''
|
|
SVM Classifier for Interactive Labeling
|
|
=======================================
|
|
|
|
Trains an SVM on the labeled data and returns the class probabilities for the unlabeled data, as needed for interactive labeling.
|
|
'''
|
|
from BagOfWords import BagOfWords
|
|
|
|
import pandas as pd
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
from sklearn.feature_selection import SelectPercentile
|
|
from sklearn.metrics import recall_score, precision_score
|
|
from sklearn.model_selection import StratifiedKFold
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
from sklearn.svm import SVC
|
|
|
|
class SVMInteractive:
    """SVM classifier that yields class probabilities for interactive labeling."""

    @staticmethod
    def estimate_svm(labeled_data, unlabeled_data, sklearn_cv=True):
        """Fit an SVM on the labeled data and score the unlabeled data.

        The 'Title' and 'Text' entries of each data set are joined with
        '. ' and turned into a bag-of-words matrix, either with sklearn's
        CountVectorizer or with the project's own BagOfWords implementation.

        Args:
            labeled_data: table-like object indexable by the column names
                'Title', 'Text' and 'Label' (presumably a pandas DataFrame
                -- confirm against the caller).
            unlabeled_data: table-like object indexable by 'Title' and
                'Text'. A 'Label' column is not required.
            sklearn_cv: if True, vectorize with sklearn's CountVectorizer;
                otherwise use BagOfWords (stemming on, absolute counts).

        Returns:
            A tuple ``(classes, class_probs)``: ``classes`` is
            ``classifier.classes_`` and ``class_probs`` is the result of
            ``classifier.predict_proba`` on the unlabeled data.
        """
        print('# SVM: starting interactive SVM...')
        print()

        # labeled set: join title and text, keep the labels as target
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        # unlabeled set: join title and text (labels are not needed here)
        U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']

        # probability=True is required so that predict_proba is available
        classifier = SVC(probability=True,
                         gamma='auto')

        if sklearn_cv:
            # use sklearn CountVectorizer
            cv = CountVectorizer()
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X, y).toarray()
            # transform testing data with the vocabulary learned above
            testing_data = cv.transform(U).toarray()
        else:
            # use the project's own BagOfWords python implementation
            stemming = True
            rel_freq = False
            extracted_words = BagOfWords.extract_all_words(X)
            # the vocabulary is built from the labeled texts only
            vocab = BagOfWords.make_vocab(extracted_words)

            # fit the training data and then return the matrix
            print('# SVM: fit training data and calculate matrix...')
            print()
            training_data = BagOfWords.make_matrix(extracted_words,
                                                   vocab, rel_freq, stemming)

            # transform testing data and return the matrix
            print('# SVM: transform testing data to matrix...')
            print()
            extracted_words = BagOfWords.extract_all_words(U)
            testing_data = BagOfWords.make_matrix(extracted_words,
                                                  vocab, rel_freq, stemming)

        # fit classifier
        classifier.fit(training_data, y)

        # probability estimates for the test vector (testing_data)
        class_probs = classifier.predict_proba(testing_data)

        # classes in order used
        classes = classifier.classes_

        print('# ending SVM')

        # return classes and vector of class estimates
        return classes, class_probs