interactive labeling process

This commit is contained in:
annealias 2019-01-24 17:44:44 +01:00
parent 6471a81196
commit d4b0de35d4
10 changed files with 24456 additions and 56352 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@@ -121,12 +121,6 @@ class BagOfWords:
else:
# absolute word frequency
df_matrix.loc[i][v] += 1
# size too large :-(
# # save df_matrix object
# with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
# pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)
return df_matrix
def make_vocab(extracted_words, stemming=True):
@@ -290,3 +284,6 @@ class BagOfWords:
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
vocab = BagOfWords.make_vocab(extracted_words, stemming)
print(len(vocab))
if __name__ == '__main__':
BagOfWords.test()

View File

@@ -9,6 +9,9 @@ from BagOfWords import BagOfWords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
class MNBInteractive:
@@ -17,7 +20,7 @@ class MNBInteractive:
However, in practice, fractional counts such as tf-idf may also work.
'''
def make_nb(labeled_data, unlabeled_data, sklearn_cv=False):
def estimate_mnb(labeled_data, unlabeled_data, sklearn_cv=False):
'''fits naive bayes model
'''
@@ -97,3 +100,112 @@ class MNBInteractive:
# return classes and vector of class estimates
return classes, class_count, class_probs
def measure_mnb(X, y, sklearn_cv=False, percentile=100):
    '''Evaluates a multinomial naive Bayes classifier with stratified
    2-fold cross-validation and prints per-class precision, recall
    and F1 for the test folds.

    :param X: indexable collection of raw text samples (indexed with
              the integer arrays produced by StratifiedKFold)
    :param y: class labels aligned with X
    :param sklearn_cv: if True use sklearn's CountVectorizer, otherwise
                       the project's own BagOfWords implementation
    :param percentile: percent of features kept by SelectPercentile
    :return: dict with per-fold 'recall', 'precision' and 'f1' lists
             (one sequence of per-class scores per fold)
    '''

    def summarize(scores):
        '''Elementwise (per-class) min / max / mean across folds.

        recall_score/precision_score are called with average=None and
        therefore return one value PER CLASS.  Calling min()/max()
        directly on a list of such arrays raises
        "ValueError: The truth value of an array ... is ambiguous",
        so the folds have to be aggregated class by class instead.
        '''
        per_class = list(zip(*scores))
        mins = [min(c) for c in per_class]
        maxs = [max(c) for c in per_class]
        means = [sum(c) / float(len(c)) for c in per_class]
        return mins, maxs, means

    print('# fitting model')
    print('# ...')

    if sklearn_cv:
        cv = CountVectorizer()

    # stratified k-fold keeps the class ratio constant in every split
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=5)

    # near-zero alpha => effectively no Laplace smoothing;
    # fit_prior=False + class_prior=None => uniform class prior
    classifier = MultinomialNB(alpha=1.0e-10,
                               fit_prior=False,
                               class_prior=None)

    # per-fold metrics: one entry per fold, one value per class
    recall_scores = []
    precision_scores = []
    f1_scores = []

    n = 0
    for train, test in skf.split(X, y):
        n += 1
        print('# split no. ' + str(n))

        if sklearn_cv:
            # sklearn CountVectorizer: learn the vocabulary on the
            # training fold only, then project the held-out fold onto it
            training_data = cv.fit_transform(X[train], y[train]).toarray()
            testing_data = cv.transform(X[test]).toarray()
        else:
            # project's own BagOfWords python implementation
            stemming = True
            rel_freq = True
            extracted_words = BagOfWords.extract_all_words(X[train])
            vocab = BagOfWords.make_vocab(extracted_words)
            training_data = BagOfWords.make_matrix(extracted_words,
                                                   vocab, rel_freq, stemming)
            extracted_words = BagOfWords.extract_all_words(X[test])
            testing_data = BagOfWords.make_matrix(extracted_words,
                                                  vocab, rel_freq, stemming)

        # keep only the best `percentile` percent of the features,
        # selected on the training fold
        selector = SelectPercentile(percentile=percentile)
        selector.fit(training_data, y[train])
        training_data_r = selector.transform(training_data)
        testing_data_r = selector.transform(testing_data)

        # fit classifier and predict the held-out fold
        classifier.fit(training_data_r, y[train])
        predictions_test = classifier.predict(testing_data_r)

        # per-class metrics (average=None -> one score per class)
        rec = recall_score(y[test], predictions_test, average=None)
        print('rec: ' + str(rec))
        recall_scores.append(rec)

        prec = precision_score(y[test], predictions_test, average=None)
        print('prec: ' + str(prec))
        print('#')
        precision_scores.append(prec)

        # harmonic mean of precision and recall; guard the classes
        # where prec + rec == 0, which would divide by zero
        f1_scores.append([0.0 if (p + r) == 0 else 2 * p * r / (p + r)
                          for p, r in zip(prec, rec)])

    ##########################
    # print metrics of test set
    print('-------------------------')
    print('prediction of testing set:')
    for label, scores in (('Precision', precision_scores),
                          ('Recall', recall_scores),
                          ('F1', f1_scores)):
        mins, maxs, means = summarize(scores)
        print('{} score: min = {}, max = {}, average = {}'
              .format(label, mins, maxs, means))

    # return the collected per-fold scores so callers can do more
    # than read the printout (original returned None implicitly)
    return {'recall': recall_scores,
            'precision': precision_scores,
            'f1': f1_scores}

View File

@@ -208,14 +208,21 @@ class NER:
'Russell Investments','Royal London Asset Management','Conservative party','Blom Bank','Banco Santander',
'Guardian Money','Financial Services Agency','Munich Re','Banca Popolare di Vicenza','SoftBank', 'Sberbank',
'Financial Conduct Authority','Qatar National Bank','Welt am Sonntag','Sueddeutsche Zeitung','Der Spiegel',
'Bank of England', 'Bank of America Merrill Lynch', 'Barclays', 'London Metal Exchange', 'EMEA', 'G20',
'Bank of England', 'Bank of America Merrill Lynch', 'Barclays', 'London Metal Exchange', 'EMEA', 'G20', 'The'
'Petroleum Exporting Countries', 'Facebook Twitter Pinterest', 'Moody', 'Allianz', 'Citi', 'Bank', 'CME',
'JPMorgan Chase &', 'Trade Alert', 'Abu Dhabi', 'MILAN', 'Journal', 'MSCI', 'KKR', 'CNBC', 'Feb', 'OECD',
'Gulf Cooperation Council', 'Societe Generale', 'Takata', 'SEC', 'Republican', 'Energy Information Administration',
'Organization of the Petroleum Exporting Countries', 'CBOE', 'LME', 'BOJ', 'BlackRock', 'Banco Popular',
'United Nations', 'CET STOCKS Latest Previo Daily Change', 'Citibank', 'International Energy Agency',
'Confederation of British Industry', 'American Petroleum Institute', 'Deutsche', 'United', 'Pentagon',
'Southern District of New York']
'United Nations', 'CET STOCKS Latest Previo Daily Change', 'Citibank', 'International Energy Agency', 'Office',
'Confederation of British Industry', 'American Petroleum Institute', 'Deutsche', 'United', 'Pentagon', 'Lehman',
'Southern District of New York', 'City Index', 'Hong Kong China Enterprises Index', 'Fitch Ratings Espana',
'EIKON', 'First Capital Equities ( Pvt )', 'China Securities Journal', 'English Premier League', 'Allfunds Bank',
'Bank Indonesia', 'Hong Kong Exchanges and Clearing', 'Fitch ) Fitch Ratings', 'University of Delaware',
'University of British Columbia', 'Abu Dhabi Investment Authority', 'Bill & Melinda Gates Foundation',
'Gates Foundation', 'Allfunds Bank', 'Bank Indonesia', 'Swedbank', 'Handelsbanken', 'Al Rajhi Bank', 'SAO PAULO',
'National Weather Service', 'Clydesdale Bank', 'First Republic Bank', 'Tesco Bank', 'Alpha Bank', 'Bank of Spain',
'Transatlantic Trade and Investment Partnership', 'Raiffeisen Bank International', 'Deutsche Boerse CEO',
'Capital Bank', 'National Crime Agency', 'TD Bank']
for k, v in dict.items():
for org in black_list: