interactive labeling process
This commit is contained in:
parent
6471a81196
commit
d4b0de35d4
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
@ -121,12 +121,6 @@ class BagOfWords:
|
|||
else:
|
||||
# absolute word frequency
|
||||
df_matrix.loc[i][v] += 1
|
||||
|
||||
# size too large :-(
|
||||
# # save df_matrix object
|
||||
# with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
|
||||
# pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
return df_matrix
|
||||
|
||||
def make_vocab(extracted_words, stemming=True):
|
||||
|
@ -290,3 +284,6 @@ class BagOfWords:
|
|||
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
|
||||
vocab = BagOfWords.make_vocab(extracted_words, stemming)
|
||||
print(len(vocab))
|
||||
|
||||
if __name__ == '__main__':
|
||||
BagOfWords.test()
|
|
@ -9,6 +9,9 @@ from BagOfWords import BagOfWords
|
|||
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_selection import SelectPercentile
|
||||
from sklearn.metrics import recall_score, precision_score
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
|
||||
class MNBInteractive:
|
||||
|
@ -17,7 +20,7 @@ class MNBInteractive:
|
|||
However, in practice, fractional counts such as tf-idf may also work.
|
||||
'''
|
||||
|
||||
def make_nb(labeled_data, unlabeled_data, sklearn_cv=False):
|
||||
def estimate_mnb(labeled_data, unlabeled_data, sklearn_cv=False):
|
||||
'''fits naive bayes model
|
||||
'''
|
||||
|
||||
|
@ -97,3 +100,112 @@ class MNBInteractive:
|
|||
|
||||
# return classes and vector of class estimates
|
||||
return classes, class_count, class_probs
|
||||
|
||||
def measure_mnb(X, y, sklearn_cv=False, percentile=100):
    '''Cross-validates a multinomial naive bayes model and prints metrics.

    The data is split with a 2-fold stratified, shuffled split (fixed
    random_state). For every fold the text is vectorized, reduced with
    SelectPercentile, a MultinomialNB classifier is fitted on the
    training part, and per-class recall, precision and F1 are computed
    on the testing part. Afterwards the per-class minimum, maximum and
    average over all folds are printed.

    Parameters:
        X: collection of texts; must support indexing with an integer
            index array (``X[train]``).
        y: class labels aligned with X; must support the same indexing.
        sklearn_cv: if True, sklearn's CountVectorizer builds the
            document term matrix, otherwise the project's BagOfWords
            implementation is used.
        percentile: percentage of features kept by SelectPercentile.

    Returns:
        None. All results are printed.
    '''
    # local import: numpy is not imported at file level
    import numpy as np

    print('# fitting model')
    print('# ...')

    # the vectorizer is only needed for the sklearn code path
    cv = CountVectorizer() if sklearn_cv else None

    # use stratified k-fold cross-validation as split method
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=5)

    classifier = MultinomialNB(alpha=1.0e-10,
                               fit_prior=False,
                               class_prior=None)

    # settings of the BagOfWords code path
    stemming = True
    rel_freq = True

    # metrics, one per-class array per fold
    recall_scores = []
    precision_scores = []
    f1_scores = []

    # for each fold
    for n, (train, test) in enumerate(skf.split(X, y), start=1):
        print('# split no. ' + str(n))

        if sklearn_cv:
            # use sklearn CountVectorizer
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X[train], y[train]).toarray()
            # transform testing data and return the matrix
            testing_data = cv.transform(X[test]).toarray()
        else:
            # use my own BagOfWords python implementation
            extracted_words = BagOfWords.extract_all_words(X[train])
            vocab = BagOfWords.make_vocab(extracted_words)

            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(extracted_words,
                                                   vocab, rel_freq, stemming)
            # transform testing data and return the matrix
            # (the vocabulary of the training data is reused)
            extracted_words = BagOfWords.extract_all_words(X[test])
            testing_data = BagOfWords.make_matrix(extracted_words,
                                                  vocab, rel_freq, stemming)

        # apply select percentile, fitted on the training data only
        selector = SelectPercentile(percentile=percentile)
        selector.fit(training_data, y[train])

        # new reduced data sets
        training_data_r = selector.transform(training_data)
        testing_data_r = selector.transform(testing_data)

        # fit classifier
        classifier.fit(training_data_r, y[train])
        # predict class
        predictions_test = classifier.predict(testing_data_r)

        # print and store metrics
        # average=None returns one score per class, not a single number
        rec = recall_score(y[test], predictions_test, average=None)
        print('rec: ' + str(rec))
        recall_scores.append(rec)
        prec = precision_score(y[test], predictions_test, average=None)
        print('prec: ' + str(prec))
        print('#')
        precision_scores.append(prec)

        # equation for f1 score; a class with precision + recall == 0
        # gets an f1 of 0 instead of a division by zero (nan)
        denominator = np.asarray(prec + rec, dtype=float)
        f1 = np.divide(2 * (prec * rec), denominator,
                       out=np.zeros_like(denominator),
                       where=denominator > 0)
        f1_scores.append(f1)

    ##########################
    # print metrics of test set
    print('-------------------------')
    print('prediction of testing set:')

    # The built-in min()/max() cannot compare per-class arrays (the
    # truth value of an array is ambiguous), so the folds are stacked
    # and reduced per class along the fold axis instead.
    for label, scores in (('Precision', precision_scores),
                          ('Recall', recall_scores),
                          ('F1', f1_scores)):
        # NOTE(review): assumes every fold yields scores for the same
        # classes, so that the arrays have equal length - confirm
        per_fold = np.vstack(scores)
        print('{} score: min = {}, max = {}, average = {}'
              .format(label,
                      per_fold.min(axis=0),
                      per_fold.max(axis=0),
                      per_fold.mean(axis=0)))
|
15
src/NER.py
15
src/NER.py
|
@ -208,14 +208,21 @@ class NER:
|
|||
'Russell Investments','Royal London Asset Management','Conservative party','Blom Bank','Banco Santander',
|
||||
'Guardian Money','Financial Services Agency','Munich Re','Banca Popolare di Vicenza','SoftBank', 'Sberbank',
|
||||
'Financial Conduct Authority','Qatar National Bank','Welt am Sonntag','Sueddeutsche Zeitung','Der Spiegel',
|
||||
'Bank of England', 'Bank of America Merrill Lynch', 'Barclays', 'London Metal Exchange', 'EMEA', 'G20',
|
||||
'Bank of England', 'Bank of America Merrill Lynch', 'Barclays', 'London Metal Exchange', 'EMEA', 'G20', 'The'
|
||||
'Petroleum Exporting Countries', 'Facebook Twitter Pinterest', 'Moody', 'Allianz', 'Citi', 'Bank', 'CME',
|
||||
'JPMorgan Chase &', 'Trade Alert', 'Abu Dhabi', 'MILAN', 'Journal', 'MSCI', 'KKR', 'CNBC', 'Feb', 'OECD',
|
||||
'Gulf Cooperation Council', 'Societe Generale', 'Takata', 'SEC', 'Republican', 'Energy Information Administration',
|
||||
'Organization of the Petroleum Exporting Countries', 'CBOE', 'LME', 'BOJ', 'BlackRock', 'Banco Popular',
|
||||
'United Nations', 'CET STOCKS Latest Previo Daily Change', 'Citibank', 'International Energy Agency',
|
||||
'Confederation of British Industry', 'American Petroleum Institute', 'Deutsche', 'United', 'Pentagon',
|
||||
'Southern District of New York']
|
||||
'United Nations', 'CET STOCKS Latest Previo Daily Change', 'Citibank', 'International Energy Agency', 'Office',
|
||||
'Confederation of British Industry', 'American Petroleum Institute', 'Deutsche', 'United', 'Pentagon', 'Lehman',
|
||||
'Southern District of New York', 'City Index', 'Hong Kong China Enterprises Index', 'Fitch Ratings Espana',
|
||||
'EIKON', 'First Capital Equities ( Pvt )', 'China Securities Journal', 'English Premier League', 'Allfunds Bank',
|
||||
'Bank Indonesia', 'Hong Kong Exchanges and Clearing', 'Fitch ) Fitch Ratings', 'University of Delaware',
|
||||
'University of British Columbia', 'Abu Dhabi Investment Authority', 'Bill & Melinda Gates Foundation',
|
||||
'Gates Foundation', 'Allfunds Bank', 'Bank Indonesia', 'Swedbank', 'Handelsbanken', 'Al Rajhi Bank', 'SAO PAULO',
|
||||
'National Weather Service', 'Clydesdale Bank', 'First Republic Bank', 'Tesco Bank', 'Alpha Bank', 'Bank of Spain',
|
||||
'Transatlantic Trade and Investment Partnership', 'Raiffeisen Bank International', 'Deutsche Boerse CEO',
|
||||
'Capital Bank', 'National Crime Agency', 'TD Bank']
|
||||
|
||||
for k, v in dict.items():
|
||||
for org in black_list:
|
||||
|
|
Loading…
Reference in New Issue