updated labeling
parent
ee911377bf
commit
9367457199
File diff suppressed because one or more lines are too long
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -94,7 +94,9 @@ class MNBInteractive:
|
|||
# classes in order used
|
||||
classes = classifier.classes_
|
||||
|
||||
class_count = classifier.class_count_
|
||||
|
||||
print('# MNB: ending multinomial naive bayes')
|
||||
|
||||
# return classes and vector of class estimates
|
||||
return classes, class_probs
|
||||
return classes, class_count, class_probs
|
62
src/NER.py
62
src/NER.py
|
@ -193,23 +193,51 @@ class NER:
|
|||
n_dict[next_highest[0]] = next_highest[1]
|
||||
print(n_dict)
|
||||
|
||||
def remove_banks_from_dict():
    '''Remove banks, news agencies and other unwanted organizations.

    Loads the pickled mapping stored in
    '../obj/dict_articles_organizations.pkl', deletes every blacklisted
    organization name from each of its values (in place) and writes the
    filtered mapping to
    '../obj/dict_articles_organizations_without_banks.pkl'.

    Both paths are relative to the current working directory. Each value
    of the mapping must support ``in`` and ``.remove()`` (presumably a
    list of organization names per article -- confirm against the code
    that produces the pickle).
    '''
    black_list = ['Eastern and Southern African Trade and Development Bank', 'PTA Bank', 'Citigroup',
                  'Rand Merchant Bank', 'Banca Carige', 'World Bank', 'Bank of America', 'Deutsche Bank', 'HSBC', 'JP Morgan',
                  'Credit Suisse', 'JPMorgan', 'BNP Paribas', 'Goldman Sachs', 'Commerzbank', 'Deutsche Boerse', 'Handelsblatt',
                  'Sky News', 'Labour', 'UN', 'Bank of Japan', 'Goldman', 'Goldman Sachs Asset Management', 'New York Times',
                  'Bank of Scotland','World Economic Forum','Organisation for Economic Cooperation and Development',
                  'Russell Investments','Royal London Asset Management','Conservative party','Blom Bank','Banco Santander',
                  'Guardian Money','Financial Services Agency','Munich Re','Banca Popolare di Vicenza','SoftBank',
                  'Financial Conduct Authority','Qatar National Bank','Welt am Sonntag','Sueddeutsche Zeitung','Der Spiegel',
                  'Bank of England', 'Bank of America Merrill Lynch', 'Barclays', 'London Metal Exchange', 'Petroleum Exporting Countries']

    # load pickle object
    # NOTE(review): pickle.load can execute arbitrary code; only run this on
    # a pickle file produced by this project, never on untrusted input.
    with open('../obj/dict_articles_organizations.pkl', 'rb') as infile:
        orgs_by_article = pickle.load(infile)

    # Only the values are needed; the keys stay untouched.
    for orgs in orgs_by_article.values():
        for org in black_list:
            # Loop until the name is gone: .remove() on a list deletes only
            # the first match, which would leave duplicate entries behind.
            while org in orgs:
                orgs.remove(org)

    # save new dict
    with open('../obj/dict_articles_organizations_without_banks.pkl', 'wb') as f:
        pickle.dump(orgs_by_article, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
print('# starting NER...')
|
||||
print()
|
||||
# read data set
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
index_col=None,
|
||||
engine='python',
|
||||
# usecols=[1,2],
|
||||
# nrows=100,
|
||||
quoting=csv.QUOTE_NONNUMERIC,
|
||||
quotechar='\'')
|
||||
#print(df)
|
||||
texts = df[1] + '. ' + df[2]
|
||||
NER.make_article_orgs_dict(texts)
|
||||
# print('# starting NER...')
|
||||
# print()
|
||||
# # read data set
|
||||
# file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
# df = pd.read_csv(file,
|
||||
# delimiter='|',
|
||||
# header=None,
|
||||
# index_col=None,
|
||||
# engine='python',
|
||||
# # usecols=[1,2],
|
||||
# # nrows=100,
|
||||
# quoting=csv.QUOTE_NONNUMERIC,
|
||||
# quotechar='\'')
|
||||
# #print(df)
|
||||
# texts = df[1] + '. ' + df[2]
|
||||
# NER.make_article_orgs_dict(texts)
|
||||
# NER.show_most_common_companies()
|
||||
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
|
||||
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
|
||||
|
||||
NER.remove_banks_from_dict()
|
Loading…
Reference in New Issue