updated labeling
This commit is contained in:
parent
ee911377bf
commit
9367457199
File diff suppressed because one or more lines are too long
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -94,7 +94,9 @@ class MNBInteractive:
|
|||
# classes in order used
|
||||
classes = classifier.classes_
|
||||
|
||||
class_count = classifier.class_count_
|
||||
|
||||
print('# MNB: ending multinomial naive bayes')
|
||||
|
||||
# return classes and vector of class estimates
|
||||
return classes, class_probs
|
||||
return classes, class_count, class_probs
|
62
src/NER.py
62
src/NER.py
|
@ -193,23 +193,51 @@ class NER:
|
|||
n_dict[next_highest[0]] = next_highest[1]
|
||||
print(n_dict)
|
||||
|
||||
def remove_banks_from_dict(in_path='../obj/dict_articles_organizations.pkl',
                           out_path='../obj/dict_articles_organizations_without_banks.pkl'):
    ''' removes banks, news agencies and other organizations we do not need

    Loads the pickled dictionary at in_path, deletes every black-listed
    organization name from each of its values and pickles the result to
    out_path. The input file itself is left untouched.

    in_path:  pickle file holding the dictionary to filter
    out_path: pickle file the filtered dictionary is written to

    Each value must support `in` and `.remove()`; presumably it is the list
    of organization names found in one article -- TODO confirm against
    make_article_orgs_dict.
    '''
    # load pickle object
    # NOTE: pickle.load can execute arbitrary code, only use it on files
    # that were produced by this project.
    with open(in_path, 'rb') as f_in:
        article_orgs = pickle.load(f_in)

    black_list = ('Eastern and Southern African Trade and Development Bank', 'PTA Bank', 'Citigroup',
                  'Rand Merchant Bank', 'Banca Carige', 'World Bank', 'Bank of America', 'Deutsche Bank', 'HSBC', 'JP Morgan',
                  'Credit Suisse', 'JPMorgan', 'BNP Paribas', 'Goldman Sachs', 'Commerzbank', 'Deutsche Boerse', 'Handelsblatt',
                  'Sky News', 'Labour', 'UN', 'Bank of Japan', 'Goldman', 'Goldman Sachs Asset Management', 'New York Times',
                  'Bank of Scotland', 'World Economic Forum', 'Organisation for Economic Cooperation and Development',
                  'Russell Investments', 'Royal London Asset Management', 'Conservative party', 'Blom Bank', 'Banco Santander',
                  'Guardian Money', 'Financial Services Agency', 'Munich Re', 'Banca Popolare di Vicenza', 'SoftBank',
                  'Financial Conduct Authority', 'Qatar National Bank', 'Welt am Sonntag', 'Sueddeutsche Zeitung', 'Der Spiegel',
                  'Bank of England', 'Bank of America Merrill Lynch', 'Barclays', 'London Metal Exchange', 'Petroleum Exporting Countries')

    for orgs in article_orgs.values():
        for org in black_list:
            # remove() only drops the first match, so repeat until the
            # name is completely gone from this entry
            while org in orgs:
                orgs.remove(org)

    # save new dict
    with open(out_path, 'wb') as f_out:
        pickle.dump(article_orgs, f_out, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
if __name__ == '__main__':

    # Script entry point: run NER over the cleaned data set, then strip the
    # black-listed organizations from the resulting dictionary.
    print('# starting NER...')
    print()

    # read data set
    # forward slashes are accepted on Windows as well as POSIX and match the
    # '../obj/...' paths used elsewhere in this file
    file = '../data/cleaned_data_set_without_header.csv'
    df = pd.read_csv(file,
                     delimiter='|',
                     header=None,
                     index_col=None,
                     engine='python',
                     # usecols=[1,2],
                     # nrows=100,
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # one text per row: column 1 and column 2 joined by '. '
    texts = df[1] + '. ' + df[2]

    # NOTE(review): presumably this produces the pickle that
    # remove_banks_from_dict() reads -- confirm
    NER.make_article_orgs_dict(texts)

    # other calls kept for manual runs:
    # NER.show_most_common_companies()
    # print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))

    NER.remove_banks_from_dict()
|
Loading…
Reference in New Issue