updated labeling
This commit is contained in:
parent
ee911377bf
commit
9367457199
File diff suppressed because one or more lines are too long
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
@ -94,7 +94,9 @@ class MNBInteractive:
|
||||||
# classes in order used
|
# classes in order used
|
||||||
classes = classifier.classes_
|
classes = classifier.classes_
|
||||||
|
|
||||||
|
class_count = classifier.class_count_
|
||||||
|
|
||||||
print('# MNB: ending multinomial naive bayes')
|
print('# MNB: ending multinomial naive bayes')
|
||||||
|
|
||||||
# return classes and vector of class estimates
|
# return classes and vector of class estimates
|
||||||
return classes, class_probs
|
return classes, class_count, class_probs
|
62
src/NER.py
62
src/NER.py
|
@ -193,23 +193,51 @@ class NER:
|
||||||
n_dict[next_highest[0]] = next_highest[1]
|
n_dict[next_highest[0]] = next_highest[1]
|
||||||
print(n_dict)
|
print(n_dict)
|
||||||
|
|
||||||
|
def remove_banks_from_dict(in_path='../obj/dict_articles_organizations.pkl',
                           out_path='../obj/dict_articles_organizations_without_banks.pkl'):
    ''' removes banks, news agencies and other organizations we do not need
    from the pickled article -> organizations dictionary and saves the
    filtered dictionary as a new pickle file.

    in_path:  pickle file holding the dictionary to filter
    out_path: pickle file the filtered dictionary is written to

    The defaults are the paths that were previously hard-coded, so calling
    the function without arguments behaves as before. Returns nothing.

    NOTE(review): each dictionary value is assumed to be a mutable
    collection offering `in` and `.remove()` (e.g. a list or a set of
    organization names) - confirm against the code that builds the pickle.
    '''
    # organizations that must not show up in the results
    black_list = ['Eastern and Southern African Trade and Development Bank', 'PTA Bank', 'Citigroup',
                  'Rand Merchant Bank', 'Banca Carige', 'World Bank', 'Bank of America', 'Deutsche Bank', 'HSBC', 'JP Morgan',
                  'Credit Suisse', 'JPMorgan', 'BNP Paribas', 'Goldman Sachs', 'Commerzbank', 'Deutsche Boerse', 'Handelsblatt',
                  'Sky News', 'Labour', 'UN', 'Bank of Japan', 'Goldman', 'Goldman Sachs Asset Management', 'New York Times',
                  'Bank of Scotland', 'World Economic Forum', 'Organisation for Economic Cooperation and Development',
                  'Russell Investments', 'Royal London Asset Management', 'Conservative party', 'Blom Bank', 'Banco Santander',
                  'Guardian Money', 'Financial Services Agency', 'Munich Re', 'Banca Popolare di Vicenza', 'SoftBank',
                  'Financial Conduct Authority', 'Qatar National Bank', 'Welt am Sonntag', 'Sueddeutsche Zeitung', 'Der Spiegel',
                  'Bank of England', 'Bank of America Merrill Lynch', 'Barclays', 'London Metal Exchange', 'Petroleum Exporting Countries']

    # load pickle object
    # NOTE: pickle.load can execute arbitrary code - only use it on files
    # this project has written itself.
    with open(in_path, 'rb') as infile:
        article_orgs = pickle.load(infile)

    for orgs in article_orgs.values():
        for org in black_list:
            # a plain .remove() only drops the first match, so repeat until
            # no occurrence of the unwanted organization is left
            while org in orgs:
                orgs.remove(org)

    # save new dict
    with open(out_path, 'wb') as outfile:
        pickle.dump(article_orgs, outfile, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
print('# starting NER...')
|
# print('# starting NER...')
|
||||||
print()
|
# print()
|
||||||
# read data set
|
# # read data set
|
||||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
# file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
df = pd.read_csv(file,
|
# df = pd.read_csv(file,
|
||||||
delimiter='|',
|
# delimiter='|',
|
||||||
header=None,
|
# header=None,
|
||||||
index_col=None,
|
# index_col=None,
|
||||||
engine='python',
|
# engine='python',
|
||||||
# usecols=[1,2],
|
# # usecols=[1,2],
|
||||||
# nrows=100,
|
# # nrows=100,
|
||||||
quoting=csv.QUOTE_NONNUMERIC,
|
# quoting=csv.QUOTE_NONNUMERIC,
|
||||||
quotechar='\'')
|
# quotechar='\'')
|
||||||
#print(df)
|
# #print(df)
|
||||||
texts = df[1] + '. ' + df[2]
|
# texts = df[1] + '. ' + df[2]
|
||||||
NER.make_article_orgs_dict(texts)
|
# NER.make_article_orgs_dict(texts)
|
||||||
# NER.show_most_common_companies()
|
# NER.show_most_common_companies()
|
||||||
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
|
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
|
||||||
|
|
||||||
|
NER.remove_banks_from_dict()
|
Loading…
Reference in New Issue