updated labeling

master
Anne Lorenz 2018-12-21 13:50:58 +01:00
parent ee911377bf
commit 9367457199
5 changed files with 534 additions and 986 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -94,7 +94,9 @@ class MNBInteractive:
# classes in order used
classes = classifier.classes_
class_count = classifier.class_count_
print('# MNB: ending multinomial naive bayes')
# return classes and vector of class estimates
return classes, class_probs
return classes, class_count, class_probs

View File

@ -193,23 +193,51 @@ class NER:
n_dict[next_highest[0]] = next_highest[1]
print(n_dict)
def remove_banks_from_dict():
    ''' removes banks, news agencies and other organizations we do not need

    Loads the pickled dictionary '../obj/dict_articles_organizations.pkl',
    whose values are collections of organization names, deletes every
    black-listed name from each of those collections (in place) and saves
    the cleaned dictionary as
    '../obj/dict_articles_organizations_without_banks.pkl'.
    Both paths are relative to the current working directory.
    '''
    black_list = ['Eastern and Southern African Trade and Development Bank', 'PTA Bank', 'Citigroup',
                  'Rand Merchant Bank', 'Banca Carige', 'World Bank', 'Bank of America', 'Deutsche Bank', 'HSBC', 'JP Morgan',
                  'Credit Suisse', 'JPMorgan', 'BNP Paribas', 'Goldman Sachs', 'Commerzbank', 'Deutsche Boerse', 'Handelsblatt',
                  'Sky News', 'Labour', 'UN', 'Bank of Japan', 'Goldman', 'Goldman Sachs Asset Management', 'New York Times',
                  'Bank of Scotland','World Economic Forum','Organisation for Economic Cooperation and Development',
                  'Russell Investments','Royal London Asset Management','Conservative party','Blom Bank','Banco Santander',
                  'Guardian Money','Financial Services Agency','Munich Re','Banca Popolare di Vicenza','SoftBank',
                  'Financial Conduct Authority','Qatar National Bank','Welt am Sonntag','Sueddeutsche Zeitung','Der Spiegel',
                  'Bank of England', 'Bank of America Merrill Lynch', 'Barclays', 'London Metal Exchange', 'Petroleum Exporting Countries']
    # load pickle object
    # NOTE: unpickling can execute arbitrary code, so this file must only
    # ever be one that was written by this project itself.
    with open('../obj/dict_articles_organizations.pkl', 'rb') as f_in:
        article_orgs = pickle.load(f_in)
    for orgs in article_orgs.values():
        for org in black_list:
            # remove() deletes a single match only; repeat so that a name
            # listed several times for one entry disappears completely
            while org in orgs:
                orgs.remove(org)
    # save new dict
    with open('../obj/dict_articles_organizations_without_banks.pkl', 'wb') as f_out:
        pickle.dump(article_orgs, f_out, pickle.HIGHEST_PROTOCOL)
# Script entry point: the statements below run only when this file is executed
# directly, not when it is imported.
# NOTE(review): this span was taken from a rendered diff without +/- markers;
# the active statements and the commented-out copy that follows look like the
# old and new versions of the same lines -- confirm against the committed file.
if __name__ == '__main__':
print('# starting NER...')
print()
# read data set
file = '..\\data\\cleaned_data_set_without_header.csv'
# the file is parsed as '|'-delimited, without a header row, with
# single-quote quoting
df = pd.read_csv(file,
delimiter='|',
header=None,
index_col=None,
engine='python',
# usecols=[1,2],
# nrows=100,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
#print(df)
# join columns 1 and 2 of every row with '. ' into one text
# (presumably title and body of an article -- TODO confirm)
texts = df[1] + '. ' + df[2]
NER.make_article_orgs_dict(texts)
# print('# starting NER...')
# print()
# # read data set
# file = '..\\data\\cleaned_data_set_without_header.csv'
# df = pd.read_csv(file,
# delimiter='|',
# header=None,
# index_col=None,
# engine='python',
# # usecols=[1,2],
# # nrows=100,
# quoting=csv.QUOTE_NONNUMERIC,
# quotechar='\'')
# #print(df)
# texts = df[1] + '. ' + df[2]
# NER.make_article_orgs_dict(texts)
# NER.show_most_common_companies()
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
# strip the black-listed organizations from the pickled dictionary and save
# the result as a new pickle file
NER.remove_banks_from_dict()