changed plot to histogram

This commit is contained in:
Anne Lorenz 2018-09-27 09:20:07 +02:00
parent 8d6af51409
commit 549f21657c
1 changed file with 20 additions and 13 deletions

33
NER.py
View File

@ -15,15 +15,17 @@ from CsvHandler import CsvHandler
class NER: class NER:
# legal entity types # toDo: complete lists:
# some legal entity types
company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP',
'Plc', 'LLC', 'LBO', 'IPO', 'HQ', 'Plc', 'LLC', 'LBO', 'IPO', 'HQ',
'CIO', 'NGO', 'AB'] 'CIO', 'NGO', 'AB']
# entities that are not companies # some entities that are not companies
misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist',
'Cnn', 'European Commission', 'EU', 'Staff', 'Min', 'Read', 'Cnn', 'European Commission', 'EU', 'Staff', 'Min', 'Read',
'Thomson Reuters Trust Principles'] 'Thomson Reuters Trust Principles', 'New York Stock Exchange',
'NYSE']
def tag_words(text): def tag_words(text):
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz' stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
@ -49,17 +51,22 @@ class NER:
current_chunks = [] current_chunks = []
return continuous_chunks return continuous_chunks
def plot_histogram(count_names, num_bins=50):
    '''pyplot histogram of company mention counts
    in input news articles
    x-axis:number of mentions of a company
    y-axis:frequency (number of companies with this number of mentions)

    count_names: iterable of per-company mention counts; any iterable
    works, including a dict values view
    num_bins: number of histogram bins (default 50)
    '''
    # list.sort() sorts in place and returns None, and a dict values
    # view has no sort() at all; sorted() returns a new list in both
    # cases and leaves the caller's data untouched
    counts = sorted(count_names, reverse=True)
    plt.title('Company mentions in News Articles')
    plt.xlabel('Number of mentions of the company')
    # Number of companies with this number of mentions
    plt.ylabel('Frequency')
    plt.hist(counts, num_bins, facecolor='blue', alpha=0.5)
    plt.show()
def find_companies(text): def find_companies(text):
@ -99,8 +106,8 @@ class NER:
dict_com[com] += 1 dict_com[com] += 1
else: else:
dict_com[com] = 1 dict_com[com] = 1
# print outlier (value 38) # # print outlier
print(max(dict_com, key=dict_com.get)) # print(max(dict_com, key=dict_com.get))
return dict_com.values() return dict_com.values()
if __name__ == '__main__': if __name__ == '__main__':
@ -114,7 +121,7 @@ if __name__ == '__main__':
texts = df_hits['Title'] + ' ' + df_hits['Text'] texts = df_hits['Title'] + ' ' + df_hits['Text']
# # zum prüfen lesen # # zum prüfen lesen
# for text in texts[5:10]: # for text in texts[10:20]:
# print(text) # print(text)
# print() # print()
# print(NER.find_companies(text)) # print(NER.find_companies(text))
@ -124,4 +131,4 @@ if __name__ == '__main__':
count_names = NER.count_companies(texts) count_names = NER.count_companies(texts)
# plot diagram # plot diagram
NER.plot_barchart(count_names) NER.plot_histogram(count_names)