changed plot to histogram

This commit is contained in:
Anne Lorenz 2018-09-27 09:20:07 +02:00
parent 8d6af51409
commit 549f21657c
1 changed files with 20 additions and 13 deletions

33
NER.py
View File

@ -15,15 +15,17 @@ from CsvHandler import CsvHandler
class NER:
# legal entity types
# toDo: complete lists:
# some legal entity types
company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP',
'Plc', 'LLC', 'LBO', 'IPO', 'HQ',
'CIO', 'NGO', 'AB']
# entities that are not companies
# some entities that are not companies
misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist',
'Cnn', 'European Commission', 'EU', 'Staff', 'Min', 'Read',
'Thomson Reuters Trust Principles']
'Thomson Reuters Trust Principles', 'New York Stock Exchange',
'NYSE']
def tag_words(text):
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
@ -49,17 +51,22 @@ class NER:
current_chunks = []
return continuous_chunks
def plot_barchart(count_names):
def plot_histogram(count_names, num_bins=50):
    '''pyplot histogram of company mentions
    in input news articles
    x-axis: number of mentions of a company
    y-axis: number of companies with this number of mentions

    count_names: iterable of per-company mention counts, e.g. a list
                 or a dict view such as dict.values(); not modified
    num_bins: number of histogram bins (default 50)
    '''
    # sorted() returns a new list in descending order. list.sort() sorts
    # in place and returns None, and a dict view has no sort() method,
    # so the previous 'count_names.sort(reverse=True)' handed None (or an
    # AttributeError) to the histogram call.
    counts = sorted(count_names, reverse=True)
    plt.title('Company mentions in News Articles')
    plt.xlabel('Number of mentions of the company')
    # Number of companies with this number of mentions
    plt.ylabel('Frequency')
    plt.hist(counts, num_bins, facecolor='blue', alpha=0.5)
    plt.show()
def find_companies(text):
@ -99,8 +106,8 @@ class NER:
dict_com[com] += 1
else:
dict_com[com] = 1
# print outlier (value 38)
print(max(dict_com, key=dict_com.get))
# # print outlier
# print(max(dict_com, key=dict_com.get))
return dict_com.values()
if __name__ == '__main__':
@ -114,7 +121,7 @@ if __name__ == '__main__':
texts = df_hits['Title'] + ' ' + df_hits['Text']
# # read through to check
# for text in texts[5:10]:
# for text in texts[10:20]:
# print(text)
# print()
# print(NER.find_companies(text))
@ -124,4 +131,4 @@ if __name__ == '__main__':
count_names = NER.count_companies(texts)
# plot diagram
NER.plot_barchart(count_names)
NER.plot_histogram(count_names)