changed plot to histogram
This commit is contained in:
parent
8d6af51409
commit
549f21657c
33
NER.py
33
NER.py
|
@ -15,15 +15,17 @@ from CsvHandler import CsvHandler
|
|||
|
||||
class NER:
|
||||
|
||||
# legal entity types
|
||||
# toDo: complete lists:
|
||||
# some legal entity types
|
||||
company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP',
|
||||
'Plc', 'LLC', 'LBO', 'IPO', 'HQ',
|
||||
'CIO', 'NGO', 'AB']
|
||||
|
||||
# entities that are not companies
|
||||
# some entities that are not companies
|
||||
misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist',
|
||||
'Cnn', 'European Commission', 'EU', 'Staff', 'Min', 'Read',
|
||||
'Thomson Reuters Trust Principles']
|
||||
'Thomson Reuters Trust Principles', 'New York Stock Exchange',
|
||||
'NYSE']
|
||||
|
||||
def tag_words(text):
|
||||
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
|
||||
|
@ -49,17 +51,22 @@ class NER:
|
|||
current_chunks = []
|
||||
return continuous_chunks
|
||||
|
||||
def plot_barchart(count_names):
|
||||
def plot_histogram(count_names):
    '''pyplot histogram of company mentions
    in input news articles

    count_names: iterable of mention counts, one entry per company
                 (any iterable works, e.g. a dict values view)

    x-axis: number of mentions of a company
    y-axis: frequency (number of companies with this number of mentions)
    '''
    # sorted() accepts any iterable and returns a new list in descending
    # order; list.sort() sorts in place and returns None, so its result
    # must never be handed to plt.hist (and dict views have no .sort()).
    counts = sorted(count_names, reverse=True)
    plt.title('Company mentions in News Articles')
    plt.xlabel('Number of mentions of the company')
    # Number of companies with this number of mentions
    plt.ylabel('Frequency')
    num_bins = 50
    plt.hist(counts, num_bins, facecolor='blue', alpha=0.5)
    plt.show()
|
||||
|
||||
def find_companies(text):
|
||||
|
@ -99,8 +106,8 @@ class NER:
|
|||
dict_com[com] += 1
|
||||
else:
|
||||
dict_com[com] = 1
|
||||
# print outlier (value 38)
|
||||
print(max(dict_com, key=dict_com.get))
|
||||
# # print outlier
|
||||
# print(max(dict_com, key=dict_com.get))
|
||||
return dict_com.values()
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -114,7 +121,7 @@ if __name__ == '__main__':
|
|||
texts = df_hits['Title'] + ' ' + df_hits['Text']
|
||||
|
||||
# # read through to check
|
||||
# for text in texts[5:10]:
|
||||
# for text in texts[10:20]:
|
||||
# print(text)
|
||||
# print()
|
||||
# print(NER.find_companies(text))
|
||||
|
@ -124,4 +131,4 @@ if __name__ == '__main__':
|
|||
count_names = NER.count_companies(texts)
|
||||
|
||||
# plot diagram
|
||||
NER.plot_barchart(count_names)
|
||||
NER.plot_histogram(count_names)
|
Loading…
Reference in New Issue