'''
Named Entity Recognition (NER)
==============================

Stanford NER takes a text as input and returns a list of entities
like persons, organizations and countries.
'''
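
# Illustrative example (hypothetical input, not part of the original code):
# for a sentence like "Siemens AG chief executive met Angela Merkel in Berlin",
# the Stanford 3-class tagger is expected to label tokens roughly as
# ORGANIZATION ("Siemens", "AG"), PERSON ("Angela", "Merkel") and
# LOCATION ("Berlin"); all remaining tokens receive the tag "O".
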
import os

import matplotlib.pyplot as plt
import numpy as np
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

from CsvHandler import CsvHandler


class NER:

    # TODO: complete these lists
    # some legal entity types
    company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', 'Plc', 'LLC',
                       'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB']

    # some entities that are not companies
    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
            'European Commission', 'EU', 'Staff', 'Min', 'Read',
            'Thomson Reuters Trust Principles', 'New York Stock Exchange',
            'NYSE']

    @staticmethod
    def tag_words(text):
        '''param: raw article text
        returns: list of (word, tag) tuples for every token in the text
        '''
        # paths to the Stanford NER classifier and jar
        stanford_classifier = 'stanford-ner-2018-02-27'\
                              '\\classifiers'\
                              '\\english.all.3class.distsim.crf.ser.gz'
        stanford_ner_path = 'stanford-ner-2018-02-27'\
                            '\\stanford-ner.jar'
        # create tagger object
        st = StanfordNERTagger(stanford_classifier, stanford_ner_path,
                               encoding='utf-8')

        tokenized_text = word_tokenize(text)
        # list of tuples (word, tag)
        tagged_words = st.tag(tokenized_text)
        return tagged_words
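
    # Illustrative example (hypothetical output, assuming the classifier and
    # jar paths above are valid): NER.tag_words("Apple hired Tim Cook")
    # might return [('Apple', 'ORGANIZATION'), ('hired', 'O'),
    # ('Tim', 'PERSON'), ('Cook', 'PERSON')].
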

    @staticmethod
    def get_coherent_tags(tagged_words):
        '''group consecutive ORGANIZATION tokens into multi-word chunks'''
        continuous_chunks = []
        current_chunks = []

        for token, tag in tagged_words:
            if tag == "ORGANIZATION" and token not in NER.company_abbrevs:
                current_chunks.append((token, tag))
            elif current_chunks:
                # the current chunk has ended: move it into continuous_chunks
                continuous_chunks.append(current_chunks)
                current_chunks = []
        # do not drop a chunk that runs up to the end of the text
        if current_chunks:
            continuous_chunks.append(current_chunks)
        return continuous_chunks
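
    # Illustrative example (hypothetical input): for tagged words
    # [('Deutsche', 'ORGANIZATION'), ('Bank', 'ORGANIZATION'), ('said', 'O')]
    # the method is expected to return a single chunk,
    # [[('Deutsche', 'ORGANIZATION'), ('Bank', 'ORGANIZATION')]].
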

    @staticmethod
    def plot_histogram(count_names):
        '''pyplot histogram of the distribution of company mentions

        x-axis: number of articles that mention a company
        y-axis: number of companies with that many mentions
        '''
        # sort list in descending order
        count_names.sort(reverse=True)
        # convert list to array
        names = np.asarray(count_names)
        plt.title('Company mentions in News Articles')
        plt.xlabel('Count of articles that mention a company')
        # number of companies with this number of mentions
        plt.ylabel('Number of companies with this number of articles')
        num_bins = 50
        plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
        # plt.grid(True)
        plt.show()

    @staticmethod
    def find_companies(text):
        '''param: article text where organizations must be identified
        returns: list of identified organizations as strings
        '''
        # set path to the local Java runtime (needed by the Stanford tagger)
        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
        os.environ['JAVAHOME'] = java_path

        seen = set()
        organizations = []
        # create list of (word, tag) tuples
        tagged_words = NER.tag_words(text)
        # put coherent names together
        nes = NER.get_coherent_tags(tagged_words)
        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1])
                        for ne in nes]
        # print(nes_coherent)
        for name, tag in nes_coherent:
            # keep the name if it is a company and not already in the list
            if name not in NER.misc and name not in seen:
                organizations.append(name)
                seen.add(name)
        return organizations
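
    # Illustrative example (hypothetical input): for a text such as
    # "Reuters reports that Deutsche Bank and Siemens AG cut costs",
    # find_companies is expected to return ['Deutsche Bank', 'Siemens'];
    # 'Reuters' is filtered out via NER.misc and the abbreviation 'AG'
    # via NER.company_abbrevs.
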

    @staticmethod
    def count_companies(texts):
        '''param: list of all article texts
        returns: list of company mention counts as ints
        '''
        # dictionary of companies with their count
        dict_com = {}
        for text in texts:
            # list of companies found in the article
            coms = NER.find_companies(text)
            for com in coms:
                if com in dict_com:
                    dict_com[com] += 1
                else:
                    dict_com[com] = 1
        # # print outlier
        # print(max(dict_com, key=dict_com.get))
        return list(dict_com.values())
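
    # Illustrative example (hypothetical data): if 'Siemens' is found in three
    # articles and 'Deutsche Bank' in one, count_companies returns the counts
    # without the names, e.g. [3, 1] (order follows dictionary insertion).
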


if __name__ == '__main__':

    filepath = 'classification_labelled_corrected.csv'
    df = CsvHandler.read_csv(filepath)

    # only articles with label==1
    df_hits = df[df['Label'] == 1]

    texts = df_hits['Title'] + ' ' + df_hits['Text']

    # # read a few articles to check the results
    # for text in texts[10:20]:
    #     print(text)
    #     print()
    #     print(NER.find_companies(text))
    #     print()

    # count names in hit articles
    count_names = NER.count_companies(texts)

    # plot diagram
    NER.plot_histogram(count_names)