thesis-anne/NER.py

'''
Named Entity Recognition (NER)
==============================
Stanford NER takes a text as input and returns a list of recognized
entities such as persons, organizations and locations (e.g. countries),
one (word, tag) tuple per token, e.g. ('Reuters', 'ORGANIZATION').
'''
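
# Note: the paths below assume that the Stanford NER distribution
# (stanford-ner-2018-02-27) lies next to this script and that a local Java
# runtime is installed; adjust stanford_classifier, stanford_ner_path and
# JAVAHOME in this file if your setup differs.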
import os

import matplotlib.pyplot as plt
import numpy as np
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

from CsvHandler import CsvHandler


class NER:

    # TODO: complete lists:
    # some legal entity types
    company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', 'Plc', 'LLC',
                       'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB']

    # some entities that are not companies
    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
            'European Commission', 'EU', 'Staff', 'Min', 'Read',
            'Thomson Reuters Trust Principles', 'New York Stock Exchange',
            'NYSE']

    @staticmethod
    def tag_words(text):
        # path to Stanford NER
        stanford_classifier = 'stanford-ner-2018-02-27' \
                              '\\classifiers' \
                              '\\english.all.3class.distsim.crf.ser.gz'
        stanford_ner_path = 'stanford-ner-2018-02-27' \
                            '\\stanford-ner.jar'
        # create tagger object
        st = StanfordNERTagger(stanford_classifier, stanford_ner_path,
                               encoding='utf-8')
        tokenized_text = word_tokenize(text)
        # list of tuples (word, tag)
        tagged_words = st.tag(tokenized_text)
        return tagged_words
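
    # Illustrative sketch of the tagger output (not taken from a real run):
    # for the tokens of "Reuters reported from Berlin", st.tag() yields pairs
    # such as [('Reuters', 'ORGANIZATION'), ('reported', 'O'), ('from', 'O'),
    # ('Berlin', 'LOCATION')]; tokens outside any entity are tagged 'O'.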

    @staticmethod
    def get_coherent_tags(tagged_words):
        continuous_chunks = []
        current_chunks = []
        for token, tag in tagged_words:
            if tag == "ORGANIZATION" and token not in NER.company_abbrevs:
                current_chunks.append((token, tag))
            elif current_chunks:
                # put the finished chunk into continuous_chunks
                continuous_chunks.append(current_chunks)
                current_chunks = []
        # keep a chunk that ends with the last token of the text
        if current_chunks:
            continuous_chunks.append(current_chunks)
        return continuous_chunks
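
    # Illustrative sketch: [('Deutsche', 'ORGANIZATION'),
    # ('Bank', 'ORGANIZATION'), ('said', 'O')] is grouped into one chunk
    # [('Deutsche', 'ORGANIZATION'), ('Bank', 'ORGANIZATION')], so consecutive
    # ORGANIZATION tokens are later joined into the single name 'Deutsche Bank'.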

    @staticmethod
    def plot_histogram(count_names):
        '''pyplot histogram of the company-name mention distribution
        (absolute counts, not a normalized density)
        x-axis: number of mentions of a company
        y-axis: number of companies with that many mentions
        '''
        # sort list in descending order
        count_names.sort(reverse=True)
        # convert list to array
        names = np.asarray(count_names)
        plt.title('Company mentions in News Articles')
        plt.xlabel('Count of articles that mention a company')
        # number of companies with this number of mentions
        plt.ylabel('Number of companies with this number of articles')
        num_bins = 50
        n, bins, patches = plt.hist(names, num_bins,
                                    facecolor='blue', alpha=0.5)
        # plt.grid(True)
        plt.show()
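
    # Usage sketch with made-up counts: NER.plot_histogram([12, 5, 3, 1, 1, 1])
    # opens a matplotlib window with a 50-bin histogram of these values.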

    @staticmethod
    def find_companies(text):
        '''param: article text in which organizations are to be identified
        returns: list of identified organizations as strings
        '''
        # set paths
        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
        os.environ['JAVAHOME'] = java_path
        seen = set()
        organizations = []
        # create list of (word, tag) tuples
        tagged_words = NER.tag_words(text)
        # put coherent names together
        nes = NER.get_coherent_tags(tagged_words)
        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1])
                        for ne in nes]
        # print(nes_coherent)
        for name, tag in nes_coherent:
            # keep the name if it is a company and not already in the list
            if (name not in NER.misc) and (name not in seen):
                organizations.append(name)
                seen.add(name)
        return organizations
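
    # Illustrative sketch: for a text mentioning "Deutsche Bank AG" and
    # "Reuters", find_companies returns ['Deutsche Bank']; 'AG' is dropped via
    # company_abbrevs, 'Reuters' is skipped via misc, and repeated names are
    # filtered through the seen set.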

    @staticmethod
    def count_companies(texts):
        '''param: list of all article texts
        returns: list of company counts as ints
        '''
        # dictionary of companies with their count
        dict_com = {}
        for text in texts:
            # list of companies found in the article
            coms = NER.find_companies(text)
            for com in coms:
                if com in dict_com:
                    dict_com[com] += 1
                else:
                    dict_com[com] = 1
        # # print outlier
        # print(max(dict_com, key=dict_com.get))
        return list(dict_com.values())
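
    # Illustrative sketch: if two articles mention 'Siemens' and one of them
    # also mentions 'BASF', count_companies returns [2, 1] (counts only; the
    # company names themselves are discarded).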


if __name__ == '__main__':
    filepath = 'classification_labelled_corrected.csv'
    df = CsvHandler.read_csv(filepath)
    # only articles with label == 1
    df_hits = df[df['Label'] == 1]
    texts = df_hits['Title'] + ' ' + df_hits['Text']
    # # read a few articles for checking
    # for text in texts[10:20]:
    #     print(text)
    #     print()
    #     print(NER.find_companies(text))
    #     print()
    # count names in hit articles
    count_names = NER.count_companies(texts)
    # plot diagram
    NER.plot_histogram(count_names)
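
# Assumptions about the input (not verified here): CsvHandler.read_csv is
# expected to return a pandas DataFrame with at least the columns 'Label',
# 'Title' and 'Text', where Label == 1 marks the relevant ("hit") articles.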