improved NER.py

Anne Lorenz 2018-11-07 11:51:54 +01:00
parent 2243a50ed0
commit 61fbdb1059
6 changed files with 150 additions and 4441 deletions

NER.py

@@ -5,10 +5,7 @@ Named Entity Recognition (NER)
Stanford NER takes a text as input and returns a list of entities
like persons, organizations and countries, e.g.
'''
# TODO: complete list of legal entity types
# 'Amazon' not recognized as organization
from collections import OrderedDict
import csv
import os
@@ -21,26 +18,24 @@ import re
class NER:
# common company abbreviations to be stripped
company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups',
'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
's.r.l.', 'Holding', 'Holdings']
's.r.l.', 'Holding', 'Holdings', 'GmbH', 'plc', 'Incs',
'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC',
'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP']
# some entities and misc that are not companies
misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
'NYSE', 'DAX', 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
'Autonomous Community of Asturias', 'Fitch Ratings Espana',
'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
'National Federation of Independent Business', 'Barclays',
'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
regex = r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*|\
.*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*|\
.*Republican.*|Goldman( Sachs)?|.*Chamber.*|.*Department.*'
# organizations that are not companies
regex = r'.*Reuters.*|.*Ministry.*|.*Trump.*|.*Commission.*|.*BRIEF.*|\
|.*Department.*|.*House.*|.*Congress.*|.*IMF.*|.*Senate.*|.*OPEC.*|\
|.*Republican.*|.*Chamber.*|.*Court.*|.*Committee.*|.*Stock.*|\
|.*Financial Times.*|.*Bloomberg.*|.*The Economist.*|\
|.*Cnn.*|.*EU.*|.*Staff.*|.*Min.*|.*Read.*|.*SRF.*|.*Eikon.*|\
|.*NYSE.*|.*DAX.*|.*ECB.*|.*NAFTA.*|.*Treasury.*|.*Federation.*|\
|.*Federal.*|.*Muslim.*|.*Fund.*|.*FT House.*|.*Hongkong.*|\
|.*Street.*|.*Str.*|.*St.*|.*AFS.*|.*Barcelona.*|.*Fed.*|\
|.*U.N.*|.*European.*|.*U.S.*|.*Community.*'
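A minimal sketch of how this blacklist is meant to work, with made-up candidate names; any name the pattern matches is dropped:

import re

# abbreviated copy of the blacklist pattern above
regex = r'.*Reuters.*|.*Ministry.*|.*Congress.*|.*Fund.*'

# hypothetical candidates as returned by the tagger
candidates = ['Siemens', 'Thomson Reuters', 'International Monetary Fund']

# keep only names the blacklist does not match
organizations = [c for c in candidates if re.search(regex, c) is None]
print(organizations)  # ['Siemens']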
def tag_words(text):
# path to Stanford NER
@@ -75,10 +70,6 @@ class NER:
'''param: article text where organizations must be identified
returns: list of identified organizations as strings
'''
# print(text)
# print()
# print('# examining article...')
# print()
# set paths
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
os.environ['JAVAHOME'] = java_path
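For context, a runnable sketch of the usual nltk wiring for Stanford NER; the model and jar locations are assumptions and must match the local install (requires a Java runtime and nltk's punkt data):

import os
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# hypothetical install locations; adjust to the local machine
os.environ['JAVAHOME'] = 'C:\\Program Files (x86)\\Java\\jre1.8.0_181'
model = 'stanford-ner\\classifiers\\english.all.3class.distsim.crf.ser.gz'
jar = 'stanford-ner\\stanford-ner.jar'

tagger = StanfordNERTagger(model, jar, encoding='utf8')
tokens = word_tokenize('JPMorgan Chase said profits rose.')
print(tagger.tag(tokens))
# e.g. [('JPMorgan', 'ORGANIZATION'), ('Chase', 'ORGANIZATION'), ...]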
@@ -93,15 +84,13 @@
#print(nes_coherent)
for tuple in nes_coherent:
# check if company and not already in list
if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
and (not re.search(NER.regex, tuple[0])):
if (tuple[0] not in seen) and (re.search(NER.regex, tuple[0]) is None):
organizations.append(tuple[0])
seen.add(tuple[0])
print('# recognized the following organizations:')
print()
print(organizations)
print()
print()
return organizations
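nes_coherent is built further up in the file and is not shown in this diff; an assumed reconstruction of that step, merging consecutive tokens that share a non-O tag into multi-word entities:

def group_entities(tagged):
    '''merge consecutive tokens sharing a non-O tag into multi-word
    entities; tagged is a list of (token, tag) pairs'''
    entities, current, current_tag = [], [], None
    for token, tag in tagged:
        if tag != 'O' and tag == current_tag:
            current.append(token)
        else:
            if current:
                entities.append((' '.join(current), current_tag))
            current = [token] if tag != 'O' else []
            current_tag = tag if tag != 'O' else None
    if current:
        entities.append((' '.join(current), current_tag))
    return entities

tagged = [('JPMorgan', 'ORGANIZATION'), ('Chase', 'ORGANIZATION'),
          ('said', 'O'), ('Jamie', 'PERSON'), ('Dimon', 'PERSON')]
print(group_entities(tagged))
# [('JPMorgan Chase', 'ORGANIZATION'), ('Jamie Dimon', 'PERSON')]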
def count_companies(texts):
@@ -147,6 +136,22 @@ class NER:
# print(max(dict_com, key=dict_com.get))
return list(dict_com.values())
def show_most_common_companies(n_commons=50):
# load pickle object
with open('obj/dict_organizations.pkl', 'rb') as input:
dict = pickle.load(input)
# sort dict by value
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
reverse=True))
# return n highest values as dict (word => count)
n_dict = {}
for i in range(n_commons):
# next highest score
next_highest = o_dict.popitem(last=False)
n_dict[next_highest[0]] = next_highest[1]
print(n_dict)
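The sort-then-popitem pattern used above, in isolation with made-up counts; popitem(last=False) pops the front of the OrderedDict, i.e. the current maximum:

from collections import OrderedDict

counts = {'Apple': 41, 'Siemens': 7, 'BP': 19, 'Tesla': 3}
# sort by count, descending
o_dict = OrderedDict(sorted(counts.items(), key=lambda t: t[1],
                            reverse=True))
n_dict = {}
for _ in range(2):
    word, count = o_dict.popitem(last=False)
    n_dict[word] = count
print(n_dict)  # {'Apple': 41, 'BP': 19}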
if __name__ == '__main__':
print('# starting NER...')
print()
@@ -164,3 +169,4 @@ if __name__ == '__main__':
#print(df)
texts = df[1] + '. ' + df[2]
NER.count_companies(texts)
# NER.show_most_common_companies()
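The pickled dict produced here is what the visualizer below reads back. A minimal round trip, assuming the obj/ directory exists and using made-up counts:

import pickle

dict_organizations = {'Siemens': 12, 'BP': 5}

with open('obj/dict_organizations.pkl', 'wb') as f:
    pickle.dump(dict_organizations, f, pickle.HIGHEST_PROTOCOL)

with open('obj/dict_organizations.pkl', 'rb') as f:
    restored = pickle.load(f)

print(restored == dict_organizations)  # True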

VisualizerNews.py

@@ -7,6 +7,7 @@ Generating a square wordcloud with most common words of input data set.
from BagOfWords import BagOfWords
from NER import NER
from collections import OrderedDict
import csv
from datetime import datetime
from os import path
@@ -41,7 +42,7 @@ class VisualizerNews:
quotechar='\'')
corpus = df_dataset[1] + '. ' + df_dataset[2]
stemming = False
stemming = True
rel_freq = True
# find most common words in dataset
@@ -52,8 +53,8 @@
dict = BagOfWords.make_dict_common_words(matrix, 200,
rel_freq, stemming)
# save dict object
with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
with open('obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
wordcloud = WordCloud(background_color='white',
width=2400,
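The WordCloud construction is cut off by the diff context; a self-contained sketch of the overall pattern with the wordcloud package, using made-up frequencies (background color and width are from the diff, the height is an assumption):

import matplotlib.pyplot as plt
from wordcloud import WordCloud

freqs = {'trade': 0.9, 'merger': 0.7, 'tariff': 0.5, 'profit': 0.4}
wc = WordCloud(background_color='white', width=2400, height=1300)
wc.generate_from_frequencies(freqs)

plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()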
@@ -80,38 +81,52 @@
'''
print('# preparing histogram of company mentions...')
print()
# read data set
file = 'data\\cleaned_data_set_without_header.csv'
df = pd.read_csv(file,
delimiter='|',
header=None,
index_col=None,
engine='python',
usecols=[1,2],
#nrows=10,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
# # read data set
# file = 'data\\cleaned_data_set_without_header.csv'
# df = pd.read_csv(file,
# delimiter='|',
# header=None,
# index_col=None,
# engine='python',
# usecols=[1,2],
# #nrows=10,
# quoting=csv.QUOTE_NONNUMERIC,
# quotechar='\'')
# # only articles with label==1
# df_hits = df[df['Label'] == 1]
# texts = df_hits['Title'] + '. ' + df_hits['Text']
texts = df[1] + '. ' + df[2]
# # # only articles with label==1
# # df_hits = df[df['Label'] == 1]
# # texts = df_hits['Title'] + '. ' + df_hits['Text']
# texts = df[1] + '. ' + df[2]
# list: count articles with company names
count_names = NER.count_companies(texts)
# # list: count articles with company names
# count_names = NER.count_companies(texts)
# # sort list in descending order
# count_names.sort(reverse=True)
# # convert list to array
# names = np.asarray(count_names)
# load pickle object
with open('obj/dict_organizations.pkl', 'rb') as input:
dict = pickle.load(input)
# make list of dict's values
count_companies = list(dict.values())
# sort list in descending order
count_names.sort(reverse=True)
count_companies.sort(reverse=True)
# convert list to array
names = np.asarray(count_names)
#plt.title('Company mentions in News Articles')
names = np.asarray(count_companies)
plt.xlabel('Count of articles that mention a company')
# Number of companies with this number of mentions
plt.ylabel('Number of companies with this number of articles')
num_bins = 50
num_bins = 400
n, bins, patches = plt.hist(names, num_bins,
facecolor='darkred', alpha=0.5)
plt.axis([0, 50, 0, 1000])
plt.axis([1, 14, 0, 14000])
# format axis labels for thousands (e.g. '10,000')
plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
.FuncFormatter(lambda x, p: format(int(x), ',')))
# save to file
plt.savefig('visualization\\NER_{}.eps'
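The FuncFormatter lambda used above, in isolation; it renders tick values with thousands separators:

import matplotlib.pyplot as plt
import matplotlib.ticker

values = [1200, 4800, 9300, 14000]
plt.bar(range(len(values)), values)
# format y-axis ticks as e.g. '14,000'
plt.gca().yaxis.set_major_formatter(matplotlib.ticker
    .FuncFormatter(lambda x, p: format(int(x), ',')))
plt.show()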
@@ -163,7 +178,6 @@ class VisualizerNews:
n, bins, patches = plt.hist(names, num_bins,
facecolor='darkslategrey', alpha=0.5)
# [xmin, xmax, ymin, ymax] of axis
#plt.axis([format(300, ','),format(10000, ','), 0, 500])
plt.axis([300,10000,0,500])
# format axis labels for thousands (e.g. '10,000')
plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
@@ -188,7 +202,7 @@ class VisualizerNews:
#usecols=[3], #column 'Site'
index_col=None,
engine='python',
nrows=10,
#nrows=10,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
# find all different sites, group by 'Site'
@@ -221,44 +235,58 @@ class VisualizerNews:
def plot_hist_most_common_words(n_commons = 10):
print('# preparing histogram of most common words...')
print()
# load data set
filepath = 'data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(filepath,
delimiter='|',
header=None,
usecols=[1,2],
index_col=None,
engine='python',
#nrows=1000,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
# # load data set
# filepath = 'data\\cleaned_data_set_without_header.csv'
# df_dataset = pd.read_csv(filepath,
# delimiter='|',
# header=None,
# usecols=[1,2],
# index_col=None,
# engine='python',
# #nrows=1000,
# quoting=csv.QUOTE_NONNUMERIC,
# quotechar='\'')
corpus = df_dataset[1] + '. ' + df_dataset[2]
# corpus = df_dataset[1] + '. ' + df_dataset[2]
stemming = False
rel_freq = True
# stemming = False
# rel_freq = True
# find most common words in dataset
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
vocab = BagOfWords.make_vocab(extracted_words, stemming)
matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
stemming)
dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
stemming)
# save dict object
with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
# # find most common words in dataset
# extracted_words = BagOfWords.extract_all_words(corpus, stemming)
# vocab = BagOfWords.make_vocab(extracted_words, stemming)
# matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
# stemming)
# dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
# stemming)
# # save dict object
# with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
# pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
plt.xlabel('Most common words in textual corpus')
# load pickle object
with open ('obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
dict = pickle.load(i)
# sort dict by value
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
reverse=True))
# return n highest values as dict (word => count)
n_dict = {}
for i in range(n_commons):
# next highest score
next_highest = o_dict.popitem(last=False)
n_dict[next_highest[0]] = next_highest[1]
#plt.xlabel('Most common words in textual corpus')
plt.ylabel('Relative frequency')
labels = list(dict.keys())
numbers = list(dict.values())
labels = list(n_dict.keys())
numbers = list(n_dict.values())
nbars = n_commons
plt.bar(np.arange(nbars),
height=numbers,
tick_label=labels,
facecolor='darkorange')
facecolor='royalblue')
plt.savefig('visualization\\10_most_common_words_{}.eps'
.format(VisualizerNews.datestring))
plt.savefig('visualization\\10_most_common_words_{}.png'
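The bar-chart call above, reduced to a self-contained example with hypothetical word frequencies:

import numpy as np
import matplotlib.pyplot as plt

# hypothetical top-5 words with relative frequencies
n_dict = {'said': 0.031, 'percent': 0.019, 'year': 0.015,
          'market': 0.012, 'company': 0.011}

plt.bar(np.arange(len(n_dict)),
        height=list(n_dict.values()),
        tick_label=list(n_dict.keys()),
        facecolor='royalblue')
plt.ylabel('Relative frequency')
plt.show()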
@@ -269,10 +297,39 @@ class VisualizerNews:
''' open pkl file of dict, plot histogram of number of different
company names per article.
'''
# list of number of different companies per article (int)
list = []
with open('obj/num_mentions_companies.pkl', 'rb') as input:
list = pickle.load(input)
# sort list in descending order
list.sort(reverse=True)
# convert list to array
names = np.asarray(list)
plt.xlabel('Number of different company names in news article')
plt.ylabel('Number of articles with this number of company names')
num_bins = 100
n, bins, patches = plt.hist(names, num_bins,
facecolor='darkgreen', alpha=0.5)
plt.axis([0, 30, 0, 1500])
# format axis labels for thousands (e.g. '10,000')
plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
.FuncFormatter(lambda x, p: format(int(x), ',')))
# save to file
plt.savefig('visualization\\NER_2_{}.eps'
.format(VisualizerNews.datestring))
plt.savefig('visualization\\NER_2_{}.png'
.format(VisualizerNews.datestring))
plt.show()
if __name__ == '__main__':
VisualizerNews.plot_wordcloud_dataset()
# VisualizerNews.plot_histogram_companies()
# VisualizerNews.plot_hist_num_comp_per_art()
# VisualizerNews.plot_histogram_text_lengths()
# VisualizerNews.plot_pie_chart_of_sites()
VisualizerNews.plot_hist_most_common_words()
# VisualizerNews.plot_hist_most_common_words(10)

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large