improved NER.py

Anne Lorenz 2018-11-07 11:51:54 +01:00
parent 2243a50ed0
commit 61fbdb1059
6 changed files with 150 additions and 4441 deletions

NER.py

@@ -5,10 +5,7 @@ Named Entity Recognition (NER)
 Stanford NER takes a text as input and returns a list of entities
 like persons, organizations and countries, e.g.
 '''
-from collections import OrderedDict
-# toDo: complete list legal entity types
-# 'Amazon' not recognized as organization
 import csv
 import os
@@ -21,26 +18,24 @@ import re
 class NER:
-    # common company abbreviations to be stripped
     company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
-                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
+                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups',
                        'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
-                       's.r.l.', 'Holding', 'Holdings']
+                       's.r.l.', 'Holding', 'Holdings', 'GmbH', 'plc', 'Incs',
+                       'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC',
+                       'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP']
 
-    # some entities and misc that are not companies
-    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
-            'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
-            'NYSE', 'DAX' 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
-            'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
-            'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
-            'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
-            'Autonomous Community of Asturias', 'Fitch Ratings Espana',
-            'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
-            'National Federation of Independent Business', 'Barclays',
-            'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
-    regex = r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*|\
-            .*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*|\
-            .*Republican.*|Goldman( Sachs)?|.*Chamber.*|.*Department.*'
+    # organizations that are no companies
+    regex = r'.*Reuters.*|.*Ministry.*|.*Trump.*|.*Commission.*|.*BRIEF.*|\
+            |.*Department.*|.*House.*|.*Congress.*|.*IMF.*|.*Senate.*|.*OPEC.*|\
+            |.*Republican.|.*Chamber.*|.*Court.*|.*Committee.*|.*Stock.*|\
+            |.*Financial Times.*|.*Bloomberg.*|.*The Economist.*|\
+            |.*Cnn.*|.*EU.*|.*Staff.*|.*Min.*|.*Read.*|.*SRF.*|.*Eikon.*|\
+            |.*NYSE.*|.*DAX.*|.*ECB.*|.*NAFTA.*|.*Treasury.*|.*Federation.*|\
+            |.*Federal.*|.*Muslim.*|.*Fund.*|.*FT House.*|.*Hongkong.*|\
+            |.*Street.*|.*Str.*|.*St.*|.*AFS.*|.*Barcelona.*|.*Fed.*|\
+            |.*U.N.*|.*European.*|.*U.S.*|.*Community.*'
 
     def tag_words(text):
         # path to Stanford NER
@@ -75,10 +70,6 @@ class NER:
         '''param: article text where organizations must be identified
         returns: list of identified organisations as strings
         '''
-        # print(text)
-        # print()
-        # print('# examining article...')
-        # print()
         # set paths
         java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
         os.environ['JAVAHOME'] = java_path
@@ -93,15 +84,13 @@ class NER:
         #print(nes_coherent)
         for tuple in nes_coherent:
             # check if company and not already in list
-            if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
-                and (not re.search(NER.regex, tuple[0])):
+            if (tuple[0] not in seen) and (re.search(NER.regex, tuple[0]) is None):
                 organizations.append(tuple[0])
                 seen.add(tuple[0])
         print('# recognized the following organizations:')
         print()
         print(organizations)
         print()
-        print()
         return organizations
 
     def count_companies(texts):
@@ -147,6 +136,22 @@ class NER:
         # print(max(dict_com, key=dict_com.get))
         return list(dict_com.values())
 
+    def show_most_common_companies(n_commons=50):
+        # load pickle object
+        with open('obj/dict_organizations.pkl', 'rb') as input:
+            dict = pickle.load(input)
+        # sort dict by value
+        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
+                             reverse=True))
+        # return n highest values as dict (word => count)
+        n_dict = {}
+        for i in range(n_commons):
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]
+        print(n_dict)
+
 if __name__ == '__main__':
     print('# starting NER...')
     print()
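
Note on show_most_common_companies above: it sorts the dict by value, wraps it in an OrderedDict, and pops from the front n times to collect the n highest counts. A standalone sketch with made-up counts; the collections.Counter line at the end is an equivalent shorter alternative, not the commit's code:

from collections import Counter, OrderedDict

counts = {'Apple': 42, 'Siemens': 17, 'BP': 8, 'Tesco': 3}  # made-up counts

# the commit's idiom: sort by value descending, then pop from the front
o_dict = OrderedDict(sorted(counts.items(), key=lambda t: t[1], reverse=True))
n_dict = {}
for i in range(2):
    word, count = o_dict.popitem(last=False)  # next highest score
    n_dict[word] = count
print(n_dict)  # {'Apple': 42, 'Siemens': 17}

# equivalent one-liner via collections.Counter
print(dict(Counter(counts).most_common(2)))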
@@ -163,4 +168,5 @@ if __name__ == '__main__':
                         quotechar='\'')
     #print(df)
     texts = df[1] + '. ' + df[2]
     NER.count_companies(texts)
+    # NER.show_most_common_companies()
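
Note on the filtering change in this file: the hand-maintained misc list is gone, and an entity is now kept only if re.search finds no match against the single NER.regex blacklist. A standalone sketch of that filter, using a shortened hypothetical pattern and made-up names rather than the commit's full pattern:

import re

# shortened stand-in for NER.regex (illustration only)
regex = r'.*Reuters.*|.*Ministry.*|.*Congress.*|.*Street.*'

candidates = ['Siemens', 'Thomson Reuters', 'Wall Street Journal', 'Siemens', 'Apple']
organizations = []
seen = set()
for name in candidates:
    # keep a name only if it is new and matches no blacklist alternative
    if (name not in seen) and (re.search(regex, name) is None):
        organizations.append(name)
        seen.add(name)
print(organizations)  # ['Siemens', 'Apple']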

VisualizerNews.py

@@ -7,6 +7,7 @@ Generating a square wordcloud with most common words of input data set.
 from BagOfWords import BagOfWords
 from NER import NER
+from collections import OrderedDict
 import csv
 from datetime import datetime
 from os import path
@@ -41,7 +42,7 @@ class VisualizerNews:
                                  quotechar='\'')
         corpus = df_dataset[1] + '. ' + df_dataset[2]
-        stemming = False
+        stemming = True
         rel_freq = True
 
         # find most common words in dataset
@@ -52,8 +53,8 @@ class VisualizerNews:
         dict = BagOfWords.make_dict_common_words(matrix, 200,
                                                  rel_freq, stemming)
         # save dict object
-        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
-            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+        with open('obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
+            pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
 
         wordcloud = WordCloud(background_color='white',
                               width=2400,
@@ -80,38 +81,52 @@ class VisualizerNews:
         '''
         print('# preparing histogram of company mentions...')
         print()
-        # read data set
-        file = 'data\\cleaned_data_set_without_header.csv'
-        df = pd.read_csv(file,
-                         delimiter='|',
-                         header=None,
-                         index_col=None,
-                         engine='python',
-                         usecols=[1,2],
-                         #nrows=10,
-                         quoting=csv.QUOTE_NONNUMERIC,
-                         quotechar='\'')
-        # # only articles with label==1
-        # df_hits = df[df['Label'] == 1]
-        # texts = df_hits['Title'] + '. ' + df_hits['Text']
-        texts = df[1] + '. ' + df[2]
-        # list: count articles with company names
-        count_names = NER.count_companies(texts)
+        # # read data set
+        # file = 'data\\cleaned_data_set_without_header.csv'
+        # df = pd.read_csv(file,
+        #                  delimiter='|',
+        #                  header=None,
+        #                  index_col=None,
+        #                  engine='python',
+        #                  usecols=[1,2],
+        #                  #nrows=10,
+        #                  quoting=csv.QUOTE_NONNUMERIC,
+        #                  quotechar='\'')
+        # # # only articles with label==1
+        # # df_hits = df[df['Label'] == 1]
+        # # texts = df_hits['Title'] + '. ' + df_hits['Text']
+        # texts = df[1] + '. ' + df[2]
+        # # list: count articles with company names
+        # count_names = NER.count_companies(texts)
+        # # sort list in descending order
+        # count_names.sort(reverse=True)
+        # # convert list to array
+        # names = np.asarray(count_names)
+
+        # load pickle object
+        with open('obj/dict_organizations.pkl', 'rb') as input:
+            dict = pickle.load(input)
+        # make list of dict's values
+        count_companies = list(dict.values())
         # sort list in descending order
-        count_names.sort(reverse=True)
+        count_companies.sort(reverse=True)
         # convert list to array
-        names = np.asarray(count_names)
+        names = np.asarray(count_companies)
+        #plt.title('Company mentions in News Articles')
         plt.xlabel('Count of articles that mention a company')
         # Number of companies with this number of mentions
         plt.ylabel('Number of companies with this number of articles')
-        num_bins = 50
+        num_bins = 400
         n, bins, patches = plt.hist(names, num_bins,
                                     facecolor='darkred', alpha=0.5)
-        plt.axis([0, 50, 0, 1000])
+        plt.axis([1, 14, 0, 14000])
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
+            .FuncFormatter(lambda x, p: format(int(x), ',')))
 
         # save to file
         plt.savefig('visualization\\NER_{}.eps'
@@ -163,7 +178,6 @@ class VisualizerNews:
         n, bins, patches = plt.hist(names, num_bins,
                                     facecolor='darkslategrey', alpha=0.5)
         # [xmin, xmax, ymin, ymax] of axis
-        #plt.axis([format(300, ','),format(10000, ','), 0, 500])
         plt.axis([300, 10000, 0, 500])
         # format axis labels for thousands (e.g. '10,000')
         plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
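
Note on the tick-formatter idiom used in the histograms above: FuncFormatter receives each tick value and returns its label. A minimal self-contained sketch with made-up data, not from the repository:

import matplotlib.pyplot as plt
import matplotlib.ticker

counts = [12000, 8500, 3100]  # made-up values
plt.bar(range(len(counts)), counts)
# render y-axis ticks with thousands separators, e.g. 12000 -> '12,000'
plt.gca().yaxis.set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, pos: format(int(x), ',')))
plt.show()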
@@ -188,7 +202,7 @@ class VisualizerNews:
                                  #usecols=[3], #column 'Site'
                                  index_col=None,
                                  engine='python',
-                                 nrows=10,
+                                 #nrows=10,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
         # find all different sites, group by 'Site'
@@ -221,44 +235,58 @@ class VisualizerNews:
     def plot_hist_most_common_words(n_commons = 10):
         print('# preparing histogram of most common words...')
         print()
-        # load data set
-        filepath = 'data\\cleaned_data_set_without_header.csv'
-        df_dataset = pd.read_csv(filepath,
-                                 delimiter='|',
-                                 header=None,
-                                 usecols=[1,2],
-                                 index_col=None,
-                                 engine='python',
-                                 #nrows=1000,
-                                 quoting=csv.QUOTE_NONNUMERIC,
-                                 quotechar='\'')
-        corpus = df_dataset[1] + '. ' + df_dataset[2]
-        stemming = False
-        rel_freq = True
-        # find most common words in dataset
-        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
-        vocab = BagOfWords.make_vocab(extracted_words, stemming)
-        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
-                                        stemming)
-        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
-                                                 stemming)
-        # save dict object
-        with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
-            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
-        plt.xlabel('Most common words in textual corpus')
+        # # load data set
+        # filepath = 'data\\cleaned_data_set_without_header.csv'
+        # df_dataset = pd.read_csv(filepath,
+        #                          delimiter='|',
+        #                          header=None,
+        #                          usecols=[1,2],
+        #                          index_col=None,
+        #                          engine='python',
+        #                          #nrows=1000,
+        #                          quoting=csv.QUOTE_NONNUMERIC,
+        #                          quotechar='\'')
+        # corpus = df_dataset[1] + '. ' + df_dataset[2]
+        # stemming = False
+        # rel_freq = True
+        # # find most common words in dataset
+        # extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        # vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        # matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+        #                                 stemming)
+        # dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
+        #                                          stemming)
+        # # save dict object
+        # with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
+        #     pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+
+        # load pickle object
+        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
+            dict = pickle.load(i)
+        # sort dict by value
+        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
+                             reverse=True))
+        # return n highest values as dict (word => count)
+        n_dict = {}
+        for i in range(n_commons):
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]
+
+        #plt.xlabel('Most common words in textual corpus')
         plt.ylabel('Relative frequency')
-        labels = list(dict.keys())
-        numbers = list(dict.values())
+        labels = list(n_dict.keys())
+        numbers = list(n_dict.values())
         nbars = n_commons
         plt.bar(np.arange(nbars),
                 height=numbers,
                 tick_label=labels,
-                facecolor='darkorange')
+                facecolor='royalblue')
         plt.savefig('visualization\\10_most_common_words_{}.eps'
                     .format(VisualizerNews.datestring))
         plt.savefig('visualization\\10_most_common_words_{}.png'
@@ -269,10 +297,39 @@ class VisualizerNews:
         ''' open pkl file of dict, plot histogram of number of different
         company names per article.
         '''
+        # list of number of different companies per article (int)
+        list = []
+        with open('obj/num_mentions_companies.pkl', 'rb') as input:
+            list = pickle.load(input)
+        # sort list in descending order
+        list.sort(reverse=True)
+        # convert list to array
+        names = np.asarray(list)
+
+        plt.xlabel('Number of different company names in news article')
+        plt.ylabel('Number of articles with this number of company names')
+        num_bins = 100
+        n, bins, patches = plt.hist(names, num_bins,
+                                    facecolor='darkgreen', alpha=0.5)
+        plt.axis([0, 30, 0, 1500])
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
+            .FuncFormatter(lambda x, p: format(int(x), ',')))
+        # save to file
+        plt.savefig('visualization\\NER_2_{}.eps'
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\NER_2_{}.png'
+                    .format(VisualizerNews.datestring))
+        plt.show()
 
 if __name__ == '__main__':
     VisualizerNews.plot_wordcloud_dataset()
     # VisualizerNews.plot_histogram_companies()
+    # VisualizerNews.plot_hist_num_comp_per_art()
     # VisualizerNews.plot_histogram_text_lengths()
     # VisualizerNews.plot_pie_chart_of_sites()
-    VisualizerNews.plot_hist_most_common_words()
+    # VisualizerNews.plot_hist_most_common_words(10)
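
A recurring pattern in this commit is replacing recomputation with a pickle round trip: compute once, dump the result under obj/, and reload it in later plotting runs. A minimal sketch of that pattern; the file name is hypothetical and an existing obj/ directory is assumed:

import pickle

counts = {'apple': 0.12, 'bank': 0.07}  # made-up relative frequencies

# dump once, after the expensive computation (assumes obj/ exists)
with open('obj/example_dict.pkl', 'wb') as f:
    pickle.dump(counts, f, pickle.HIGHEST_PROTOCOL)

# reload cheaply in every later run
with open('obj/example_dict.pkl', 'rb') as f:
    reloaded = pickle.load(f)
print(reloaded == counts)  # True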

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large.