improved NER.py

parent 2243a50ed0 · commit 61fbdb1059

NER.py: 64 changed lines
@@ -5,10 +5,7 @@ Named Entity Recognition (NER)
 Stanford NER takes a text as input and returns a list of entities
 like persons, organizations and countries, e.g.
 '''
+from collections import OrderedDict
-# toDo: complete list legal entity types
-# 'Amazon' not recognized as organization
-
 import csv
 import os
 
@@ -21,26 +18,24 @@ import re
 
 class NER:
 
+    # common company abbreviations to be stripped
     company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
-                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
+                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups',
                        'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
-                       's.r.l.', 'Holding', 'Holdings']
+                       's.r.l.', 'Holding', 'Holdings', 'GmbH', 'plc', 'Incs',
+                       'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC',
+                       'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP']
 
-    # some entities and misc that are not companies
-    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
-            'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
-            'NYSE', 'DAX' 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
-            'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
-            'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
-            'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
-            'Autonomous Community of Asturias', 'Fitch Ratings Espana',
-            'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
-            'National Federation of Independent Business', 'Barclays',
-            'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
-
-    regex = r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*|\
-             .*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*|\
-             .*Republican.*|Goldman( Sachs)?|.*Chamber.*|.*Department.*'
+    # organizations that are not companies
+    regex = r'.*Reuters.*|.*Ministry.*|.*Trump.*|.*Commission.*|.*BRIEF.*|\
+             |.*Department.*|.*House.*|.*Congress.*|.*IMF.*|.*Senate.*|.*OPEC.*|\
+             |.*Republican.*|.*Chamber.*|.*Court.*|.*Committee.*|.*Stock.*|\
+             |.*Financial Times.*|.*Bloomberg.*|.*The Economist.*|\
+             |.*Cnn.*|.*EU.*|.*Staff.*|.*Min.*|.*Read.*|.*SRF.*|.*Eikon.*|\
+             |.*NYSE.*|.*DAX.*|.*ECB.*|.*NAFTA.*|.*Treasury.*|.*Federation.*|\
+             |.*Federal.*|.*Muslim.*|.*Fund.*|.*FT House.*|.*Hongkong.*|\
+             |.*Street.*|.*Str.*|.*St.*|.*AFS.*|.*Barcelona.*|.*Fed.*|\
+             |.*U.N.*|.*European.*|.*U.S.*|.*Community.*'
 
     def tag_words(text):
         # path to Stanford NER
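One caveat with the committed pattern: inside a raw string a trailing backslash does not join lines cleanly, so the backslash, the newline, and the indentation of the continuation line all stay in the pattern as inert alternation branches. Joining separate fragments sidesteps this. A minimal sketch of the same blacklist filter; the entity names are made up for illustration:

import re

# building the alternation from fragments keeps stray whitespace out of the
# pattern, unlike a backslash-continued raw string
blacklist = '|'.join([
    r'.*Reuters.*',
    r'.*Ministry.*',
    r'.*Stock.*',
])

# hypothetical NER output, for illustration only
sample_entities = ['Siemens', 'Reuters Staff', 'New York Stock Exchange']
companies = [e for e in sample_entities if re.search(blacklist, e) is None]
print(companies)  # ['Siemens']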
@@ -75,10 +70,6 @@ class NER:
         '''param: article text where organizations must be identified
         returns: list of identified organisations as strings
         '''
-        # print(text)
-        # print()
-        # print('# examining article...')
-        # print()
         # set paths
         java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
         os.environ['JAVAHOME'] = java_path
@@ -93,15 +84,13 @@ class NER:
         #print(nes_coherent)
         for tuple in nes_coherent:
             # check if company and not already in list
-            if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
-                and (not re.search(NER.regex, tuple[0])):
+            if (tuple[0] not in seen) and (re.search(NER.regex, tuple[0]) is None):
                 organizations.append(tuple[0])
                 seen.add(tuple[0])
         print('# recognized the following organizations:')
         print()
         print(organizations)
         print()
-        print()
         return organizations
 
     def count_companies(texts):
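The rewritten condition does the whole filter in one pass: a seen set deduplicates names and re.search(...) is None drops blacklist hits, which makes the old NER.misc membership test redundant. A self-contained sketch, assuming nes_coherent holds (name, tag) tuples; the data is made up:

import re

regex = r'.*Reuters.*|.*Ministry.*'
nes_coherent = [('BASF', 'ORGANIZATION'), ('Reuters', 'ORGANIZATION'),
                ('BASF', 'ORGANIZATION')]

organizations, seen = [], set()
for name, tag in nes_coherent:
    # keep each company once; drop names matching the blacklist pattern
    if name not in seen and re.search(regex, name) is None:
        organizations.append(name)
        seen.add(name)
print(organizations)  # ['BASF']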
@@ -147,6 +136,22 @@ class NER:
         # print(max(dict_com, key=dict_com.get))
         return list(dict_com.values())
 
+    def show_most_common_companies(n_commons=50):
+        # load pickle object
+        with open('obj/dict_organizations.pkl', 'rb') as input:
+            dict = pickle.load(input)
+        # sort dict by value
+        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],
+                             reverse=True))
+        # return n highest values as dict (word => count)
+        n_dict = {}
+
+        for i in range(n_commons):
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]
+        print(n_dict)
+
 if __name__ == '__main__':
     print('# starting NER...')
     print()
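In the new show_most_common_companies, popitem(last=False) pops entries from the front of the OrderedDict, so after the descending sort each call yields the current highest count. A sketch on a made-up counts dict, with the shorter collections.Counter route shown for comparison:

from collections import Counter, OrderedDict

counts = {'BASF': 7, 'Siemens': 12, 'Bayer': 3}  # illustrative numbers

o_dict = OrderedDict(sorted(counts.items(), key=lambda t: t[1], reverse=True))
n_dict = {}
for _ in range(2):
    word, count = o_dict.popitem(last=False)  # pops the current maximum
    n_dict[word] = count
print(n_dict)  # {'Siemens': 12, 'BASF': 7}

# equivalent one-liner via Counter
print(dict(Counter(counts).most_common(2)))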
@@ -163,4 +168,5 @@ if __name__ == '__main__':
                      quotechar='\'')
     #print(df)
     texts = df[1] + '. ' + df[2]
     NER.count_companies(texts)
+    # NER.show_most_common_companies()
VisualizerNews.py (second changed file; name inferred from the class it defines)
@@ -7,6 +7,7 @@ Generating a square wordcloud with most common words of input data set.
 from BagOfWords import BagOfWords
 from NER import NER
 
+from collections import OrderedDict
 import csv
 from datetime import datetime
 from os import path
@@ -41,7 +42,7 @@ class VisualizerNews:
                                  quotechar='\'')
 
         corpus = df_dataset[1] + '. ' + df_dataset[2]
-        stemming = False
+        stemming = True
         rel_freq = True
 
         # find most common words in dataset
@@ -52,8 +53,8 @@ class VisualizerNews:
         dict = BagOfWords.make_dict_common_words(matrix, 200,
                                                  rel_freq, stemming)
         # save dict object
-        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
-            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+        with open('obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
+            pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
 
         wordcloud = WordCloud(background_color='white',
                               width=2400,
@@ -80,38 +81,52 @@ class VisualizerNews:
         '''
         print('# preparing histogram of company mentions...')
         print()
-        # read data set
-        file = 'data\\cleaned_data_set_without_header.csv'
-        df = pd.read_csv(file,
-                         delimiter='|',
-                         header=None,
-                         index_col=None,
-                         engine='python',
-                         usecols=[1,2],
-                         #nrows=10,
-                         quoting=csv.QUOTE_NONNUMERIC,
-                         quotechar='\'')
+        # # read data set
+        # file = 'data\\cleaned_data_set_without_header.csv'
+        # df = pd.read_csv(file,
+        #                  delimiter='|',
+        #                  header=None,
+        #                  index_col=None,
+        #                  engine='python',
+        #                  usecols=[1,2],
+        #                  #nrows=10,
+        #                  quoting=csv.QUOTE_NONNUMERIC,
+        #                  quotechar='\'')
 
-        # # only articles with label==1
-        # df_hits = df[df['Label'] == 1]
-        # texts = df_hits['Title'] + '. ' + df_hits['Text']
-        texts = df[1] + '. ' + df[2]
+        # # # only articles with label==1
+        # # df_hits = df[df['Label'] == 1]
+        # # texts = df_hits['Title'] + '. ' + df_hits['Text']
+        # texts = df[1] + '. ' + df[2]
 
-        # list: count articles with company names
-        count_names = NER.count_companies(texts)
+        # # list: count articles with company names
+        # count_names = NER.count_companies(texts)
+
+        # # sort list in descending order
+        # count_names.sort(reverse=True)
+        # # convert list to array
+        # names = np.asarray(count_names)
+
+        # load pickle object
+        with open('obj/dict_organizations.pkl', 'rb') as input:
+            dict = pickle.load(input)
+        # make list of dict's values
+        count_companies = list(dict.values())
         # sort list in descending order
-        count_names.sort(reverse=True)
+        count_companies.sort(reverse=True)
         # convert list to array
-        names = np.asarray(count_names)
-        #plt.title('Company mentions in News Articles')
+        names = np.asarray(count_companies)
         plt.xlabel('Count of articles that mention a company')
         # Number of companies with this number of mentions
         plt.ylabel('Number of companies with this number of articles')
-        num_bins = 50
+        num_bins = 400
         n, bins, patches = plt.hist(names, num_bins,
                                     facecolor='darkred', alpha=0.5)
-        plt.axis([0, 50, 0, 1000])
+        plt.axis([1, 14, 0, 14000])
+
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().yaxis.set_major_formatter(matplotlib.ticker
+            .FuncFormatter(lambda x, p: format(int(x), ',')))
+
         # save to file
         plt.savefig('visualization\\NER_{}.eps'
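The block added above renders the y-axis ticks with thousands separators via a FuncFormatter. A stand-alone sketch of the same recipe, reusing the commit's bin count and axis limits but with random stand-in data:

import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np

# stand-in for the company mention counts loaded from the pickle
values = np.random.default_rng(0).integers(1, 15, size=30000)

plt.hist(values, bins=400, facecolor='darkred', alpha=0.5)
plt.axis([1, 14, 0, 14000])
# render ticks as '2,000', '4,000', ... instead of raw integers
plt.gca().yaxis.set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.show()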
@@ -163,7 +178,6 @@ class VisualizerNews:
         n, bins, patches = plt.hist(names, num_bins,
                                     facecolor='darkslategrey', alpha=0.5)
         # [xmin, xmax, ymin, ymax] of axis
-        #plt.axis([format(300, ','),format(10000, ','), 0, 500])
         plt.axis([300,10000,0,500])
         # format axis labels for thousands (e.g. '10,000')
         plt.gca().xaxis.set_major_formatter(matplotlib.ticker
@@ -188,7 +202,7 @@ class VisualizerNews:
                                  #usecols=[3], #column 'Site'
                                  index_col=None,
                                  engine='python',
-                                 nrows=10,
+                                 #nrows=10,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
         # find all different sites, group by 'Site'
@@ -221,44 +235,58 @@ class VisualizerNews:
     def plot_hist_most_common_words(n_commons = 10):
         print('# preparing histogram of most common words...')
         print()
-        # load data set
-        filepath = 'data\\cleaned_data_set_without_header.csv'
-        df_dataset = pd.read_csv(filepath,
-                                 delimiter='|',
-                                 header=None,
-                                 usecols=[1,2],
-                                 index_col=None,
-                                 engine='python',
-                                 #nrows=1000,
-                                 quoting=csv.QUOTE_NONNUMERIC,
-                                 quotechar='\'')
+        # # load data set
+        # filepath = 'data\\cleaned_data_set_without_header.csv'
+        # df_dataset = pd.read_csv(filepath,
+        #                          delimiter='|',
+        #                          header=None,
+        #                          usecols=[1,2],
+        #                          index_col=None,
+        #                          engine='python',
+        #                          #nrows=1000,
+        #                          quoting=csv.QUOTE_NONNUMERIC,
+        #                          quotechar='\'')
 
-        corpus = df_dataset[1] + '. ' + df_dataset[2]
+        # corpus = df_dataset[1] + '. ' + df_dataset[2]
 
-        stemming = False
-        rel_freq = True
+        # stemming = False
+        # rel_freq = True
 
-        # find most common words in dataset
-        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
-        vocab = BagOfWords.make_vocab(extracted_words, stemming)
-        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
-                                        stemming)
-        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
-                                                 stemming)
-        # save dict object
-        with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
-            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+        # # find most common words in dataset
+        # extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        # vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        # matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+        #                                 stemming)
+        # dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
+        #                                          stemming)
+        # # save dict object
+        # with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
+        #     pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
 
-        plt.xlabel('Most common words in textual corpus')
+        # load pickle object
+        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
+            dict = pickle.load(i)
+        # sort dict by value
+        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],
+                             reverse=True))
+        # return n highest values as dict (word => count)
+        n_dict = {}
+
+        for i in range(n_commons):
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]
+
+        #plt.xlabel('Most common words in textual corpus')
         plt.ylabel('Relative frequency')
 
-        labels = list(dict.keys())
-        numbers = list(dict.values())
+        labels = list(n_dict.keys())
+        numbers = list(n_dict.values())
         nbars = n_commons
         plt.bar(np.arange(nbars),
                 height=numbers,
                 tick_label=labels,
-                facecolor='darkorange')
+                facecolor='royalblue')
         plt.savefig('visualization\\10_most_common_words_{}.eps'
                     .format(VisualizerNews.datestring))
         plt.savefig('visualization\\10_most_common_words_{}.png'
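The method now plots the pickled {word: frequency} dict directly instead of recomputing the bag-of-words matrix. A hedged sketch of that flow, using a plain sorted slice in place of the OrderedDict loop; the pickle path matches the commit, but the file's contents are assumed:

import pickle

import matplotlib.pyplot as plt
import numpy as np

n_commons = 10
with open('obj/dict_200_most_common_words.pkl', 'rb') as f:
    freqs = pickle.load(f)  # assumed shape: {word: relative frequency}

# keep the n entries with the highest frequency
top = sorted(freqs.items(), key=lambda t: t[1], reverse=True)[:n_commons]
labels = [word for word, _ in top]
numbers = [freq for _, freq in top]

plt.bar(np.arange(n_commons), height=numbers, tick_label=labels,
        facecolor='royalblue')
plt.ylabel('Relative frequency')
plt.show()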
@@ -269,10 +297,39 @@ class VisualizerNews:
         ''' open pkl file of dict, plot histogram of number of different
         company names per article.
         '''
+        # list of number of different companies per article (int)
+        list = []
+        with open('obj/num_mentions_companies.pkl', 'rb') as input:
+            list = pickle.load(input)
+
+        # sort list in descending order
+        list.sort(reverse=True)
+
+        # convert list to array
+        names = np.asarray(list)
+
+        plt.xlabel('Number of different company names in news article')
+        plt.ylabel('Number of articles with this number of company names')
+        num_bins = 100
+        n, bins, patches = plt.hist(names, num_bins,
+                                    facecolor='darkgreen', alpha=0.5)
+        plt.axis([0, 30, 0, 1500])
+
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().yaxis.set_major_formatter(matplotlib.ticker
+            .FuncFormatter(lambda x, p: format(int(x), ',')))
+
+        # save to file
+        plt.savefig('visualization\\NER_2_{}.eps'
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\NER_2_{}.png'
+                    .format(VisualizerNews.datestring))
+        plt.show()
+
 if __name__ == '__main__':
     VisualizerNews.plot_wordcloud_dataset()
     # VisualizerNews.plot_histogram_companies()
+    # VisualizerNews.plot_hist_num_comp_per_art()
     # VisualizerNews.plot_histogram_text_lengths()
     # VisualizerNews.plot_pie_chart_of_sites()
-    VisualizerNews.plot_hist_most_common_words()
+    # VisualizerNews.plot_hist_most_common_words(10)
3 binary files changed but not shown.
1 file diff suppressed because it is too large.