now works with whole dataset

This commit is contained in:
Anne Lorenz 2018-09-26 10:24:52 +02:00
parent 6bbd125c05
commit 759db3c0cf
1 changed files with 80 additions and 53 deletions

133
NER.py
View File

@ -5,15 +5,26 @@ Named Entity Recognition (NER)
Stanford NER takes a text as input and returns a list of entities Stanford NER takes a text as input and returns a list of entities
like persons, organizations and countries, e.g. like persons, organizations and countries, e.g.
''' '''
import os import os
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from nltk.tag import StanfordNERTagger from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from CsvHandler import CsvHandler
class NER: class NER:
# legal entity types
company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP',
'Plc', 'LLC', 'LBO', 'IPO', 'HQ',
'CIO', 'NGO', 'AB']
# entities that are not companies
misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist',
'Cnn', 'European Commission', 'EU', 'Staff', 'Min', 'Read',
'Thomson Reuters Trust Principles']
def tag_words(text): def tag_words(text):
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz' stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar' stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
@ -21,80 +32,96 @@ class NER:
st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8') st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
tokenized_text = word_tokenize(text) tokenized_text = word_tokenize(text)
# list of tuples (word, tag)
tagged_words = st.tag(tokenized_text) tagged_words = st.tag(tokenized_text)
# returns list of tuples (word, tag)
return tagged_words return tagged_words
def get_coherent_names(tagged_words): def get_coherent_tags(tagged_words):
continuous_chunk = [] continuous_chunks = []
current_chunk = [] current_chunks = []
for token, tag in tagged_words: for token, tag in tagged_words:
if tag != "O": if tag == "ORGANIZATION" and token not in NER.company_abbrevs:
current_chunk.append((token, tag)) current_chunks.append((token, tag))
else: elif current_chunks:
# if current chunk is not empty # put the final current_chunk into the continuous_chunk
if current_chunk: continuous_chunks.append(current_chunks)
continuous_chunk.append(current_chunk) current_chunks = []
current_chunk = [] return continuous_chunks
# put the final current_chunk into the continuous_chunk (if any)
if current_chunk:
continuous_chunk.append(current_chunk)
return continuous_chunk
def plot_barchart(): def plot_barchart(count_names):
organizations = ['org1', 'org2', 'org3', 'org4', 'org5', 'org6'] '''pyplot diagram of company names distribution
num_mentions = [5, 2, 33, 12, 6, 10] in input news articles
#n, bins, patches = plt.hist(num_mentions, 6, normed=1, facecolor='green') x-axis:different company names (numbered consecutively)
plt.plot(organizations, num_mentions, 'ro', ms = 10) y-axis:counts of company name
plt.xlabel('companies') '''
plt.ylabel('count') plt.plot(range(len(count_names)), count_names, 'ro', ms = 5)
plt.title('Company mentions in articles') plt.xlabel('Company Names')
plt.ylabel('Article Count')
plt.title('Counts of News Articles with Company Name')
plt.grid(True) plt.grid(True)
plt.show() plt.show()
def find_companies(text): def find_companies(text):
#set paths '''param: article text where organizations must be identified
returns: list of identified organisations as strings
'''
# set paths
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181" java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
os.environ['JAVAHOME'] = java_path os.environ['JAVAHOME'] = java_path
seen = set()
organizations = [] organizations = []
# create list of (word, tag) tuples # create list of (word, tag) tuples
tagged_words = NER.tag_words(text) tagged_words = NER.tag_words(text)
# put coherent names together # put coherent names together
nes = NER.get_coherent_names(tagged_words) nes = NER.get_coherent_tags(tagged_words)
nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes] nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
#print(nes_coherent) #print(nes_coherent)
for tuple in nes_coherent: for tuple in nes_coherent:
if tuple[1] == 'ORGANIZATION': # check if company and not already in list
if (tuple[0] not in NER.misc) and (tuple[0] not in seen):
organizations.append(tuple[0]) organizations.append(tuple[0])
seen.add(tuple[0])
return organizations return organizations
def count_companies(texts):
    '''Count how often each company name is found across the given texts.

    Invoked through the class as NER.count_companies(texts); it takes no
    self argument.

    param: iterable of article texts (one string per article)
    returns: list of company counts as ints, one entry per distinct
             company name, in the order the names were first found
    '''
    # company name -> number of times it was reported by find_companies
    dict_com = {}
    for text in texts:
        # every name returned for this article adds one to its counter
        for com in NER.find_companies(text):
            dict_com[com] = dict_com.get(com, 0) + 1
    # print the most frequent company name (outlier check); skipped when
    # nothing was found, because max() raises ValueError on an empty dict
    if dict_com:
        print(max(dict_com, key=dict_com.get))
    # return a real list, as documented, rather than a dict view
    return list(dict_com.values())
if __name__ == '__main__': if __name__ == '__main__':
#plot_barchart() filepath = 'classification_labelled_corrected.csv'
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets df = CsvHandler.read_csv(filepath)
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares # articles with label==1
\namid concerns about loans to an ailing steel firm sent the Thai df_hits = df[df['Label'] == 1]
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32, texts = df_hits['Title'] + ' ' + df_hits['Text']
its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
the most actively \ntraded by turnover, dropped 2.8 percent to a near # # zum prüfen lesen
one-month low, \nreflecting potential impact of loans to Sahaviriya Steel # for text in texts[5:10]:
\nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities # print(text)
downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure # print()
to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be # print(NER.find_companies(text))
lower than 130 percent, the \ndesired level we think and hence the need for # print()
more provisioning \nin the following quarters,\" the broker said in a report.
\nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its # count names in hit articles
creditors, dropped 1 percent. The steel firm \nand its three creditors count_names = NER.count_companies(texts)
agreed on Monday to consider options to \nrestructure debt worth over
50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their # plot diagram
slides for a third \nsession, Singapore gave up early gains and Indonesia NER.plot_barchart(count_names)
\nhit a near one-week low, all with trading volumes below \nthe 30-day
average ahead of a public holiday on Thursday. \nAmong top losers in the
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''
print(NER.find_companies(text))