now works with whole dataset
This commit is contained in:
parent
6bbd125c05
commit
759db3c0cf
133
NER.py
133
NER.py
|
@ -5,15 +5,26 @@ Named Entity Recognition (NER)
|
||||||
Stanford NER takes a text as input and returns a list of entities
|
Stanford NER takes a text as input and returns a list of entities
|
||||||
like persons, organizations and countries, e.g.
|
like persons, organizations and countries, e.g.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
from nltk.tag import StanfordNERTagger
|
from nltk.tag import StanfordNERTagger
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
|
from CsvHandler import CsvHandler
|
||||||
|
|
||||||
class NER:
|
class NER:
|
||||||
|
|
||||||
|
# legal entity types
|
||||||
|
company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP',
|
||||||
|
'Plc', 'LLC', 'LBO', 'IPO', 'HQ',
|
||||||
|
'CIO', 'NGO', 'AB']
|
||||||
|
|
||||||
|
# entities that are not companies
|
||||||
|
misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist',
|
||||||
|
'Cnn', 'European Commission', 'EU', 'Staff', 'Min', 'Read',
|
||||||
|
'Thomson Reuters Trust Principles']
|
||||||
|
|
||||||
def tag_words(text):
|
def tag_words(text):
|
||||||
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
|
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
|
||||||
stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
|
stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
|
||||||
|
@ -21,80 +32,96 @@ class NER:
|
||||||
st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
|
st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
|
||||||
|
|
||||||
tokenized_text = word_tokenize(text)
|
tokenized_text = word_tokenize(text)
|
||||||
|
# list of tuples (word, tag)
|
||||||
tagged_words = st.tag(tokenized_text)
|
tagged_words = st.tag(tokenized_text)
|
||||||
# returns list of tuples (word, tag)
|
|
||||||
return tagged_words
|
return tagged_words
|
||||||
|
|
||||||
def get_coherent_names(tagged_words):
|
def get_coherent_tags(tagged_words):
|
||||||
continuous_chunk = []
|
continuous_chunks = []
|
||||||
current_chunk = []
|
current_chunks = []
|
||||||
|
|
||||||
for token, tag in tagged_words:
|
for token, tag in tagged_words:
|
||||||
if tag != "O":
|
if tag == "ORGANIZATION" and token not in NER.company_abbrevs:
|
||||||
current_chunk.append((token, tag))
|
current_chunks.append((token, tag))
|
||||||
else:
|
elif current_chunks:
|
||||||
# if current chunk is not empty
|
# put the final current_chunk into the continuous_chunk
|
||||||
if current_chunk:
|
continuous_chunks.append(current_chunks)
|
||||||
continuous_chunk.append(current_chunk)
|
current_chunks = []
|
||||||
current_chunk = []
|
return continuous_chunks
|
||||||
# put the final current_chunk into the continuous_chunk (if any)
|
|
||||||
if current_chunk:
|
|
||||||
continuous_chunk.append(current_chunk)
|
|
||||||
return continuous_chunk
|
|
||||||
|
|
||||||
def plot_barchart():
|
def plot_barchart(count_names):
|
||||||
organizations = ['org1', 'org2', 'org3', 'org4', 'org5', 'org6']
|
'''pyplot diagram of company names distribution
|
||||||
num_mentions = [5, 2, 33, 12, 6, 10]
|
in input news articles
|
||||||
#n, bins, patches = plt.hist(num_mentions, 6, normed=1, facecolor='green')
|
x-axis: different company names (numbered consecutively)
|
||||||
plt.plot(organizations, num_mentions, 'ro', ms = 10)
|
y-axis: count for each company name
|
||||||
plt.xlabel('companies')
|
'''
|
||||||
plt.ylabel('count')
|
plt.plot(range(len(count_names)), count_names, 'ro', ms = 5)
|
||||||
plt.title('Company mentions in articles')
|
plt.xlabel('Company Names')
|
||||||
|
plt.ylabel('Article Count')
|
||||||
|
plt.title('Counts of News Articles with Company Name')
|
||||||
plt.grid(True)
|
plt.grid(True)
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
def find_companies(text):
|
def find_companies(text):
|
||||||
#set paths
|
'''param: article text where organizations must be identified
|
||||||
|
returns: list of identified organizations as strings
|
||||||
|
'''
|
||||||
|
# set paths
|
||||||
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
|
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
|
||||||
os.environ['JAVAHOME'] = java_path
|
os.environ['JAVAHOME'] = java_path
|
||||||
|
|
||||||
|
seen = set()
|
||||||
organizations = []
|
organizations = []
|
||||||
# create list of (word, tag) tuples
|
# create list of (word, tag) tuples
|
||||||
tagged_words = NER.tag_words(text)
|
tagged_words = NER.tag_words(text)
|
||||||
# put coherent names together
|
# put coherent names together
|
||||||
nes = NER.get_coherent_names(tagged_words)
|
nes = NER.get_coherent_tags(tagged_words)
|
||||||
nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
|
nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
|
||||||
#print(nes_coherent)
|
#print(nes_coherent)
|
||||||
for tuple in nes_coherent:
|
for tuple in nes_coherent:
|
||||||
if tuple[1] == 'ORGANIZATION':
|
# check if company and not already in list
|
||||||
|
if (tuple[0] not in NER.misc) and (tuple[0] not in seen):
|
||||||
organizations.append(tuple[0])
|
organizations.append(tuple[0])
|
||||||
|
seen.add(tuple[0])
|
||||||
return organizations
|
return organizations
|
||||||
|
|
||||||
|
def count_companies(texts):
|
||||||
|
'''param: list of all article texts
|
||||||
|
returns: list of company counts as ints
|
||||||
|
'''
|
||||||
|
# dictionary of companies with their count
|
||||||
|
dict_com = {}
|
||||||
|
for text in texts:
|
||||||
|
# list of found companies in article
|
||||||
|
coms = NER.find_companies(text)
|
||||||
|
for com in coms:
|
||||||
|
if com in dict_com.keys():
|
||||||
|
dict_com[com] += 1
|
||||||
|
else:
|
||||||
|
dict_com[com] = 1
|
||||||
|
# print outlier (value 38)
|
||||||
|
print(max(dict_com, key=dict_com.get))
|
||||||
|
return dict_com.values()
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
#plot_barchart()
|
filepath = 'classification_labelled_corrected.csv'
|
||||||
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
|
df = CsvHandler.read_csv(filepath)
|
||||||
\nmostly fell in light volumes on Tuesday as energy shares
|
|
||||||
tracked \nfalls in global oil prices, while weaknesses in banking shares
|
# articles with label==1
|
||||||
\namid concerns about loans to an ailing steel firm sent the Thai
|
df_hits = df[df['Label'] == 1]
|
||||||
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
|
|
||||||
1 percent after four \nsessions of gains. The index closed at 1,379.32,
|
texts = df_hits['Title'] + ' ' + df_hits['Text']
|
||||||
its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
|
|
||||||
the most actively \ntraded by turnover, dropped 2.8 percent to a near
|
# # read through to check
|
||||||
one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
|
# for text in texts[5:10]:
|
||||||
\nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
|
# print(text)
|
||||||
downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
|
# print()
|
||||||
to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
|
# print(NER.find_companies(text))
|
||||||
lower than 130 percent, the \ndesired level we think and hence the need for
|
# print()
|
||||||
more provisioning \nin the following quarters,\" the broker said in a report.
|
|
||||||
\nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
|
# count names in hit articles
|
||||||
creditors, dropped 1 percent. The steel firm \nand its three creditors
|
count_names = NER.count_companies(texts)
|
||||||
agreed on Monday to consider options to \nrestructure debt worth over
|
|
||||||
50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
|
# plot diagram
|
||||||
slides for a third \nsession, Singapore gave up early gains and Indonesia
|
NER.plot_barchart(count_names)
|
||||||
\nhit a near one-week low, all with trading volumes below \nthe 30-day
|
|
||||||
average ahead of a public holiday on Thursday. \nAmong top losers in the
|
|
||||||
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
|
|
||||||
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
|
|
||||||
\namid uncertainty over global demand. \nFor Asian Companies click.'''
|
|
||||||
print(NER.find_companies(text))
|
|
Loading…
Reference in New Issue