changed NER from nltk to Stanford => better results

This commit is contained in:
Anne Lorenz 2018-09-21 11:00:56 +02:00
parent 6a8386e897
commit 66d366b36e
2 changed files with 56 additions and 53 deletions

101
NER.py
View File

@ -2,59 +2,62 @@
Named Entity Recognition (NER) Named Entity Recognition (NER)
============================== ==============================
NER takes a text as input and searches for names of persons, companies Stanford NER takes a text as input and returns a list of entities
and countries. like persons, organizations and countries, e.g.
''' '''
from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
from nltk.tree import Tree
''' TODO: falsch klassifiert: import os
[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
('PERSON', 'Maybank Kim Eng Securities '), ('PERSON', 'Krung Thai Bank '), from nltk.tag import StanfordNERTagger
('PERSON', 'Siam Commercial Bank '), ('PERSON', 'Singapore '), from nltk.tokenize import word_tokenize
('PERSON', 'Keppel Corp '), ('ORGANIZATION', 'Companies ')]
'''
class NER: class NER:
#set paths
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
os.environ['JAVAHOME'] = java_path
def get_ne_with_label(text): stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
labels = [] 'NER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
names = [] stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
# TODO: letztes Wort wird nicht erkannt 'NER\\stanford-ner-2018-02-27\\stanford-ner.jar'
for chunk in ne_chunk(pos_tag(word_tokenize(text + 'lastword.'))):
if hasattr(chunk, 'label'):
name = ''
for c in chunk:
name += c[0] + ' '
if name not in names:
names.append(name.strip())
labels.append(chunk.label())
#print(chunk.label(), ' '.join(c[0] for c in chunk))
return list(zip(labels, names))
test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets def search_organizations(text):
\nmostly fell in light volumes on Tuesday as energy shares # create tagger object
tracked \nfalls in global oil prices, while weaknesses in banking shares st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
the most actively \ntraded by turnover, dropped 2.8 percent to a near
one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
\nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
lower than 130 percent, the \ndesired level we think and hence the need for
more provisioning \nin the following quarters,\" the broker said in a report.
\nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
creditors, dropped 1 percent. The steel firm \nand its three creditors
agreed on Monday to consider options to \nrestructure debt worth over
50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
slides for a third \nsession, Singapore gave up early gains and Indonesia
\nhit a near one-week low, all with trading volumes below \nthe 30-day
average ahead of a public holiday on Thursday. \nAmong top losers in the
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''
print(NER.get_ne_with_label(test_article)) tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)
return classified_text
if __name__ == '__main__':
    # Demo run: tag a sample news article and print every entry that was
    # labelled as an organization.
    # The article text is kept exactly as shown in this view (it mixes real
    # line breaks with explicit "\n" escapes).
    text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
the most actively \ntraded by turnover, dropped 2.8 percent to a near
one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
\nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
lower than 130 percent, the \ndesired level we think and hence the need for
more provisioning \nin the following quarters,\" the broker said in a report.
\nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
creditors, dropped 1 percent. The steel firm \nand its three creditors
agreed on Monday to consider options to \nrestructure debt worth over
50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
slides for a third \nsession, Singapore gave up early gains and Indonesia
\nhit a near one-week low, all with trading volumes below \nthe 30-day
average ahead of a public holiday on Thursday. \nAmong top losers in the
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''

    # NOTE(review): indentation is not visible in this view. If
    # search_organizations is defined inside class NER, this call would need
    # to be qualified (NER.search_organizations) -- confirm against the file.
    classified_text = search_organizations(text)

    # Print organizations: keep only entries whose second element is the
    # "ORGANIZATION" label. The loop variable is deliberately not named
    # `tuple`, so the builtin stays usable.
    for tagged_token in classified_text:
        if tagged_token[1] == "ORGANIZATION":
            print(tagged_token)

8
SVM.py
View File

@ -52,10 +52,10 @@ class SVM:
pipeline = Pipeline([('perc', selector), ('SVC', SVC())]) pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100], grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75],
'SVC__kernel': ['linear','poly'], 'SVC__kernel': ['linear'],
'SVC__gamma': [0.0001, 0.001, 0.01, 0.1], 'SVC__gamma': [0.000001, 0.00001, 0.0001, 0.001],
'SVC__C': [0.0001, 0.001, 0.1]}, 'SVC__C': [0.001, 0.01, 0.1, 1, 10]},
cv=skf, cv=skf,
scoring=make_scorer(f1_score)) scoring=make_scorer(f1_score))