Changed NER from NLTK to Stanford NER for better results
This commit is contained in:
parent
6a8386e897
commit
66d366b36e
55
NER.py
55
NER.py
|
@ -2,37 +2,35 @@
|
||||||
Named Entity Recognition (NER)
|
Named Entity Recognition (NER)
|
||||||
==============================
|
==============================
|
||||||
|
|
||||||
NER takes a text as input and searches for names of persons, companies
|
Stanford NER takes a text as input and returns a list of entities
|
||||||
and countries.
|
such as persons, organizations and countries.
|
||||||
'''
|
'''
|
||||||
from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
|
|
||||||
from nltk.tree import Tree
|
|
||||||
|
|
||||||
''' TODO: falsch klassifiert:
|
import os
|
||||||
[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
|
|
||||||
('PERSON', 'Maybank Kim Eng Securities '), ('PERSON', 'Krung Thai Bank '),
|
from nltk.tag import StanfordNERTagger
|
||||||
('PERSON', 'Siam Commercial Bank '), ('PERSON', 'Singapore '),
|
from nltk.tokenize import word_tokenize
|
||||||
('PERSON', 'Keppel Corp '), ('ORGANIZATION', 'Companies ')]
|
|
||||||
'''
|
|
||||||
|
|
||||||
class NER:
|
class NER:
|
||||||
|
#set paths
|
||||||
|
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
|
||||||
|
os.environ['JAVAHOME'] = java_path
|
||||||
|
|
||||||
def get_ne_with_label(text):
|
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
|
||||||
labels = []
|
'NER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
|
||||||
names = []
|
stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
|
||||||
# TODO: letztes Wort wird nicht erkannt
|
'NER\\stanford-ner-2018-02-27\\stanford-ner.jar'
|
||||||
for chunk in ne_chunk(pos_tag(word_tokenize(text + 'lastword.'))):
|
|
||||||
if hasattr(chunk, 'label'):
|
|
||||||
name = ''
|
|
||||||
for c in chunk:
|
|
||||||
name += c[0] + ' '
|
|
||||||
if name not in names:
|
|
||||||
names.append(name.strip())
|
|
||||||
labels.append(chunk.label())
|
|
||||||
#print(chunk.label(), ' '.join(c[0] for c in chunk))
|
|
||||||
return list(zip(labels, names))
|
|
||||||
|
|
||||||
test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
|
def search_organizations(text):
|
||||||
|
# create tagger object
|
||||||
|
st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
|
||||||
|
|
||||||
|
tokenized_text = word_tokenize(text)
|
||||||
|
classified_text = st.tag(tokenized_text)
|
||||||
|
return classified_text
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
|
||||||
\nmostly fell in light volumes on Tuesday as energy shares
|
\nmostly fell in light volumes on Tuesday as energy shares
|
||||||
tracked \nfalls in global oil prices, while weaknesses in banking shares
|
tracked \nfalls in global oil prices, while weaknesses in banking shares
|
||||||
\namid concerns about loans to an ailing steel firm sent the Thai
|
\namid concerns about loans to an ailing steel firm sent the Thai
|
||||||
|
@ -57,4 +55,9 @@ test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
|
||||||
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
|
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
|
||||||
\namid uncertainty over global demand. \nFor Asian Companies click.'''
|
\namid uncertainty over global demand. \nFor Asian Companies click.'''
|
||||||
|
|
||||||
print(NER.get_ne_with_label(test_article))
|
classified_text = search_organizations(text)
|
||||||
|
|
||||||
|
# print organizations
|
||||||
|
for tuple in classified_text:
|
||||||
|
if tuple[1] == "ORGANIZATION":
|
||||||
|
print(tuple)
|
8
SVM.py
8
SVM.py
|
@ -52,10 +52,10 @@ class SVM:
|
||||||
|
|
||||||
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
|
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
|
||||||
|
|
||||||
grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
|
grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75],
|
||||||
'SVC__kernel': ['linear','poly'],
|
'SVC__kernel': ['linear'],
|
||||||
'SVC__gamma': [0.0001, 0.001, 0.01, 0.1],
|
'SVC__gamma': [0.000001, 0.00001, 0.0001, 0.001],
|
||||||
'SVC__C': [0.0001, 0.001, 0.1]},
|
'SVC__C': [0.001, 0.01, 0.1, 1, 10]},
|
||||||
cv=skf,
|
cv=skf,
|
||||||
scoring=make_scorer(f1_score))
|
scoring=make_scorer(f1_score))
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue