improved NER

This commit is contained in:
Anne Lorenz 2018-09-21 12:10:55 +02:00
parent 66d366b36e
commit 14e5af9d7d
1 changed files with 39 additions and 17 deletions

56
NER.py
View File

@ -12,24 +12,41 @@ from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
class NER: class NER:
#set paths
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
os.environ['JAVAHOME'] = java_path
def tag_words(text, stanford_classifier=None, stanford_ner_path=None):
    """Tokenize text and tag every token with the Stanford NER tagger.

    Args:
        text: the raw text to tag.
        stanford_classifier: path to the serialized classifier model
            handed to StanfordNERTagger; when None, the hard-coded
            english.all.3class path below is used.
        stanford_ner_path: path to stanford-ner.jar; when None, the
            hard-coded path below is used.

    Returns:
        The list of (word, tag) tuples produced by StanfordNERTagger.tag().
    """
    # Fall back to the original machine-specific locations so existing
    # callers that pass only `text` keep working unchanged.
    if stanford_classifier is None:
        stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
    if stanford_ner_path is None:
        stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
    # create tagger object
    st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
    tokenized_text = word_tokenize(text)
    # returns list of tuples (word, tag)
    tagged_words = st.tag(tokenized_text)
    return tagged_words
def get_coherent_names(tagged_words):
    """Group adjacent named-entity tokens into chunks.

    Args:
        tagged_words: iterable of (token, tag) tuples; the tag "O" marks
            a token that is not part of any named entity.

    Returns:
        A list of chunks in input order. Each chunk is a non-empty list
        of (token, tag) tuples that were adjacent in the input and all
        carry the same tag. Returns an empty list when there are no
        entity tokens.
    """
    continuous_chunk = []
    current_chunk = []
    for token, tag in tagged_words:
        # A chunk ends at an "O" token or where the entity type changes.
        # Without the second condition, neighbouring entities of different
        # types (e.g. an ORGANIZATION directly followed by a LOCATION)
        # would be merged into one chunk with mixed tags.
        if current_chunk and tag != current_chunk[-1][1]:
            continuous_chunk.append(current_chunk)
            current_chunk = []
        if tag != "O":
            current_chunk.append((token, tag))
    # flush the chunk still open when the input ends on an entity token
    if current_chunk:
        continuous_chunk.append(current_chunk)
    return continuous_chunk
if __name__ == '__main__': if __name__ == '__main__':
#set paths
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
os.environ['JAVAHOME'] = java_path
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares \nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares tracked \nfalls in global oil prices, while weaknesses in banking shares
@ -55,9 +72,14 @@ class NER:
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.''' \namid uncertainty over global demand. \nFor Asian Companies click.'''
classified_text = search_organizations(text) organizations = []
# create list of (word, tag) tuples
# print organizations tagged_words = tag_words(text)
for tuple in classified_text: # put coherent names together
if tuple[1] == "ORGANIZATION": nes = get_coherent_names(tagged_words)
print(tuple) nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
#print(nes_coherent)
for tuple in nes_coherent:
if tuple[1] == 'ORGANIZATION':
organizations.append(tuple[0])
print(organizations)