improved NER

This commit is contained in:
Anne Lorenz 2018-09-21 12:10:55 +02:00
parent 66d366b36e
commit 14e5af9d7d
1 changed file with 39 additions and 17 deletions

View File

@ -12,24 +12,41 @@ from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
class NER:
# Set paths.
# Directory of the Java runtime, exported below through the JAVAHOME
# environment variable.
# NOTE(review): hard-coded, machine-specific Windows path; presumably read by
# NLTK to locate Java -- confirm before running on another machine.
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
os.environ['JAVAHOME'] = java_path
# NOTE(review): both names are bound to the same directory here, while
# tag_words assigns its own local values under the same names -- verify
# whether these class-level assignments are still needed.
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
def tag_words(text, stanford_classifier=None, stanford_ner_path=None):
    """Tokenize *text* and tag every token with the Stanford NER tagger.

    Args:
        text: the raw text to analyse.
        stanford_classifier: path to the serialized Stanford classifier
            model; defaults to the local english.all.3class model.
        stanford_ner_path: path to stanford-ner.jar; defaults to the local
            stanford-ner-2018-02-27 installation.

    Returns:
        List of (word, tag) tuples, one per token.
    """
    # Fall back to the original hard-coded locations when no path is given,
    # so existing callers that pass only `text` behave as before.
    if stanford_classifier is None:
        stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
    if stanford_ner_path is None:
        stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
    # create tagger object
    st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
    tokenized_text = word_tokenize(text)
    # returns list of tuples (word, tag)
    return st.tag(tokenized_text)


# Former name of tag_words, kept so that existing callers keep working.
search_organizations = tag_words
def get_coherent_names(tagged_words):
    """Group runs of adjacent named-entity tokens into chunks.

    Args:
        tagged_words: iterable of (token, tag) pairs. A token tagged "O"
            is not part of any entity and ends the chunk in progress.

    Returns:
        List of chunks in input order. Each chunk is a non-empty list of the
        (token, tag) tuples that were adjacent in the input. Adjacent tokens
        with different non-"O" tags stay in the same chunk.
    """
    continuous_chunk = []
    current_chunk = []
    for token, tag in tagged_words:
        if tag != "O":
            current_chunk.append((token, tag))
        elif current_chunk:
            # An "O" token closes the chunk in progress (if any).
            continuous_chunk.append(current_chunk)
            current_chunk = []
    # put the final current_chunk into the continuous_chunk (if any)
    if current_chunk:
        continuous_chunk.append(current_chunk)
    return continuous_chunk
if __name__ == '__main__':
#set paths
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
os.environ['JAVAHOME'] = java_path
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
@ -55,9 +72,14 @@ class NER:
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''
classified_text = search_organizations(text)
# print organizations
for tuple in classified_text:
if tuple[1] == "ORGANIZATION":
organizations = []
# create list of (word, tag) tuples
tagged_words = tag_words(text)
# put coherent names together
nes = get_coherent_names(tagged_words)
nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
for tuple in nes_coherent:
if tuple[1] == 'ORGANIZATION':