improved NER
This commit is contained in:
parent
66d366b36e
commit
14e5af9d7d
56
NER.py
56
NER.py
|
@ -12,24 +12,41 @@ from nltk.tag import StanfordNERTagger
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
class NER:
|
class NER:
|
||||||
#set paths
|
|
||||||
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
|
|
||||||
os.environ['JAVAHOME'] = java_path
|
|
||||||
|
|
||||||
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
|
def tag_words(text):
|
||||||
'NER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
|
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
|
||||||
stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
|
stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
|
||||||
'NER\\stanford-ner-2018-02-27\\stanford-ner.jar'
|
|
||||||
|
|
||||||
def search_organizations(text):
|
|
||||||
# create tagger object
|
# create tagger object
|
||||||
st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
|
st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
|
||||||
|
|
||||||
tokenized_text = word_tokenize(text)
|
tokenized_text = word_tokenize(text)
|
||||||
classified_text = st.tag(tokenized_text)
|
tagged_words = st.tag(tokenized_text)
|
||||||
return classified_text
|
# returns list of tuples (word, tag)
|
||||||
|
return tagged_words
|
||||||
|
|
||||||
|
def get_coherent_names(tagged_words):
    """Group consecutive named-entity tokens into coherent chunks.

    Args:
        tagged_words: iterable of (token, tag) pairs, where the tag "O"
            marks a token that is not part of any named entity.

    Returns:
        A list of chunks. Each chunk is a non-empty list of (token, tag)
        tuples that were adjacent in the input and share the same
        non-"O" tag. Empty input yields an empty list.
    """
    continuous_chunk = []
    current_chunk = []

    for token, tag in tagged_words:
        # Close the open chunk when the entity ends ("O") or when the
        # entity type changes; otherwise two adjacent entities of
        # different types would be merged and labelled with the tag of
        # the first token only.
        if current_chunk and (tag == "O" or tag != current_chunk[-1][1]):
            continuous_chunk.append(current_chunk)
            current_chunk = []
        if tag != "O":
            current_chunk.append((token, tag))

    # Flush the final chunk if the input ended inside an entity.
    if current_chunk:
        continuous_chunk.append(current_chunk)
    return continuous_chunk
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
#set paths
|
||||||
|
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
|
||||||
|
os.environ['JAVAHOME'] = java_path
|
||||||
|
|
||||||
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
|
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
|
||||||
\nmostly fell in light volumes on Tuesday as energy shares
|
\nmostly fell in light volumes on Tuesday as energy shares
|
||||||
tracked \nfalls in global oil prices, while weaknesses in banking shares
|
tracked \nfalls in global oil prices, while weaknesses in banking shares
|
||||||
|
@ -55,9 +72,14 @@ class NER:
|
||||||
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
|
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
|
||||||
\namid uncertainty over global demand. \nFor Asian Companies click.'''
|
\namid uncertainty over global demand. \nFor Asian Companies click.'''
|
||||||
|
|
||||||
classified_text = search_organizations(text)
|
organizations = []
|
||||||
|
# create list of (word, tag) tuples
|
||||||
# print organizations
|
tagged_words = tag_words(text)
|
||||||
for tuple in classified_text:
|
# put coherent names together
|
||||||
if tuple[1] == "ORGANIZATION":
|
nes = get_coherent_names(tagged_words)
|
||||||
print(tuple)
|
nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
|
||||||
|
#print(nes_coherent)
|
||||||
|
for tuple in nes_coherent:
|
||||||
|
if tuple[1] == 'ORGANIZATION':
|
||||||
|
organizations.append(tuple[0])
|
||||||
|
print(organizations)
|
Loading…
Reference in New Issue