improved NER
This commit is contained in:
parent
66d366b36e
commit
14e5af9d7d
56
NER.py
56
NER.py
|
@ -12,24 +12,41 @@ from nltk.tag import StanfordNERTagger
|
|||
from nltk.tokenize import word_tokenize
|
||||
|
||||
class NER:
|
||||
#set paths
|
||||
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
|
||||
os.environ['JAVAHOME'] = java_path
|
||||
|
||||
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
|
||||
'NER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
|
||||
stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
|
||||
'NER\\stanford-ner-2018-02-27\\stanford-ner.jar'
|
||||
|
||||
def search_organizations(text):
|
||||
def tag_words(text):
    """Tokenize *text* and tag every token with the Stanford NER tagger.

    Args:
        text: The raw text to analyse.

    Returns:
        A list of ``(word, tag)`` tuples as produced by
        ``StanfordNERTagger.tag``.
    """
    # NOTE(review): both paths are absolute, machine-specific Windows paths;
    # the function only works where this Stanford NER installation exists.
    # Presumably JAVAHOME must also be set beforehand (the script's
    # __main__ section does so) -- confirm against the NLTK documentation.
    stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
    stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'

    # create tagger object
    st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')

    tokenized_text = word_tokenize(text)
    # Tag once and return the result: list of tuples (word, tag).
    return st.tag(tokenized_text)
|
||||
|
||||
def get_coherent_names(tagged_words):
    """Group consecutive entity tokens into multi-word names.

    Tokens tagged ``"O"`` are treated as separators and dropped. A run of
    neighbouring tokens is kept together only while the tag stays the same,
    so two directly adjacent entities of different types (for example an
    ORGANIZATION immediately followed by a LOCATION) become two chunks
    instead of one chunk carrying mixed tags.

    Args:
        tagged_words: Iterable of ``(token, tag)`` pairs.

    Returns:
        A list of chunks; each chunk is a non-empty list of ``(token, tag)``
        tuples that all share the same non-``"O"`` tag, in input order.
    """
    # Local imports keep this block self-contained; the file's import
    # section is not fully visible from here.
    from itertools import groupby
    from operator import itemgetter

    return [
        [(token, tag) for token, tag in run]
        for run_tag, run in groupby(tagged_words, key=itemgetter(1))
        if run_tag != "O"
    ]
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
#set paths
|
||||
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
|
||||
os.environ['JAVAHOME'] = java_path
|
||||
|
||||
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
|
||||
\nmostly fell in light volumes on Tuesday as energy shares
|
||||
tracked \nfalls in global oil prices, while weaknesses in banking shares
|
||||
|
@ -55,9 +72,14 @@ class NER:
|
|||
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
|
||||
\namid uncertainty over global demand. \nFor Asian Companies click.'''
|
||||
|
||||
# Tag every token once: list of (word, tag) tuples.
tagged_words = tag_words(text)

# Put coherent names together: each chunk is a list of (token, tag) tuples.
nes = get_coherent_names(tagged_words)

# Join each chunk's tokens with single spaces; the chunk is labelled with
# the tag of its first token.
nes_coherent = [
    (" ".join(token for token, _tag in ne), ne[0][1]) for ne in nes
]

# Keep only the names labelled as organizations and print them.
organizations = [
    name for name, label in nes_coherent if label == 'ORGANIZATION'
]
print(organizations)
|
Loading…
Reference in New Issue