improved NER

2018-09-21 12:10:55 +02:00 · 2018-09-21 12:10:55 +02:00 · 14e5af9d7d
commit 14e5af9d7d
parent 66d366b36e
1 changed file with 39 additions and 17 deletions
--- a/NER.py
+++ b/NER.py
@@ -12,24 +12,41 @@ from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize
 class NER:
    #set paths
    java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
    os.environ['JAVAHOME'] = java_path
-    stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
+    def tag_words(text):
-    'NER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
+        stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
-    stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
+        stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
    'NER\\stanford-ner-2018-02-27\\stanford-ner.jar'
    def search_organizations(text):
        # create tagger object
        st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
        tokenized_text = word_tokenize(text)
-        classified_text = st.tag(tokenized_text)
+        tagged_words = st.tag(tokenized_text)
-        return classified_text
+        # returns list of tuples (word, tag)
        return tagged_words
    def get_coherent_names(tagged_words):
        """Group consecutive named-entity tokens into chunks.

        tagged_words: iterable of (token, tag) pairs; the tag "O" marks a
        token that does not belong to any named entity.

        Returns a list of chunks. Each chunk is a non-empty list of
        (token, tag) tuples that were adjacent in the input and carry the
        same tag. A chunk ends at an "O" token or when the tag changes, so
        two directly adjacent entities of different types are kept apart
        and the tag of a chunk's first tuple is valid for the whole chunk.
        """
        continuous_chunk = []
        current_chunk = []
        for token, tag in tagged_words:
            # current_chunk never holds "O" tokens, so this single test
            # closes the chunk both on an "O" token and on a change of
            # entity type (e.g. ORGANIZATION directly followed by LOCATION)
            if current_chunk and tag != current_chunk[-1][1]:
                continuous_chunk.append(current_chunk)
                current_chunk = []
            if tag != "O":
                current_chunk.append((token, tag))
        # put the final current_chunk into the continuous_chunk (if any)
        if current_chunk:
            continuous_chunk.append(current_chunk)
        return continuous_chunk
    if __name__ == '__main__':
        #set paths
        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
        os.environ['JAVAHOME'] = java_path
        text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
                    \nmostly fell in light volumes on Tuesday as energy shares
                    tracked \nfalls in global oil prices, while weaknesses in banking shares
@@ -55,9 +72,14 @@ class NER:
                    Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
                    \namid uncertainty over global demand. \nFor Asian Companies click.'''
-        classified_text = search_organizations(text)
+        organizations = []
-
+        # create list of (word, tag) tuples
-        # print organizations
+        tagged_words = tag_words(text)
-        for tuple in classified_text:
+        # put coherent names together
-            if tuple[1] == "ORGANIZATION":
+        nes = get_coherent_names(tagged_words)
-                print(tuple)
+        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
        #print(nes_coherent)
        for tuple in nes_coherent:
            if tuple[1] == 'ORGANIZATION':
                organizations.append(tuple[0])
        print(organizations)