Improve NER: group consecutive tagged tokens into coherent names and print the organizations

2018-09-21 12:10:55 +02:00 · 2018-09-21 12:10:55 +02:00 · 14e5af9d7d
commit 14e5af9d7d
parent 66d366b36e
1 changed files with 39 additions and 17 deletions
--- a/NER.py
+++ b/NER.py
@@ -12,24 +12,41 @@ from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize

 class NER:
-    #set paths
-    java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
-    os.environ['JAVAHOME'] = java_path

-    stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
-    'NER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
-    stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'
-    'NER\\stanford-ner-2018-02-27\\stanford-ner.jar'
-
-    def search_organizations(text):
+    def tag_words(text):
+        stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
+        stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
        # create tagger object
        st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')

        tokenized_text = word_tokenize(text)
-        classified_text = st.tag(tokenized_text)
-        return classified_text
+        tagged_words = st.tag(tokenized_text)
+        # returns list of tuples (word, tag)
+        return tagged_words
+
+    def get_coherent_names(tagged_words):
+        continuous_chunk = []
+        current_chunk = []
+
+        for token, tag in tagged_words:
+            if tag != "O":
+                current_chunk.append((token, tag))
+            else:
+                # if current chunk is not empty
+                if current_chunk: 
+                    continuous_chunk.append(current_chunk)
+                    current_chunk = []
+        # put the final current_chunk into the continuous_chunk (if any)
+        if current_chunk:
+            continuous_chunk.append(current_chunk)
+        return continuous_chunk

    if __name__ == '__main__':
+
+        #set paths
+        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
+        os.environ['JAVAHOME'] = java_path
+
        text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
                    \nmostly fell in light volumes on Tuesday as energy shares
                    tracked \nfalls in global oil prices, while weaknesses in banking shares
@@ -55,9 +72,14 @@ class NER:
                    Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
                    \namid uncertainty over global demand. \nFor Asian Companies click.'''

-        classified_text = search_organizations(text)
-
-        # print organizations
-        for tuple in classified_text:
-            if tuple[1] == "ORGANIZATION":
-                print(tuple)
+        organizations = []
+        # create list of (word, tag) tuples
+        tagged_words = tag_words(text)
+        # put coherent names together
+        nes = get_coherent_names(tagged_words)
+        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
+        #print(nes_coherent)
+        for tuple in nes_coherent:
+            if tuple[1] == 'ORGANIZATION':
+                organizations.append(tuple[0])
+        print(organizations)