changed NER from nltk to Stanford => better results

2018-09-21 11:00:56 +02:00 · 2018-09-21 11:00:56 +02:00 · 66d366b36e
commit 66d366b36e
parent 6a8386e897
2 changed files with 56 additions and 53 deletions
--- a/NER.py
+++ b/NER.py
@ -2,37 +2,35 @@
 Named Entity Recognition (NER)
 ==============================

-NER takes a text as input and searches for names of persons, companies
-and countries.
+Stanford NER takes a text as input and returns a list of entities
+like persons, organizations and countries.
 '''
-from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
-from nltk.tree import Tree

-''' TODO: falsch klassifiert:
-[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
-('PERSON', 'Maybank Kim Eng Securities '), ('PERSON', 'Krung Thai Bank '),
- ('PERSON', 'Siam Commercial Bank '), ('PERSON', 'Singapore '),
- ('PERSON', 'Keppel Corp '), ('ORGANIZATION', 'Companies ')]
-'''
+import os
+
+from nltk.tag import StanfordNERTagger
+from nltk.tokenize import word_tokenize

 class NER:
+    #set paths
+    java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
+    os.environ['JAVAHOME'] = java_path

-    def get_ne_with_label(text):
-        labels = []
-        names = []
-        # TODO: letztes Wort wird nicht erkannt
-        for chunk in ne_chunk(pos_tag(word_tokenize(text + 'lastword.'))):
-            if hasattr(chunk, 'label'):
-                name = ''
-                for c in chunk:
-                    name += c[0] + ' '
-                if name not in names:
-                    names.append(name.strip())
-                    labels.append(chunk.label())
-                    #print(chunk.label(), ' '.join(c[0] for c in chunk))
-        return list(zip(labels, names))
+    stanford_classifier = ('C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'  # joined with next literal
+        'NER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz')
+    stanford_ner_path = ('C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'  # joined with next literal
+        'NER\\stanford-ner-2018-02-27\\stanford-ner.jar')

-test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
+    def search_organizations(text):
+        # create tagger object
+        st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
+
+        tokenized_text = word_tokenize(text)
+        classified_text = st.tag(tokenized_text)
+        return classified_text
+
+    if __name__ == '__main__':
+        text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
                    \nmostly fell in light volumes on Tuesday as energy shares
                    tracked \nfalls in global oil prices, while weaknesses in banking shares
                    \namid concerns about loans to an ailing steel firm sent the Thai
@ -57,4 +55,9 @@ test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
                    Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
                    \namid uncertainty over global demand. \nFor Asian Companies click.'''

-print(NER.get_ne_with_label(test_article))
+        classified_text = search_organizations(text)
+
+        # print organizations
+        for tuple in classified_text:
+            if tuple[1] == "ORGANIZATION":
+                print(tuple)
--- a/SVM.py
+++ b/SVM.py
@ -52,10 +52,10 @@ class SVM:

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

-        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
-                            'SVC__kernel': ['linear','poly'],
-                            'SVC__gamma': [0.0001, 0.001, 0.01, 0.1],
-                            'SVC__C': [0.0001, 0.001, 0.1]},
+        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75],
+                            'SVC__kernel': ['linear'],
+                            'SVC__gamma': [0.000001, 0.00001, 0.0001, 0.001],
+                            'SVC__C': [0.001, 0.01, 0.1, 1, 10]},
                            cv=skf,
                            scoring=make_scorer(f1_score))