Switch NER from NLTK's built-in ne_chunk to the Stanford NER tagger => better results

2018-09-21 11:00:56 +02:00 · 2018-09-21 11:00:56 +02:00 · 66d366b36e
commit 66d366b36e
parent 6a8386e897
2 changed files with 56 additions and 53 deletions
--- a/NER.py
+++ b/NER.py
@ -2,59 +2,62 @@
 Named Entity Recognition (NER)
 ==============================

-NER takes a text as input and searches for names of persons, companies
-and countries.
+Stanford NER takes a text as input and returns a list of entities
+like persons, organizations and countries.
 '''
-from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
-from nltk.tree import Tree

-''' TODO: falsch klassifiert:
-[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
-('PERSON', 'Maybank Kim Eng Securities '), ('PERSON', 'Krung Thai Bank '),
- ('PERSON', 'Siam Commercial Bank '), ('PERSON', 'Singapore '),
- ('PERSON', 'Keppel Corp '), ('ORGANIZATION', 'Companies ')]
-'''
+import os
+
+from nltk.tag import StanfordNERTagger
+from nltk.tokenize import word_tokenize

 class NER:
+    # Export the machine-specific Java install directory as JAVAHOME (runs at class-definition time).
+    java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
+    os.environ['JAVAHOME'] = java_path  # presumably read by NLTK to locate Java for the tagger -- confirm

-    def get_ne_with_label(text):
-        labels = []
-        names = []
-        # TODO: letztes Wort wird nicht erkannt
-        for chunk in ne_chunk(pos_tag(word_tokenize(text + 'lastword.'))):
-            if hasattr(chunk, 'label'):
-                name = ''
-                for c in chunk:
-                    name += c[0] + ' '
-                if name not in names:
-                    names.append(name.strip())
-                    labels.append(chunk.label())
-                    #print(chunk.label(), ' '.join(c[0] for c in chunk))
-        return list(zip(labels, names))
+    stanford_classifier = ('C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'  # parentheses join both literals
+        'NER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz')
+    stanford_ner_path = ('C:\\Users\\anne.lorenz\\Bachelorarbeit\\Stanford'  # parentheses join both literals
+        'NER\\stanford-ner-2018-02-27\\stanford-ner.jar')

-test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
-                \nmostly fell in light volumes on Tuesday as energy shares
-                tracked \nfalls in global oil prices, while weaknesses in banking shares
-                \namid concerns about loans to an ailing steel firm sent the Thai
-                \nindex to a one-week closing low. \nBangkok's SET index shed nearly
-                1 percent after four \nsessions of gains. The index closed at 1,379.32,
-                its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
-                the most actively \ntraded by turnover, dropped 2.8 percent to a near
-                one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
-                \nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
-                downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
-                to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
-                lower than 130 percent, the \ndesired level we think and hence the need for
-                more provisioning \nin the following quarters,\" the broker said in a report.
-                \nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
-                creditors, dropped 1 percent. The steel firm \nand its three creditors
-                agreed on Monday to consider options to \nrestructure debt worth over
-                50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
-                slides for a third \nsession, Singapore gave up early gains and Indonesia
-                \nhit a near one-week low, all with trading volumes below \nthe 30-day
-                average ahead of a public holiday on Thursday. \nAmong top losers in the
-                region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
-                Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
-                \namid uncertainty over global demand. \nFor Asian Companies click.'''
+    def search_organizations(text):
+        """Tokenize *text*, tag it with Stanford NER and return the tagger's (token, label) pairs."""
+        st = StanfordNERTagger(NER.stanford_classifier, NER.stanford_ner_path, encoding='utf-8')

-print(NER.get_ne_with_label(test_article))
+        tokenized_text = word_tokenize(text)
+        classified_text = st.tag(tokenized_text)
+        return classified_text
+
+if __name__ == '__main__':
+    text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
+                    \nmostly fell in light volumes on Tuesday as energy shares
+                    tracked \nfalls in global oil prices, while weaknesses in banking shares
+                    \namid concerns about loans to an ailing steel firm sent the Thai
+                    \nindex to a one-week closing low. \nBangkok's SET index shed nearly
+                    1 percent after four \nsessions of gains. The index closed at 1,379.32,
+                    its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
+                    the most actively \ntraded by turnover, dropped 2.8 percent to a near
+                    one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
+                    \nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
+                    downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
+                    to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
+                    lower than 130 percent, the \ndesired level we think and hence the need for
+                    more provisioning \nin the following quarters,\" the broker said in a report.
+                    \nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
+                    creditors, dropped 1 percent. The steel firm \nand its three creditors
+                    agreed on Monday to consider options to \nrestructure debt worth over
+                    50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
+                    slides for a third \nsession, Singapore gave up early gains and Indonesia
+                    \nhit a near one-week low, all with trading volumes below \nthe 30-day
+                    average ahead of a public holiday on Thursday. \nAmong top losers in the
+                    region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
+                    Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
+                    \namid uncertainty over global demand. \nFor Asian Companies click.'''
+
+    classified_text = NER.search_organizations(text)
+
+    # print only the tokens the tagger labelled as organizations
+    for tagged_token in classified_text:
+        if tagged_token[1] == "ORGANIZATION":
+            print(tagged_token)
--- a/SVM.py
+++ b/SVM.py
@ -52,10 +52,10 @@ class SVM:

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

-        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
-                            'SVC__kernel': ['linear','poly'],
-                            'SVC__gamma': [0.0001, 0.001, 0.01, 0.1],
-                            'SVC__C': [0.0001, 0.001, 0.1]},
+        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75],
+                            'SVC__kernel': ['linear'],
+                            'SVC__gamma': [0.000001, 0.00001, 0.0001, 0.001],
+                            'SVC__C': [0.001, 0.01, 0.1, 1, 10]},
                            cv=skf,
                            scoring=make_scorer(f1_score))