now works with whole dataset

2018-09-26 10:24:52 +02:00 · 2018-09-26 10:24:52 +02:00 · 759db3c0cf
commit 759db3c0cf
parent 6bbd125c05
1 changed files with 80 additions and 53 deletions
--- a/NER.py
+++ b/NER.py
@ -5,15 +5,26 @@ Named Entity Recognition (NER)
 Stanford NER takes a text as input and returns a list of entities
 like persons, organizations and countries, e.g.
 '''
-
 import os

 import matplotlib.pyplot as plt
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize

+from CsvHandler import CsvHandler
+
 class NER:

+    # legal entity types
+    company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP',
+                      'Plc', 'LLC', 'LBO', 'IPO', 'HQ',
+                      'CIO', 'NGO', 'AB']
+
+    # entities that are not companies
+    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 
+            'Cnn', 'European Commission', 'EU', 'Staff', 'Min', 'Read',
+            'Thomson Reuters Trust Principles']
+
    def tag_words(text):
        stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
        stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
@ -21,80 +32,96 @@ class NER:
        st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')

        tokenized_text = word_tokenize(text)
+        # list of tuples (word, tag)
        tagged_words = st.tag(tokenized_text)
-        # returns list of tuples (word, tag)
        return tagged_words

-    def get_coherent_names(tagged_words):
-        continuous_chunk = []
-        current_chunk = []
+    def get_coherent_tags(tagged_words):
+        continuous_chunks = []
+        current_chunks = []

        for token, tag in tagged_words:
-            if tag != "O":
-                current_chunk.append((token, tag))
-            else:
-                # if current chunk is not empty
-                if current_chunk: 
-                    continuous_chunk.append(current_chunk)
-                    current_chunk = []
-        # put the final current_chunk into the continuous_chunk (if any)
-        if current_chunk:
-            continuous_chunk.append(current_chunk)
-        return continuous_chunk
+            if tag == "ORGANIZATION" and token not in NER.company_abbrevs:
+                current_chunks.append((token, tag))
+            elif current_chunks:
+                # chunk ended: store current_chunks (NOTE: a chunk still open at end of input is never stored)
+                continuous_chunks.append(current_chunks)
+                current_chunks = []
+        return continuous_chunks

-    def plot_barchart():
-        organizations = ['org1', 'org2', 'org3', 'org4', 'org5', 'org6']
-        num_mentions = [5, 2, 33, 12, 6, 10]
-        #n, bins, patches = plt.hist(num_mentions, 6, normed=1, facecolor='green')
-        plt.plot(organizations, num_mentions, 'ro', ms = 10)
-        plt.xlabel('companies')
-        plt.ylabel('count')
-        plt.title('Company mentions in articles')
+    def plot_barchart(count_names):
+        '''pyplot diagram of company names distribution
+        in input news articles
+        x-axis:different company names (numbered consecutively)
+        y-axis:counts of company name
+        '''
+        plt.plot(range(len(count_names)), count_names, 'ro', ms = 5)
+        plt.xlabel('Company Names')
+        plt.ylabel('Article Count')
+        plt.title('Counts of News Articles with Company Name')
        plt.grid(True)
        plt.show()

    def find_companies(text):
-        #set paths
+        '''param: article text where organizations must be identified
+        returns: list of identified organizations as strings
+        '''
+        # set paths
        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
        os.environ['JAVAHOME'] = java_path

+        seen = set()
        organizations = []
        # create list of (word, tag) tuples
        tagged_words = NER.tag_words(text)
        # put coherent names together
-        nes = NER.get_coherent_names(tagged_words)
+        nes = NER.get_coherent_tags(tagged_words)
        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
        #print(nes_coherent)
        for tuple in nes_coherent:
-            if tuple[1] == 'ORGANIZATION':
+            # check if company and not already in list
+            if (tuple[0] not in NER.misc) and (tuple[0] not in seen):
                organizations.append(tuple[0])
+                seen.add(tuple[0])
        return organizations

+    def count_companies(texts):
+        '''param: list of all article texts
+        returns: dict values view of per-company article counts (ints)
+        '''
+        # dictionary of companies with their count
+        dict_com = {}
+        for text in texts:
+            # list of found companies in article
+            coms = NER.find_companies(text)
+            for com in coms:
+                if com in dict_com.keys():
+                    dict_com[com] += 1
+                else:
+                    dict_com[com] = 1
+        # print outlier (value 38)
+        print(max(dict_com, key=dict_com.get))
+        return dict_com.values()
+
 if __name__ == '__main__':

-    #plot_barchart()
-    text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
-                    \nmostly fell in light volumes on Tuesday as energy shares
-                    tracked \nfalls in global oil prices, while weaknesses in banking shares
-                    \namid concerns about loans to an ailing steel firm sent the Thai
-                    \nindex to a one-week closing low. \nBangkok's SET index shed nearly
-                    1 percent after four \nsessions of gains. The index closed at 1,379.32,
-                    its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
-                    the most actively \ntraded by turnover, dropped 2.8 percent to a near
-                    one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
-                    \nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
-                    downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
-                    to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
-                    lower than 130 percent, the \ndesired level we think and hence the need for
-                    more provisioning \nin the following quarters,\" the broker said in a report.
-                    \nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
-                    creditors, dropped 1 percent. The steel firm \nand its three creditors
-                    agreed on Monday to consider options to \nrestructure debt worth over
-                    50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
-                    slides for a third \nsession, Singapore gave up early gains and Indonesia
-                    \nhit a near one-week low, all with trading volumes below \nthe 30-day
-                    average ahead of a public holiday on Thursday. \nAmong top losers in the
-                    region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
-                    Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
-                    \namid uncertainty over global demand. \nFor Asian Companies click.'''
-    print(NER.find_companies(text))
+    filepath = 'classification_labelled_corrected.csv'
+    df = CsvHandler.read_csv(filepath)
+
+    # articles with label==1
+    df_hits = df[df['Label'] == 1]
+
+    texts = df_hits['Title'] + ' ' + df_hits['Text']
+
+    # # read through to check the results
+    # for text in texts[5:10]:
+        # print(text)
+        # print()
+        # print(NER.find_companies(text))
+        # print()
+
+    # count names in hit articles
+    count_names = NER.count_companies(texts)
+
+    # plot diagram
+    NER.plot_barchart(count_names)