improved NER.py

2018-11-07 11:51:54 +01:00 · 2018-11-07 11:51:54 +01:00 · 61fbdb1059
commit 61fbdb1059
parent 2243a50ed0
6 changed files with 150 additions and 4441 deletions
--- a/NER.py
+++ b/NER.py
@ -5,10 +5,7 @@ Named Entity Recognition (NER)
 Stanford NER takes a text as input and returns a list of entities
 like persons, organizations and countries, e.g.
 '''
-
-# toDo: complete list legal entity types
-# 'Amazon' not recognized as organization
-
+from collections import OrderedDict
 import csv
 import os

@ -21,26 +18,24 @@ import re

 class NER:

+    # common company abbreviations to be stripped
    company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
-                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
+                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups',
                       'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
-                       's.r.l.', 'Holding', 'Holdings']
+                       's.r.l.', 'Holding', 'Holdings', 'GmbH', 'plc', 'Incs',
+                       'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC',
+                       'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP']

-    # some entities and misc that are not companies
-    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
-            'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
-            'NYSE', 'DAX' 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
-            'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
-            'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
-            'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
-            'Autonomous Community of Asturias', 'Fitch Ratings Espana',
-            'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
-            'National Federation of Independent Business', 'Barclays',
-            'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
-
-    regex = r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*|\
-            .*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*|\
-            .*Republican.*|Goldman( Sachs)?|.*Chamber.*|.*Department.*'
+    # organizations that are not companies (entities matching this are skipped)
+    regex = (r'.*Reuters.*|.*Ministry.*|.*Trump.*|.*Commission.*|.*BRIEF.*'
+             r'|.*Department.*|.*House.*|.*Congress.*|.*IMF.*|.*Senate.*|.*OPEC.*'
+             r'|.*Republican.*|.*Chamber.*|.*Court.*|.*Committee.*|.*Stock.*'
+             r'|.*Financial Times.*|.*Bloomberg.*|.*The Economist.*'
+             r'|.*Cnn.*|.*EU.*|.*Staff.*|.*Min.*|.*Read.*|.*SRF.*|.*Eikon.*'
+             r'|.*NYSE.*|.*DAX.*|.*ECB.*|.*NAFTA.*|.*Treasury.*|.*Federation.*'
+             r'|.*Federal.*|.*Muslim.*|.*Fund.*|.*FT House.*|.*Hongkong.*'
+             r'|.*Street.*|.*Str.*|.*St.*|.*AFS.*|.*Barcelona.*|.*Fed.*'
+             r'|.*U.N.*|.*European.*|.*U.S.*|.*Community.*')

    def tag_words(text):
        # path to Stanford NER
@ -75,10 +70,6 @@ class NER:
        '''param: article text where organizations must be indentified
        returns: list of identified organisations as strings
        '''
-        # print(text)
-        # print()
-        # print('# examining article...')
-        # print()
        # set paths
        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
        os.environ['JAVAHOME'] = java_path
@ -93,15 +84,13 @@ class NER:
        #print(nes_coherent)
        for tuple in nes_coherent:
            # check if company and not already in list
-            if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
-                and (not re.search(NER.regex, tuple[0])):
+            if (tuple[0] not in seen) and (re.search(NER.regex, tuple[0]) is None):
                organizations.append(tuple[0])
                seen.add(tuple[0])
        print('# recognized the following organizations:')
        print()
        print(organizations)
        print()
-        print()
        return organizations

    def count_companies(texts):
@ -147,6 +136,22 @@ class NER:
        # print(max(dict_com, key=dict_com.get))
        return list(dict_com.values())

+    def show_most_common_companies(n_commons=50):
+        '''Print the n_commons entries with the highest values.
+
+        Loads the dict pickled in 'obj/dict_organizations.pkl', orders its
+        items by value (descending) and prints the first n_commons of them
+        as a dict; fewer are printed if the dict holds fewer entries.
+        '''
+        # load pickle object (only unpickle files this project wrote itself)
+        with open('obj/dict_organizations.pkl', 'rb') as f:
+            dict_organizations = pickle.load(f)
+        # sort items by value, highest first (ties keep their stored order)
+        ranked = sorted(dict_organizations.items(), key=lambda t: t[1],
+                        reverse=True)
+        # slicing never raises, unlike popitem() on an exhausted dict
+        print(dict(ranked[:max(n_commons, 0)]))
+
 if __name__ == '__main__':
    print('# starting NER...')
    print()
@ -164,3 +169,4 @@ if __name__ == '__main__':
    #print(df)
    texts = df[1] + '. ' + df[2]
    NER.count_companies(texts)
+    # NER.show_most_common_companies()
--- a/VisualizerNews.py
+++ b/VisualizerNews.py
@ -7,6 +7,7 @@ Generating a square wordcloud with most common words of input data set.
 from BagOfWords import BagOfWords
 from NER import NER

+from collections import OrderedDict
 import csv
 from datetime import datetime
 from os import path
@ -41,7 +42,7 @@ class VisualizerNews:
                                 quotechar='\'')

        corpus = df_dataset[1] + '. ' + df_dataset[2]
-        stemming = False
+        stemming = True
        rel_freq = True

        # find most common words in dataset
@ -52,8 +53,8 @@ class VisualizerNews:
        dict = BagOfWords.make_dict_common_words(matrix, 200,
                                                 rel_freq, stemming)
        # save dict object
-        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
-            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+        with open('obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
+            pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)

        wordcloud = WordCloud(background_color='white',
                              width=2400, 
@ -80,38 +81,52 @@ class VisualizerNews:
        '''
        print('# preparing histogram of company mentions...')
        print()
-        # read data set
-        file = 'data\\cleaned_data_set_without_header.csv'
-        df = pd.read_csv(file,
-                         delimiter='|',
-                         header=None,
-                         index_col=None,
-                         engine='python',
-                         usecols=[1,2],
-                         #nrows=10,
-                         quoting=csv.QUOTE_NONNUMERIC,
-                         quotechar='\'')
+        # # read data set
+        # file = 'data\\cleaned_data_set_without_header.csv'
+        # df = pd.read_csv(file,
+                         # delimiter='|',
+                         # header=None,
+                         # index_col=None,
+                         # engine='python',
+                         # usecols=[1,2],
+                         # #nrows=10,
+                         # quoting=csv.QUOTE_NONNUMERIC,
+                         # quotechar='\'')

-        # # only articles with label==1
-        # df_hits = df[df['Label'] == 1]
-        # texts = df_hits['Title'] + '. ' + df_hits['Text']
-        texts = df[1] + '. ' + df[2]
+        # # # only articles with label==1
+        # # df_hits = df[df['Label'] == 1]
+        # # texts = df_hits['Title'] + '. ' + df_hits['Text']
+        # texts = df[1] + '. ' + df[2]

-        # list: count articles with company names
-        count_names = NER.count_companies(texts)
+        # # list: count articles with company names
+        # count_names = NER.count_companies(texts)

+        # # sort list in descending order
+        # count_names.sort(reverse=True)
+        # # convert list to array
+        # names = np.asarray(count_names)
+
+        # load pickle object
+        with open('obj/dict_organizations.pkl', 'rb') as input:
+            dict = pickle.load(input)
+        # make list of dict's values
+        count_companies = list(dict.values())
        # sort list in descending order
-        count_names.sort(reverse=True)
+        count_companies.sort(reverse=True)
        # convert list to array
-        names = np.asarray(count_names)
-        #plt.title('Company mentions in News Articles')
+        names = np.asarray(count_companies)
+
        plt.xlabel('Count of articles that mention a company')
        # Number of companies with this number of mentions
        plt.ylabel('Number of companies with this number of articles')
-        num_bins = 50
+        num_bins = 400
        n, bins, patches = plt.hist(names, num_bins,
                                    facecolor='darkred', alpha=0.5)
-        plt.axis([0, 50, 0, 1000])
+        plt.axis([1, 14, 0, 14000])
+
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
+            .FuncFormatter(lambda x, p: format(int(x), ',')))

        # save to file
        plt.savefig('visualization\\NER_{}.eps'
@ -163,7 +178,6 @@ class VisualizerNews:
        n, bins, patches = plt.hist(names, num_bins,
                                    facecolor='darkslategrey', alpha=0.5)
        # [xmin, xmax, ymin, ymax] of axis
-        #plt.axis([format(300, ','),format(10000, ','), 0, 500])
        plt.axis([300,10000,0,500])
        # format axis labels for thousends (e.g. '10,000')
        plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
@ -188,7 +202,7 @@ class VisualizerNews:
                                 #usecols=[3], #column 'Site'
                                 index_col=None,
                                 engine='python',
-                                 nrows=10,
+                                 #nrows=10,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')
        # find all different sites, group by 'Site'
@ -221,44 +235,58 @@ class VisualizerNews:
    def plot_hist_most_common_words(n_commons = 10):
        print('# preparing histogram of most common words...')
        print()
-        # load data set
-        filepath = 'data\\cleaned_data_set_without_header.csv'
-        df_dataset = pd.read_csv(filepath,
-                                 delimiter='|',
-                                 header=None,
-                                 usecols=[1,2],
-                                 index_col=None,
-                                 engine='python',
-                                 #nrows=1000,
-                                 quoting=csv.QUOTE_NONNUMERIC,
-                                 quotechar='\'')
+        # # load data set
+        # filepath = 'data\\cleaned_data_set_without_header.csv'
+        # df_dataset = pd.read_csv(filepath,
+                                 # delimiter='|',
+                                 # header=None,
+                                 # usecols=[1,2],
+                                 # index_col=None,
+                                 # engine='python',
+                                 # #nrows=1000,
+                                 # quoting=csv.QUOTE_NONNUMERIC,
+                                 # quotechar='\'')

-        corpus = df_dataset[1] + '. ' + df_dataset[2]
+        # corpus = df_dataset[1] + '. ' + df_dataset[2]

-        stemming = False
-        rel_freq = True
+        # stemming = False
+        # rel_freq = True

-        # find most common words in dataset
-        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
-        vocab = BagOfWords.make_vocab(extracted_words, stemming)
-        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
-                                        stemming)
-        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
-                                                 stemming)
-        # save dict object
-        with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
-            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+        # # find most common words in dataset
+        # extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        # vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        # matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+                                        # stemming)
+        # dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
+                                                 # stemming)
+        # # save dict object
+        # with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
+            # pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)

-        plt.xlabel('Most common words in textual corpus')
+        # load pickle object
+        with open ('obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
+            dict = pickle.load(i)
+        # sort dict by value
+        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
+                             reverse=True))
+        # collect n highest values as dict (word => count)
+        n_dict = {}
+
+        for i in range(n_commons):
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]
+
+        #plt.xlabel('Most common words in textual corpus')
        plt.ylabel('Relative frequency')

-        labels = list(dict.keys())
-        numbers = list(dict.values())
+        labels = list(n_dict.keys())
+        numbers = list(n_dict.values())
        nbars = n_commons
        plt.bar(np.arange(nbars), 
                height=numbers, 
                tick_label=labels, 
-                facecolor='darkorange')
+                facecolor='royalblue')
        plt.savefig('visualization\\10_most_common_words_{}.eps'
                    .format(VisualizerNews.datestring))
        plt.savefig('visualization\\10_most_common_words_{}.png'
@ -269,10 +297,39 @@ class VisualizerNews:
        ''' open pkl file of dict, plot histogram of number of different
        company names per article.
        '''
+        # list of number of different companies per article (int)
+        list = []
+        with open('obj/num_mentions_companies.pkl', 'rb') as input:
+            list = pickle.load(input)
+
+        # sort list in descending order
+        list.sort(reverse=True)
+
+        # convert list to array
+        names = np.asarray(list)
+
+        plt.xlabel('Number of different company names in news article')
+        plt.ylabel('Number of articles with this number of company names')
+        num_bins = 100
+        n, bins, patches = plt.hist(names, num_bins,
+                                    facecolor='darkgreen', alpha=0.5)
+        plt.axis([0, 30, 0, 1500])
+
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
+            .FuncFormatter(lambda x, p: format(int(x), ',')))
+
+        # save to file
+        plt.savefig('visualization\\NER_2_{}.eps'
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\NER_2_{}.png'
+                    .format(VisualizerNews.datestring))
+        plt.show()

 if __name__ == '__main__':
    VisualizerNews.plot_wordcloud_dataset()
    # VisualizerNews.plot_histogram_companies()
+    # VisualizerNews.plot_hist_num_comp_per_art()
    # VisualizerNews.plot_histogram_text_lengths()
    # VisualizerNews.plot_pie_chart_of_sites()
-    VisualizerNews.plot_hist_most_common_words()
+    # VisualizerNews.plot_hist_most_common_words(10)
--- a/obj/dict_organizations.pkl
+++ b/obj/dict_organizations.pkl
--- a/obj/num_mentions_companies.pkl
+++ b/obj/num_mentions_companies.pkl
--- a/visualization/TextLength_2018-11-05.pdf
+++ b/visualization/TextLength_2018-11-05.pdf
--- a/visualization/TextLength_2018-11-05.pgf
+++ b/visualization/TextLength_2018-11-05.pgf