improved NER.py

Anne Lorenz 2018-11-07 11:51:54 +01:00
parent 2243a50ed0
commit 61fbdb1059
6 changed files with 150 additions and 4441 deletions

NER.py

@@ -5,10 +5,7 @@ Named Entity Recognition (NER)
 Stanford NER takes a text as input and returns a list of entities
 like persons, organizations and countries, e.g.
 '''
-from collections import OrderedDict
-# toDo: complete list legal entity types
-# 'Amazon' not recognized as organization
 import csv
 import os
@@ -21,26 +18,24 @@ import re
 class NER:
-    # common company abbreviations to be stripped
     company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
-                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
+                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups',
                        'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
-                       's.r.l.', 'Holding', 'Holdings']
+                       's.r.l.', 'Holding', 'Holdings', 'GmbH', 'plc', 'Incs',
+                       'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC',
+                       'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP']
 
-    # some entities and misc that are not companies
-    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
-            'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
-            'NYSE', 'DAX' 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
-            'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
-            'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
-            'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
-            'Autonomous Community of Asturias', 'Fitch Ratings Espana',
-            'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
-            'National Federation of Independent Business', 'Barclays',
-            'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
-    regex = r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*|\
-            .*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*|\
-            .*Republican.*|Goldman( Sachs)?|.*Chamber.*|.*Department.*'
+    # organizations that are no companies
+    regex = r'.*Reuters.*|.*Ministry.*|.*Trump.*|.*Commission.*|.*BRIEF.*|\
+            |.*Department.*|.*House.*|.*Congress.*|.*IMF.*|.*Senate.*|.*OPEC.*|\
+            |.*Republican.|.*Chamber.*|.*Court.*|.*Committee.*|.*Stock.*|\
+            |.*Financial Times.*|.*Bloomberg.*|.*The Economist.*|\
+            |.*Cnn.*|.*EU.*|.*Staff.*|.*Min.*|.*Read.*|.*SRF.*|.*Eikon.*|\
+            |.*NYSE.*|.*DAX.*|.*ECB.*|.*NAFTA.*|.*Treasury.*|.*Federation.*|\
+            |.*Federal.*|.*Muslim.*|.*Fund.*|.*FT House.*|.*Hongkong.*|\
+            |.*Street.*|.*Str.*|.*St.*|.*AFS.*|.*Barcelona.*|.*Fed.*|\
+            |.*U.N.*|.*European.*|.*U.S.*|.*Community.*'
 
     def tag_words(text):
         # path to Stanford NER
@@ -75,10 +70,6 @@ class NER:
         '''param: article text where organizations must be identified
         returns: list of identified organisations as strings
         '''
-        # print(text)
-        # print()
-        # print('# examining article...')
-        # print()
         # set paths
         java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
         os.environ['JAVAHOME'] = java_path
@@ -93,15 +84,13 @@ class NER:
         #print(nes_coherent)
         for tuple in nes_coherent:
             # check if company and not already in list
-            if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
-                and (not re.search(NER.regex, tuple[0])):
+            if (tuple[0] not in seen) and (re.search(NER.regex, tuple[0]) is None):
                 organizations.append(tuple[0])
                 seen.add(tuple[0])
         print('# recognized the following organizations:')
         print()
         print(organizations)
         print()
-        print()
         return organizations
 
     def count_companies(texts):
@@ -147,6 +136,22 @@ class NER:
         # print(max(dict_com, key=dict_com.get))
         return list(dict_com.values())
 
+    def show_most_common_companies(n_commons=50):
+        # load pickle object
+        with open('obj/dict_organizations.pkl', 'rb') as input:
+            dict = pickle.load(input)
+        # sort dict by value
+        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
+                             reverse=True))
+        # return n highest values as dict (word => count)
+        n_dict = {}
+        for i in range(n_commons):
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]
+        print(n_dict)
+
 if __name__ == '__main__':
     print('# starting NER...')
     print()
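
Note on show_most_common_companies above: it sorts the dict by value, wraps it in an OrderedDict, and pops from the front n times to collect the n highest counts. A standalone sketch with made-up counts; the collections.Counter line at the end is an equivalent shorter alternative, not the commit's code:

from collections import Counter, OrderedDict

counts = {'Apple': 42, 'Siemens': 17, 'BP': 8, 'Tesco': 3}  # made-up counts

# the commit's idiom: sort by value descending, then pop from the front
o_dict = OrderedDict(sorted(counts.items(), key=lambda t: t[1], reverse=True))
n_dict = {}
for i in range(2):
    word, count = o_dict.popitem(last=False)  # next highest score
    n_dict[word] = count
print(n_dict)  # {'Apple': 42, 'Siemens': 17}

# equivalent one-liner via collections.Counter
print(dict(Counter(counts).most_common(2)))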
@@ -163,4 +168,5 @@ if __name__ == '__main__':
                         quotechar='\'')
     #print(df)
     texts = df[1] + '. ' + df[2]
     NER.count_companies(texts)
+    # NER.show_most_common_companies()
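
Note on the filtering change in this file: the hand-maintained misc list is gone, and an entity is now kept only if re.search finds no match against the single NER.regex blacklist. A standalone sketch of that filter, using a shortened hypothetical pattern and made-up names rather than the commit's full pattern:

import re

# shortened stand-in for NER.regex (illustration only)
regex = r'.*Reuters.*|.*Ministry.*|.*Congress.*|.*Street.*'

candidates = ['Siemens', 'Thomson Reuters', 'Wall Street Journal', 'Siemens', 'Apple']
organizations = []
seen = set()
for name in candidates:
    # keep a name only if it is new and matches no blacklist alternative
    if (name not in seen) and (re.search(regex, name) is None):
        organizations.append(name)
        seen.add(name)
print(organizations)  # ['Siemens', 'Apple']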

VisualizerNews.py

@@ -7,6 +7,7 @@ Generating a square wordcloud with most common words of input data set.
 from BagOfWords import BagOfWords
 from NER import NER
+from collections import OrderedDict
 import csv
 from datetime import datetime
 from os import path
@@ -41,7 +42,7 @@ class VisualizerNews:
                                  quotechar='\'')
         corpus = df_dataset[1] + '. ' + df_dataset[2]
-        stemming = False
+        stemming = True
         rel_freq = True
 
         # find most common words in dataset
@@ -52,8 +53,8 @@ class VisualizerNews:
         dict = BagOfWords.make_dict_common_words(matrix, 200,
                                                  rel_freq, stemming)
         # save dict object
-        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
-            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+        with open('obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
+            pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
 
         wordcloud = WordCloud(background_color='white',
                               width=2400,
@@ -80,38 +81,52 @@ class VisualizerNews:
         '''
         print('# preparing histogram of company mentions...')
         print()
-        # read data set
-        file = 'data\\cleaned_data_set_without_header.csv'
-        df = pd.read_csv(file,
-                         delimiter='|',
-                         header=None,
-                         index_col=None,
-                         engine='python',
-                         usecols=[1,2],
-                         #nrows=10,
-                         quoting=csv.QUOTE_NONNUMERIC,
-                         quotechar='\'')
-        # # only articles with label==1
-        # df_hits = df[df['Label'] == 1]
-        # texts = df_hits['Title'] + '. ' + df_hits['Text']
-        texts = df[1] + '. ' + df[2]
-        # list: count articles with company names
-        count_names = NER.count_companies(texts)
+        # # read data set
+        # file = 'data\\cleaned_data_set_without_header.csv'
+        # df = pd.read_csv(file,
+        #                  delimiter='|',
+        #                  header=None,
+        #                  index_col=None,
+        #                  engine='python',
+        #                  usecols=[1,2],
+        #                  #nrows=10,
+        #                  quoting=csv.QUOTE_NONNUMERIC,
+        #                  quotechar='\'')
+        # # # only articles with label==1
+        # # df_hits = df[df['Label'] == 1]
+        # # texts = df_hits['Title'] + '. ' + df_hits['Text']
+        # texts = df[1] + '. ' + df[2]
+        # # list: count articles with company names
+        # count_names = NER.count_companies(texts)
+        # # sort list in descending order
+        # count_names.sort(reverse=True)
+        # # convert list to array
+        # names = np.asarray(count_names)
+
+        # load pickle object
+        with open('obj/dict_organizations.pkl', 'rb') as input:
+            dict = pickle.load(input)
+        # make list of dict's values
+        count_companies = list(dict.values())
         # sort list in descending order
-        count_names.sort(reverse=True)
+        count_companies.sort(reverse=True)
         # convert list to array
-        names = np.asarray(count_names)
+        names = np.asarray(count_companies)
+        #plt.title('Company mentions in News Articles')
         plt.xlabel('Count of articles that mention a company')
         # Number of companies with this number of mentions
         plt.ylabel('Number of companies with this number of articles')
-        num_bins = 50
+        num_bins = 400
         n, bins, patches = plt.hist(names, num_bins,
                                     facecolor='darkred', alpha=0.5)
-        plt.axis([0, 50, 0, 1000])
+        plt.axis([1, 14, 0, 14000])
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
+            .FuncFormatter(lambda x, p: format(int(x), ',')))
 
         # save to file
         plt.savefig('visualization\\NER_{}.eps'
@@ -163,7 +178,6 @@ class VisualizerNews:
         n, bins, patches = plt.hist(names, num_bins,
                                     facecolor='darkslategrey', alpha=0.5)
         # [xmin, xmax, ymin, ymax] of axis
-        #plt.axis([format(300, ','),format(10000, ','), 0, 500])
         plt.axis([300, 10000, 0, 500])
         # format axis labels for thousands (e.g. '10,000')
         plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
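
Note on the tick-formatter idiom used in the histograms above: FuncFormatter receives each tick value and returns its label. A minimal self-contained sketch with made-up data, not from the repository:

import matplotlib.pyplot as plt
import matplotlib.ticker

counts = [12000, 8500, 3100]  # made-up values
plt.bar(range(len(counts)), counts)
# render y-axis ticks with thousands separators, e.g. 12000 -> '12,000'
plt.gca().yaxis.set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, pos: format(int(x), ',')))
plt.show()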
@@ -188,7 +202,7 @@ class VisualizerNews:
                                  #usecols=[3], #column 'Site'
                                  index_col=None,
                                  engine='python',
-                                 nrows=10,
+                                 #nrows=10,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
         # find all different sites, group by 'Site'
@@ -221,44 +235,58 @@ class VisualizerNews:
     def plot_hist_most_common_words(n_commons = 10):
         print('# preparing histogram of most common words...')
         print()
-        # load data set
-        filepath = 'data\\cleaned_data_set_without_header.csv'
-        df_dataset = pd.read_csv(filepath,
-                                 delimiter='|',
-                                 header=None,
-                                 usecols=[1,2],
-                                 index_col=None,
-                                 engine='python',
-                                 #nrows=1000,
-                                 quoting=csv.QUOTE_NONNUMERIC,
-                                 quotechar='\'')
-        corpus = df_dataset[1] + '. ' + df_dataset[2]
-        stemming = False
-        rel_freq = True
-        # find most common words in dataset
-        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
-        vocab = BagOfWords.make_vocab(extracted_words, stemming)
-        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
-                                        stemming)
-        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
-                                                 stemming)
-        # save dict object
-        with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
-            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
-        plt.xlabel('Most common words in textual corpus')
+        # # load data set
+        # filepath = 'data\\cleaned_data_set_without_header.csv'
+        # df_dataset = pd.read_csv(filepath,
+        #                          delimiter='|',
+        #                          header=None,
+        #                          usecols=[1,2],
+        #                          index_col=None,
+        #                          engine='python',
+        #                          #nrows=1000,
+        #                          quoting=csv.QUOTE_NONNUMERIC,
+        #                          quotechar='\'')
+        # corpus = df_dataset[1] + '. ' + df_dataset[2]
+        # stemming = False
+        # rel_freq = True
+        # # find most common words in dataset
+        # extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        # vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        # matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+        #                                 stemming)
+        # dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
+        #                                          stemming)
+        # # save dict object
+        # with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
+        #     pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+
+        # load pickle object
+        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
+            dict = pickle.load(i)
+        # sort dict by value
+        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
+                             reverse=True))
+        # return n highest values as dict (word => count)
+        n_dict = {}
+        for i in range(n_commons):
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]
+
+        #plt.xlabel('Most common words in textual corpus')
         plt.ylabel('Relative frequency')
-        labels = list(dict.keys())
-        numbers = list(dict.values())
+        labels = list(n_dict.keys())
+        numbers = list(n_dict.values())
         nbars = n_commons
         plt.bar(np.arange(nbars),
                 height=numbers,
                 tick_label=labels,
-                facecolor='darkorange')
+                facecolor='royalblue')
         plt.savefig('visualization\\10_most_common_words_{}.eps'
                     .format(VisualizerNews.datestring))
         plt.savefig('visualization\\10_most_common_words_{}.png'
@@ -269,10 +297,39 @@ class VisualizerNews:
         ''' open pkl file of dict, plot histogram of number of different
         company names per article.
         '''
+        # list of number of different companies per article (int)
+        list = []
+        with open('obj/num_mentions_companies.pkl', 'rb') as input:
+            list = pickle.load(input)
+        # sort list in descending order
+        list.sort(reverse=True)
+        # convert list to array
+        names = np.asarray(list)
+
+        plt.xlabel('Number of different company names in news article')
+        plt.ylabel('Number of articles with this number of company names')
+        num_bins = 100
+        n, bins, patches = plt.hist(names, num_bins,
+                                    facecolor='darkgreen', alpha=0.5)
+        plt.axis([0, 30, 0, 1500])
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
+            .FuncFormatter(lambda x, p: format(int(x), ',')))
+        # save to file
+        plt.savefig('visualization\\NER_2_{}.eps'
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\NER_2_{}.png'
+                    .format(VisualizerNews.datestring))
+        plt.show()
 
 if __name__ == '__main__':
     VisualizerNews.plot_wordcloud_dataset()
     # VisualizerNews.plot_histogram_companies()
+    # VisualizerNews.plot_hist_num_comp_per_art()
     # VisualizerNews.plot_histogram_text_lengths()
     # VisualizerNews.plot_pie_chart_of_sites()
-    VisualizerNews.plot_hist_most_common_words()
+    # VisualizerNews.plot_hist_most_common_words(10)
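
A recurring pattern in this commit is replacing recomputation with a pickle round trip: compute once, dump the result under obj/, and reload it in later plotting runs. A minimal sketch of that pattern; the file name is hypothetical and an existing obj/ directory is assumed:

import pickle

counts = {'apple': 0.12, 'bank': 0.07}  # made-up relative frequencies

# dump once, after the expensive computation (assumes obj/ exists)
with open('obj/example_dict.pkl', 'wb') as f:
    pickle.dump(counts, f, pickle.HIGHEST_PROTOCOL)

# reload cheaply in every later run
with open('obj/example_dict.pkl', 'rb') as f:
    reloaded = pickle.load(f)
print(reloaded == counts)  # True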

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large.