updated labeling

2018-12-21 13:50:58 +01:00 · 2018-12-21 13:50:58 +01:00 · 9367457199
commit 9367457199
parent ee911377bf
5 changed files with 534 additions and 986 deletions
--- a/data/interactive_labeling.csv
+++ b/data/interactive_labeling.csv
--- a/obj/dict_articles_organizations_without_banks.pkl
+++ b/obj/dict_articles_organizations_without_banks.pkl
--- a/src/2018-12-01-al-interactive-labeling.ipynb
+++ b/src/2018-12-01-al-interactive-labeling.ipynb
--- a/src/MNBInteractive.py
+++ b/src/MNBInteractive.py
@ -94,7 +94,9 @@ class MNBInteractive:
        # classes in order used
        classes = classifier.classes_

+        class_count = classifier.class_count_
+
        print('# MNB: ending multinomial naive bayes')

        # return classes and vector of class estimates
-        return classes, class_probs
+        return classes, class_count, class_probs
--- a/src/NER.py
+++ b/src/NER.py
@ -193,23 +193,51 @@ class NER:
            n_dict[next_highest[0]] = next_highest[1]
        print(n_dict)

+    @staticmethod
+    def remove_banks_from_dict():
+        '''Remove banks, news agencies and other unwanted organizations.
+
+        Loads the pickled dictionary '../obj/dict_articles_organizations.pkl'
+        (presumably article -> organization names; the values only need to
+        support ``in`` and ``.remove``), deletes every black-listed name from
+        each value and pickles the result to
+        '../obj/dict_articles_organizations_without_banks.pkl'.
+
+        Takes no arguments and returns nothing. Declared as a static method
+        because it uses no instance state; calling it on the class, as the
+        script entry point does, keeps working.
+        '''
+        # load pickle object
+        # NOTE(review): pickle.load can execute arbitrary code contained in
+        # the file; only use it on pickles produced by this project.
+        with open('../obj/dict_articles_organizations.pkl', 'rb') as infile:
+            article_orgs = pickle.load(infile)
+
+        black_list = ['Eastern and Southern African Trade and Development Bank', 'PTA Bank', 'Citigroup',
+              'Rand Merchant Bank', 'Banca Carige', 'World Bank', 'Bank of America', 'Deutsche Bank', 'HSBC', 'JP Morgan',
+              'Credit Suisse', 'JPMorgan', 'BNP Paribas', 'Goldman Sachs', 'Commerzbank', 'Deutsche Boerse', 'Handelsblatt',
+              'Sky News', 'Labour', 'UN', 'Bank of Japan', 'Goldman', 'Goldman Sachs Asset Management', 'New York Times',
+              'Bank of Scotland','World Economic Forum','Organisation for Economic Cooperation and Development',
+              'Russell Investments','Royal London Asset Management','Conservative party','Blom Bank','Banco Santander',
+              'Guardian Money','Financial Services Agency','Munich Re','Banca Popolare di Vicenza','SoftBank',
+              'Financial Conduct Authority','Qatar National Bank','Welt am Sonntag','Sueddeutsche Zeitung','Der Spiegel',
+              'Bank of England', 'Bank of America Merrill Lynch', 'Barclays', 'London Metal Exchange', 'Petroleum Exporting Countries']
+
+        # only the values are modified, so the keys are not needed here
+        for orgs in article_orgs.values():
+            for org in black_list:
+                # remove() drops a single occurrence per call; repeat so a
+                # name that appears several times is removed completely
+                while org in orgs:
+                    orgs.remove(org)
+
+        # save new dict
+        with open('../obj/dict_articles_organizations_without_banks.pkl', 'wb') as f:
+            pickle.dump(article_orgs, f, pickle.HIGHEST_PROTOCOL)
+
+
 if __name__ == '__main__':  # script entry: the NER run below is commented out, only the black-list filtering executes

-    print('# starting NER...')
-    print()
-    # read data set
-    file = '..\\data\\cleaned_data_set_without_header.csv'
-    df = pd.read_csv(file,
-                     delimiter='|',
-                     header=None,
-                     index_col=None,
-                     engine='python',
-                     # usecols=[1,2],
-                     # nrows=100,
-                     quoting=csv.QUOTE_NONNUMERIC,
-                     quotechar='\'')
-    #print(df)
-    texts = df[1] + '. ' + df[2]
-    NER.make_article_orgs_dict(texts)
+    # print('# starting NER...')
+    # print()
+    # # read data set
+    # file = '..\\data\\cleaned_data_set_without_header.csv'
+    # df = pd.read_csv(file,
+                     # delimiter='|',
+                     # header=None,
+                     # index_col=None,
+                     # engine='python',
+                     # # usecols=[1,2],
+                     # # nrows=100,
+                     # quoting=csv.QUOTE_NONNUMERIC,
+                     # quotechar='\'')
+    # #print(df)
+    # texts = df[1] + '. ' + df[2]
+    # NER.make_article_orgs_dict(texts)
    # NER.show_most_common_companies()
    # print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
+    
+    NER.remove_banks_from_dict()  # reads ../obj/dict_articles_organizations.pkl, writes ../obj/dict_articles_organizations_without_banks.pkl