updated labeling

2018-12-21 13:50:58 +01:00 · 2018-12-21 13:50:58 +01:00 · 9367457199
commit 9367457199
parent ee911377bf
5 changed files with 534 additions and 986 deletions
--- a/data/interactive_labeling.csv
+++ b/data/interactive_labeling.csv
--- a/obj/dict_articles_organizations_without_banks.pkl
+++ b/obj/dict_articles_organizations_without_banks.pkl
--- a/src/2018-12-01-al-interactive-labeling.ipynb
+++ b/src/2018-12-01-al-interactive-labeling.ipynb
--- a/src/MNBInteractive.py
+++ b/src/MNBInteractive.py
@@ -94,7 +94,9 @@ class MNBInteractive:
        # classes in order used
        classes = classifier.classes_
        class_count = classifier.class_count_
        print('# MNB: ending multinomial naive bayes')
        # return classes and vector of class estimates
-        return classes, class_probs
+        return classes, class_count, class_probs
--- a/src/NER.py
+++ b/src/NER.py
@@ -193,23 +193,51 @@ class NER:
            n_dict[next_highest[0]] = next_highest[1]
        print(n_dict)
    def remove_banks_from_dict():
        ''' Removes banks, news agencies and other organizations we do not need.

        Loads the pickled dictionary from
        '../obj/dict_articles_organizations.pkl', removes every blacklisted
        name from each of its values and pickles the result to
        '../obj/dict_articles_organizations_without_banks.pkl'.
        Both paths are relative to the current working directory.

        Takes no arguments (not even self) and returns nothing; call it on
        the class: NER.remove_banks_from_dict().

        Each value only needs to support `in` and `.remove()` (e.g. a list
        or a set). Matching is by exact element equality, so 'Goldman' does
        not remove 'Goldman Sachs'; both are listed separately.

        NOTE(review): presumably the keys identify articles and the values
        hold the organizations found in them -- confirm against the code
        that writes the input pickle.
        NOTE(review): pickle.load can execute arbitrary code; only run this
        on a pickle file produced by this project.
        '''
        black_list = [
            'Eastern and Southern African Trade and Development Bank',
            'PTA Bank', 'Citigroup', 'Rand Merchant Bank', 'Banca Carige',
            'World Bank', 'Bank of America', 'Deutsche Bank', 'HSBC',
            'JP Morgan', 'Credit Suisse', 'JPMorgan', 'BNP Paribas',
            'Goldman Sachs', 'Commerzbank', 'Deutsche Boerse', 'Handelsblatt',
            'Sky News', 'Labour', 'UN', 'Bank of Japan', 'Goldman',
            'Goldman Sachs Asset Management', 'New York Times',
            'Bank of Scotland', 'World Economic Forum',
            'Organisation for Economic Cooperation and Development',
            'Russell Investments', 'Royal London Asset Management',
            'Conservative party', 'Blom Bank', 'Banco Santander',
            'Guardian Money', 'Financial Services Agency', 'Munich Re',
            'Banca Popolare di Vicenza', 'SoftBank',
            'Financial Conduct Authority', 'Qatar National Bank',
            'Welt am Sonntag', 'Sueddeutsche Zeitung', 'Der Spiegel',
            'Bank of England', 'Bank of America Merrill Lynch', 'Barclays',
            'London Metal Exchange', 'Petroleum Exporting Countries',
        ]

        # load pickle object
        with open('../obj/dict_articles_organizations.pkl', 'rb') as infile:
            orgs_by_key = pickle.load(infile)

        for orgs in orgs_by_key.values():
            for org in black_list:
                # list.remove() drops only the first match, so keep removing
                # until no occurrence is left (a set exits after one pass).
                while org in orgs:
                    orgs.remove(org)

        # save new dict
        with open('../obj/dict_articles_organizations_without_banks.pkl',
                  'wb') as outfile:
            pickle.dump(orgs_by_key, outfile, pickle.HIGHEST_PROTOCOL)
 if __name__ == '__main__':
-    print('# starting NER...')
+    # print('# starting NER...')
-    print()
+    # print()
-    # read data set
+    # # read data set
-    file = '..\\data\\cleaned_data_set_without_header.csv'
+    # file = '..\\data\\cleaned_data_set_without_header.csv'
-    df = pd.read_csv(file,
+    # df = pd.read_csv(file,
-                     delimiter='|',
+                     # delimiter='|',
-                     header=None,
+                     # header=None,
-                     index_col=None,
+                     # index_col=None,
-                     engine='python',
+                     # engine='python',
-                     # usecols=[1,2],
+                     # # usecols=[1,2],
-                     # nrows=100,
+                     # # nrows=100,
-                     quoting=csv.QUOTE_NONNUMERIC,
+                     # quoting=csv.QUOTE_NONNUMERIC,
-                     quotechar='\'')
+                     # quotechar='\'')
-    #print(df)
+    # #print(df)
-    texts = df[1] + '. ' + df[2]
+    # texts = df[1] + '. ' + df[2]
-    NER.make_article_orgs_dict(texts)
+    # NER.make_article_orgs_dict(texts)
    # NER.show_most_common_companies()
-    # print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
+    # print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
    NER.remove_banks_from_dict()