updated labeling
This commit is contained in:
		
							parent
							
								
									ee911377bf
								
							
						
					
					
						commit
						9367457199
					
				
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								obj/dict_articles_organizations_without_banks.pkl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								obj/dict_articles_organizations_without_banks.pkl
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -94,7 +94,9 @@ class MNBInteractive: | |||||||
|         # classes in order used |         # classes in order used | ||||||
|         classes = classifier.classes_ |         classes = classifier.classes_ | ||||||
| 
 | 
 | ||||||
|  |         class_count = classifier.class_count_ | ||||||
|  | 
 | ||||||
|         print('# MNB: ending multinomial naive bayes') |         print('# MNB: ending multinomial naive bayes') | ||||||
| 
 | 
 | ||||||
|         # return classes and vector of class estimates |         # return classes and vector of class estimates | ||||||
|         return classes, class_probs |         return classes, class_count, class_probs | ||||||
							
								
								
									
										62
									
								
								src/NER.py
									
									
									
									
									
								
							
							
						
						
									
										62
									
								
								src/NER.py
									
									
									
									
									
								
							| @ -193,23 +193,51 @@ class NER: | |||||||
|             n_dict[next_highest[0]] = next_highest[1] |             n_dict[next_highest[0]] = next_highest[1] | ||||||
|         print(n_dict) |         print(n_dict) | ||||||
| 
 | 
 | ||||||
|  |     def remove_banks_from_dict(): | ||||||
|  |         ''' removes bank, news agencies and other organizations we do not need | ||||||
|  |         ''' | ||||||
|  |         # load pickle object | ||||||
|  |         with open('../obj/dict_articles_organizations.pkl', 'rb') as input: | ||||||
|  |             dict = pickle.load(input) | ||||||
|  | 
 | ||||||
|  |         black_list = ['Eastern and Southern African Trade and Development Bank', 'PTA Bank', 'Citigroup',  | ||||||
|  |               'Rand Merchant Bank', 'Banca Carige', 'World Bank', 'Bank of America', 'Deutsche Bank', 'HSBC', 'JP Morgan', | ||||||
|  |               'Credit Suisse', 'JPMorgan', 'BNP Paribas', 'Goldman Sachs', 'Commerzbank', 'Deutsche Boerse', 'Handelsblatt', | ||||||
|  |               'Sky News', 'Labour', 'UN', 'Bank of Japan', 'Goldman', 'Goldman Sachs Asset Management', 'New York Times',  | ||||||
|  |               'Bank of Scotland','World Economic Forum','Organisation for Economic Cooperation and Development', | ||||||
|  |               'Russell Investments','Royal London Asset Management','Conservative party','Blom Bank','Banco Santander', | ||||||
|  |               'Guardian Money','Financial Services Agency','Munich Re','Banca Popolare di Vicenza','SoftBank', | ||||||
|  |               'Financial Conduct Authority','Qatar National Bank','Welt am Sonntag','Sueddeutsche Zeitung','Der Spiegel', | ||||||
|  |               'Bank of England', 'Bank of America Merrill Lynch', 'Barclays', 'London Metal Exchange', 'Petroleum Exporting Countries'] | ||||||
|  | 
 | ||||||
|  |         for k, v in dict.items(): | ||||||
|  |             for org in black_list: | ||||||
|  |                 if org in v: | ||||||
|  |                     v.remove(org) | ||||||
|  | 
 | ||||||
|  |         # save new dict | ||||||
|  |         with open('../obj/'+ 'dict_articles_organizations_without_banks' + '.pkl', 'wb') as f: | ||||||
|  |             pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL) | ||||||
|  | 
 | ||||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||||
| 
 | 
 | ||||||
|     print('# starting NER...') |     # print('# starting NER...') | ||||||
|     print() |     # print() | ||||||
|     # read data set |     # # read data set | ||||||
|     file = '..\\data\\cleaned_data_set_without_header.csv' |     # file = '..\\data\\cleaned_data_set_without_header.csv' | ||||||
|     df = pd.read_csv(file, |     # df = pd.read_csv(file, | ||||||
|                      delimiter='|', |                      # delimiter='|', | ||||||
|                      header=None, |                      # header=None, | ||||||
|                      index_col=None, |                      # index_col=None, | ||||||
|                      engine='python', |                      # engine='python', | ||||||
|                      # usecols=[1,2], |                      # # usecols=[1,2], | ||||||
|                      # nrows=100, |                      # # nrows=100, | ||||||
|                      quoting=csv.QUOTE_NONNUMERIC, |                      # quoting=csv.QUOTE_NONNUMERIC, | ||||||
|                      quotechar='\'') |                      # quotechar='\'') | ||||||
|     #print(df) |     # #print(df) | ||||||
|     texts = df[1] + '. ' + df[2] |     # texts = df[1] + '. ' + df[2] | ||||||
|     NER.make_article_orgs_dict(texts) |     # NER.make_article_orgs_dict(texts) | ||||||
|     # NER.show_most_common_companies() |     # NER.show_most_common_companies() | ||||||
|     # print(NER.tag_words('On Monday, Github and Microsoft announced their merger.')) |     # print(NER.tag_words('On Monday, Github and Microsoft announced their merger.')) | ||||||
|  |      | ||||||
|  |     NER.remove_banks_from_dict() | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user