improved NER.py
commit 61fbdb1059
parent 2243a50ed0
NER.py (64 changed lines)
@@ -5,10 +5,7 @@ Named Entity Recognition (NER)
 Stanford NER takes a text as input and returns a list of entities
 like persons, organizations and countries, e.g.
 '''
 
-# toDo: complete list legal entity types
-# 'Amazon' not recognized as organization
-
 from collections import OrderedDict
 import csv
 import os
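For context, a minimal sketch of how Stanford NER is typically driven from NLTK. The classifier and jar paths here are placeholders, not the ones NER.py actually configures further down:

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# placeholder paths; NER.py sets its own Stanford NER locations
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner.jar',
                       encoding='utf-8')
tokens = word_tokenize('Siemens AG is based in Munich, Germany.')
print(st.tag(tokens))
# e.g. [('Siemens', 'ORGANIZATION'), ('AG', 'ORGANIZATION'), ..., ('Munich', 'LOCATION'), ...]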
@@ -21,26 +18,24 @@ import re
 
 class NER:
 
     # common company abbreviations to be stripped
     company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
-                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
+                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups',
                        'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
-                       's.r.l.', 'Holding', 'Holdings']
+                       's.r.l.', 'Holding', 'Holdings', 'GmbH', 'plc', 'Incs',
+                       'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC',
+                       'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP']
 
-    # some entities and misc that are not companies
-    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
-            'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
-            'NYSE', 'DAX' 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
-            'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
-            'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
-            'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
-            'Autonomous Community of Asturias', 'Fitch Ratings Espana',
-            'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
-            'National Federation of Independent Business', 'Barclays',
-            'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
-
-    regex = r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*|\
-            .*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*|\
-            .*Republican.*|Goldman( Sachs)?|.*Chamber.*|.*Department.*'
+    # organizations that are not companies
+    regex = r'.*Reuters.*|.*Ministry.*|.*Trump.*|.*Commission.*|.*BRIEF.*|\
+            |.*Department.*|.*House.*|.*Congress.*|.*IMF.*|.*Senate.*|.*OPEC.*|\
+            |.*Republican.|.*Chamber.*|.*Court.*|.*Committee.*|.*Stock.*|\
+            |.*Financial Times.*|.*Bloomberg.*|.*The Economist.*|\
+            |.*Cnn.*|.*EU.*|.*Staff.*|.*Min.*|.*Read.*|.*SRF.*|.*Eikon.*|\
+            |.*NYSE.*|.*DAX.*|.*ECB.*|.*NAFTA.*|.*Treasury.*|.*Federation.*|\
+            |.*Federal.*|.*Muslim.*|.*Fund.*|.*FT House.*|.*Hongkong.*|\
+            |.*Street.*|.*Str.*|.*St.*|.*AFS.*|.*Barcelona.*|.*Fed.*|\
+            |.*U.N.*|.*European.*|.*U.S.*|.*Community.*'
 
     def tag_words(text):
        # path to Stanford NER
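The class-level regex above filters tagged organizations that are not companies. A toy illustration of the mechanism, with a shortened stand-in pattern (the real NER.regex lists far more alternatives):

import re

non_company = r'.*Reuters.*|.*Ministry.*|.*Congress.*|.*Commission.*'  # shortened stand-in

for name in ['Siemens', 'Reuters', 'EU Commission']:
    keep = re.search(non_company, name) is None
    print(name, '->', 'company' if keep else 'filtered out')
# Siemens -> company
# Reuters -> filtered out
# EU Commission -> filtered out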
@@ -75,10 +70,6 @@ class NER:
         '''param: article text where organizations must be identified
         returns: list of identified organizations as strings
         '''
-        # print(text)
-        # print()
-        # print('# examining article...')
-        # print()
         # set paths
         java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
         os.environ['JAVAHOME'] = java_path
@@ -93,15 +84,13 @@ class NER:
         #print(nes_coherent)
         for tuple in nes_coherent:
             # check if company and not already in list
-            if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
-                and (not re.search(NER.regex, tuple[0])):
+            if (tuple[0] not in seen) and (re.search(NER.regex, tuple[0]) is None):
                 organizations.append(tuple[0])
                 seen.add(tuple[0])
         print('# recognized the following organizations:')
         print()
         print(organizations)
         print()
-        print()
         return organizations
 
     def count_companies(texts):
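The new condition drops the separate NER.misc lookup: deduplication via a seen set plus a single regex check now does all the filtering. A self-contained sketch of that loop with made-up tagger output:

import re

non_company = r'.*Reuters.*|.*Ministry.*'                     # shortened stand-in for NER.regex
nes_coherent = [('Apple', 'ORGANIZATION'), ('Apple', 'ORGANIZATION'),
                ('Reuters', 'ORGANIZATION')]                  # toy tagger output

organizations, seen = [], set()
for entity, tag in nes_coherent:
    # keep each company name once; drop anything the regex marks as non-company
    if entity not in seen and re.search(non_company, entity) is None:
        organizations.append(entity)
        seen.add(entity)
print(organizations)  # ['Apple']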
@@ -147,6 +136,22 @@ class NER:
         # print(max(dict_com, key=dict_com.get))
         return list(dict_com.values())
 
+    def show_most_common_companies(n_commons=50):
+        # load pickle object
+        with open('obj/dict_organizations.pkl', 'rb') as input:
+            dict = pickle.load(input)
+        # sort dict by value
+        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
+                             reverse=True))
+        # return n highest values as dict (word => count)
+        n_dict = {}
+
+        for i in range(n_commons):
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]
+        print(n_dict)
+
 if __name__ == '__main__':
     print('# starting NER...')
     print()
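show_most_common_companies relies on popitem(last=False) popping in insertion (FIFO) order, so on a descending-sorted OrderedDict it always yields the highest remaining count. A toy run of the same steps:

from collections import OrderedDict

counts = {'Apple': 120, 'BP': 40, 'Shell': 75}    # toy stand-in for the pickled dict
o_dict = OrderedDict(sorted(counts.items(), key=lambda t: t[1], reverse=True))

n_dict = {}
for _ in range(2):
    word, count = o_dict.popitem(last=False)      # FIFO pop -> highest remaining count
    n_dict[word] = count
print(n_dict)  # {'Apple': 120, 'Shell': 75}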
@@ -163,4 +168,5 @@ if __name__ == '__main__':
                      quotechar='\'')
     #print(df)
     texts = df[1] + '. ' + df[2]
-    NER.count_companies(texts)
+    NER.count_companies(texts)
+    # NER.show_most_common_companies()
@@ -7,6 +7,7 @@ Generating a square wordcloud with most common words of input data set.
 from BagOfWords import BagOfWords
 from NER import NER
 
+from collections import OrderedDict
 import csv
 from datetime import datetime
 from os import path
@@ -41,7 +42,7 @@ class VisualizerNews:
                                  quotechar='\'')
 
         corpus = df_dataset[1] + '. ' + df_dataset[2]
-        stemming = False
+        stemming = True
         rel_freq = True
 
         # find most common words in dataset
@@ -52,8 +53,8 @@ class VisualizerNews:
         dict = BagOfWords.make_dict_common_words(matrix, 200,
                                                  rel_freq, stemming)
         # save dict object
-        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
-            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+        with open('obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
+            pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
 
         wordcloud = WordCloud(background_color='white',
                               width=2400,
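The dict objects are persisted with pickle so later plots can skip the expensive BagOfWords pass. A minimal round-trip sketch, assuming the obj/ directory exists:

import pickle

common_words = {'company': 0.12, 'market': 0.09}   # toy stand-in for the real dict
with open('obj/dict_200_most_common_words_stemmed.pkl', 'wb') as f:
    pickle.dump(common_words, f, pickle.HIGHEST_PROTOCOL)
with open('obj/dict_200_most_common_words_stemmed.pkl', 'rb') as f:
    restored = pickle.load(f)
assert restored == common_words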
@@ -80,38 +81,52 @@ class VisualizerNews:
         '''
         print('# preparing histogram of company mentions...')
         print()
-        # read data set
-        file = 'data\\cleaned_data_set_without_header.csv'
-        df = pd.read_csv(file,
-                         delimiter='|',
-                         header=None,
-                         index_col=None,
-                         engine='python',
-                         usecols=[1,2],
-                         #nrows=10,
-                         quoting=csv.QUOTE_NONNUMERIC,
-                         quotechar='\'')
+        # # read data set
+        # file = 'data\\cleaned_data_set_without_header.csv'
+        # df = pd.read_csv(file,
+                         # delimiter='|',
+                         # header=None,
+                         # index_col=None,
+                         # engine='python',
+                         # usecols=[1,2],
+                         # #nrows=10,
+                         # quoting=csv.QUOTE_NONNUMERIC,
+                         # quotechar='\'')
 
-        # # only articles with label==1
-        # df_hits = df[df['Label'] == 1]
-        # texts = df_hits['Title'] + '. ' + df_hits['Text']
-        texts = df[1] + '. ' + df[2]
+        # # # only articles with label==1
+        # # df_hits = df[df['Label'] == 1]
+        # # texts = df_hits['Title'] + '. ' + df_hits['Text']
+        # texts = df[1] + '. ' + df[2]
 
-        # list: count articles with company names
-        count_names = NER.count_companies(texts)
+        # # list: count articles with company names
+        # count_names = NER.count_companies(texts)
+
+        # # sort list in descending order
+        # count_names.sort(reverse=True)
+        # # convert list to array
+        # names = np.asarray(count_names)
+
+        # load pickle object
+        with open('obj/dict_organizations.pkl', 'rb') as input:
+            dict = pickle.load(input)
+        # make list of dict's values
+        count_companies = list(dict.values())
         # sort list in descending order
-        count_names.sort(reverse=True)
+        count_companies.sort(reverse=True)
         # convert list to array
-        names = np.asarray(count_names)
-        #plt.title('Company mentions in News Articles')
+        names = np.asarray(count_companies)
 
         plt.xlabel('Count of articles that mention a company')
+        # Number of companies with this number of mentions
         plt.ylabel('Number of companies with this number of articles')
-        num_bins = 50
+        num_bins = 400
         n, bins, patches = plt.hist(names, num_bins,
                                     facecolor='darkred', alpha=0.5)
-        plt.axis([0, 50, 0, 1000])
+        plt.axis([1, 14, 0, 14000])
 
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
+            .FuncFormatter(lambda x, p: format(int(x), ',')))
 
         # save to file
         plt.savefig('visualization\\NER_{}.eps'
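The FuncFormatter line added above renders y-axis ticks with thousands separators. An isolated sketch of the same formatting on random toy data:

import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np

values = np.random.default_rng(0).integers(1, 14, 1000)   # toy mention counts
plt.hist(values, bins=14, facecolor='darkred', alpha=0.5)
# render e.g. 1000 as '1,000' on the y axis
plt.gca().yaxis.set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.show()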
@@ -163,7 +178,6 @@ class VisualizerNews:
         n, bins, patches = plt.hist(names, num_bins,
                                     facecolor='darkslategrey', alpha=0.5)
         # [xmin, xmax, ymin, ymax] of axis
-        #plt.axis([format(300, ','),format(10000, ','), 0, 500])
         plt.axis([300,10000,0,500])
         # format axis labels for thousands (e.g. '10,000')
         plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
@@ -188,7 +202,7 @@ class VisualizerNews:
                                  #usecols=[3], #column 'Site'
                                  index_col=None,
                                  engine='python',
-                                 nrows=10,
+                                 #nrows=10,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
         # find all different sites, group by 'Site'
@@ -221,44 +235,58 @@ class VisualizerNews:
     def plot_hist_most_common_words(n_commons = 10):
         print('# preparing histogram of most common words...')
         print()
-        # load data set
-        filepath = 'data\\cleaned_data_set_without_header.csv'
-        df_dataset = pd.read_csv(filepath,
-                                 delimiter='|',
-                                 header=None,
-                                 usecols=[1,2],
-                                 index_col=None,
-                                 engine='python',
-                                 #nrows=1000,
-                                 quoting=csv.QUOTE_NONNUMERIC,
-                                 quotechar='\'')
+        # # load data set
+        # filepath = 'data\\cleaned_data_set_without_header.csv'
+        # df_dataset = pd.read_csv(filepath,
+                                 # delimiter='|',
+                                 # header=None,
+                                 # usecols=[1,2],
+                                 # index_col=None,
+                                 # engine='python',
+                                 # #nrows=1000,
+                                 # quoting=csv.QUOTE_NONNUMERIC,
+                                 # quotechar='\'')
 
-        corpus = df_dataset[1] + '. ' + df_dataset[2]
+        # corpus = df_dataset[1] + '. ' + df_dataset[2]
 
-        stemming = False
-        rel_freq = True
+        # stemming = False
+        # rel_freq = True
 
-        # find most common words in dataset
-        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
-        vocab = BagOfWords.make_vocab(extracted_words, stemming)
-        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
-                                        stemming)
-        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
-                                                 stemming)
-        # save dict object
-        with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
-            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+        # # find most common words in dataset
+        # extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        # vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        # matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+                                        # stemming)
+        # dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
+                                                 # stemming)
+        # # save dict object
+        # with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
+            # pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
 
-        plt.xlabel('Most common words in textual corpus')
+        # load pickle object
+        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
+            dict = pickle.load(i)
+        # sort dict by value
+        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
+                             reverse=True))
+        # return n highest values as dict (word => count)
+        n_dict = {}
+
+        for i in range(n_commons):
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]
+
+        #plt.xlabel('Most common words in textual corpus')
         plt.ylabel('Relative frequency')
 
-        labels = list(dict.keys())
-        numbers = list(dict.values())
+        labels = list(n_dict.keys())
+        numbers = list(n_dict.values())
         nbars = n_commons
         plt.bar(np.arange(nbars),
                 height=numbers,
                 tick_label=labels,
-                facecolor='darkorange')
+                facecolor='royalblue')
         plt.savefig('visualization\\10_most_common_words_{}.eps'
                     .format(VisualizerNews.datestring))
         plt.savefig('visualization\\10_most_common_words_{}.png'
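For reference, a minimal standalone version of the labeled bar chart drawn above (toy data; the color follows the new royalblue choice):

import numpy as np
import matplotlib.pyplot as plt

n_dict = {'company': 0.12, 'market': 0.09, 'shares': 0.07}   # toy top-n dict
plt.bar(np.arange(len(n_dict)),
        height=list(n_dict.values()),
        tick_label=list(n_dict.keys()),
        facecolor='royalblue')
plt.ylabel('Relative frequency')
plt.show()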
@@ -269,10 +297,39 @@ class VisualizerNews:
         ''' open pkl file of dict, plot histogram of number of different
         company names per article.
         '''
+
+        # list of number of different companies per article (int)
+        list = []
+        with open('obj/num_mentions_companies.pkl', 'rb') as input:
+            list = pickle.load(input)
+
+        # sort list in descending order
+        list.sort(reverse=True)
+
+        # convert list to array
+        names = np.asarray(list)
+
+        plt.xlabel('Number of different company names in news article')
+        plt.ylabel('Number of articles with this number of company names')
+        num_bins = 100
+        n, bins, patches = plt.hist(names, num_bins,
+                                    facecolor='darkgreen', alpha=0.5)
+        plt.axis([0, 30, 0, 1500])
+
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
+            .FuncFormatter(lambda x, p: format(int(x), ',')))
+
+        # save to file
+        plt.savefig('visualization\\NER_2_{}.eps'
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\NER_2_{}.png'
+                    .format(VisualizerNews.datestring))
+        plt.show()
 
 if __name__ == '__main__':
     VisualizerNews.plot_wordcloud_dataset()
     # VisualizerNews.plot_histogram_companies()
     # VisualizerNews.plot_hist_num_comp_per_art()
     # VisualizerNews.plot_histogram_text_lengths()
     # VisualizerNews.plot_pie_chart_of_sites()
     VisualizerNews.plot_hist_most_common_words()
+    # VisualizerNews.plot_hist_most_common_words(10)
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large.