'''
Named Entity Recognition (NER)
==============================

Stanford NER takes a text as input and returns a list of entities
like persons, organizations and countries.

This module uses it to extract organization names from news articles
and to plot how often each organization is mentioned.
'''
import os
from collections import Counter

import matplotlib.pyplot as plt
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

from CsvHandler import CsvHandler


class NER:
    '''Namespace of helpers that find and count company names in texts.

    All methods are static and are called on the class itself,
    e.g. NER.find_companies(text).
    '''

    # toDo: complete lists:
    # some legal entity types
    company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', 'Plc', 'LLC',
                       'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB']

    # some entities that are not companies
    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
            'European Commission', 'EU', 'Staff', 'Min', 'Read',
            'Thomson Reuters Trust Principles', 'New York Stock Exchange',
            'NYSE']

    # machine-specific locations of the Stanford NER model, the Stanford NER
    # jar and the Java runtime; adjust these when running elsewhere
    STANFORD_CLASSIFIER = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
    STANFORD_NER_JAR = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
    JAVA_PATH = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"

    @staticmethod
    def tag_words(text):
        '''param: text to be tagged
        returns: list of (word, tag) tuples as produced by the Stanford
        NER tagger for the tokenized text
        '''
        # create tagger object
        st = StanfordNERTagger(NER.STANFORD_CLASSIFIER,
                               NER.STANFORD_NER_JAR,
                               encoding='utf-8')
        tokenized_text = word_tokenize(text)
        # list of tuples (word, tag)
        return st.tag(tokenized_text)

    @staticmethod
    def get_coherent_tags(tagged_words):
        '''param: iterable of (token, tag) tuples
        returns: list of chunks, each chunk being a list of consecutive
        (token, tag) tuples tagged "ORGANIZATION"

        Tokens listed in company_abbrevs and tokens with any other tag
        are left out and end the chunk that is currently being built.
        '''
        continuous_chunks = []
        current_chunk = []
        for token, tag in tagged_words:
            if tag == "ORGANIZATION" and token not in NER.company_abbrevs:
                current_chunk.append((token, tag))
            elif current_chunk:
                # a non-matching token ends the chunk collected so far
                continuous_chunks.append(current_chunk)
                current_chunk = []
        # keep an organization name that ends exactly at the end of the
        # input; without this final flush it would be lost
        if current_chunk:
            continuous_chunks.append(current_chunk)
        return continuous_chunks

    @staticmethod
    def plot_histogram(count_names):
        '''pyplot histogram of how often companies are mentioned in the
        input news articles
        param: iterable of mention counts, one int per company
        x-axis: number of mentions of a company
        y-axis: number of companies with this number of mentions
        '''
        # sort in descending order; sorted() accepts any iterable (also
        # a dict view) and leaves the caller's data untouched, whereas
        # list.sort() works in place and returns None
        sorted_counts = sorted(count_names, reverse=True)
        # plt.plot(range(len(count_names)), count_names, 'ro', ms = 5)
        plt.title('Company mentions in News Articles')
        plt.xlabel('Number of mentions of the company')
        # Number of companies with this number of mentions
        plt.ylabel('Frequency')
        num_bins = 50
        plt.hist(sorted_counts, num_bins, facecolor='blue', alpha=0.5)
        # plt.grid(True)
        plt.show()

    @staticmethod
    def find_companies(text):
        '''param: article text where organizations must be identified
        returns: list of identified organizations as strings, without
        duplicates and in order of first appearance
        '''
        # the Stanford tagger is started via Java, so tell it where to look
        os.environ['JAVAHOME'] = NER.JAVA_PATH

        # create list of (word, tag) tuples
        tagged_words = NER.tag_words(text)
        # put coherent names together
        chunks = NER.get_coherent_tags(tagged_words)
        names = [" ".join(token for token, _tag in chunk) for chunk in chunks]

        seen = set()
        organizations = []
        for name in names:
            # check if company and not already in list
            if name not in NER.misc and name not in seen:
                organizations.append(name)
                seen.add(name)
        return organizations

    @staticmethod
    def count_companies(texts):
        '''param: list of all article texts
        returns: list of company counts as ints, i.e. for every company
        the number of articles in which it was found
        '''
        # companies mapped to the number of articles mentioning them;
        # find_companies reports each company at most once per article
        company_counts = Counter()
        for text in texts:
            company_counts.update(NER.find_companies(text))
        # # print outlier
        # print(max(company_counts, key=company_counts.get))
        return list(company_counts.values())


if __name__ == '__main__':

    filepath = 'classification_labelled_corrected.csv'
    # NOTE(review): the indexing below assumes read_csv returns a pandas
    # DataFrame with 'Label', 'Title' and 'Text' columns - confirm
    df = CsvHandler.read_csv(filepath)

    # articles with label==1
    df_hits = df[df['Label'] == 1]

    texts = df_hits['Title'] + ' ' + df_hits['Text']

    # # read a few articles to check the results
    # for text in texts[10:20]:
    #     print(text)
    #     print()
    #     print(NER.find_companies(text))
    #     print()

    # count names in hit articles
    count_names = NER.count_companies(texts)

    # plot diagram
    NER.plot_histogram(count_names)