"""
Named Entity Recognition (NER)
==============================

Stanford NER takes a text as input and returns a list of entities
like persons, organizations and countries. This module wraps the NLTK
interface to the Stanford tagger and extracts the organization names
that are mentioned in a text.
"""
import os

# The third-party packages are only needed by the functions that actually
# tag or plot; guarding the imports keeps the pure helpers (e.g.
# NER.get_coherent_names) usable when they are not installed.
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None

try:
    from nltk.tag import StanfordNERTagger
    from nltk.tokenize import word_tokenize
except ImportError:
    StanfordNERTagger = None
    word_tokenize = None

# Default, machine-specific install locations of the Stanford NER
# distribution and of the Java runtime used to run it.
STANFORD_CLASSIFIER_PATH = (
    'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER'
    '\\stanford-ner-2018-02-27\\classifiers'
    '\\english.all.3class.distsim.crf.ser.gz'
)
STANFORD_NER_JAR_PATH = (
    'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER'
    '\\stanford-ner-2018-02-27\\stanford-ner.jar'
)
DEFAULT_JAVA_HOME = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"


class NER:
    """Namespace for the Stanford NER helper functions.

    All methods are static, so they can be called either on the class
    (``NER.find_companies(text)``) or on an instance.
    """

    @staticmethod
    def tag_words(text, classifier_path=None, jar_path=None):
        """Tokenize *text* and tag every token with the Stanford NER tagger.

        Args:
            text: The raw text to analyse.
            classifier_path: Path to the serialized classifier; defaults
                to ``STANFORD_CLASSIFIER_PATH``.
            jar_path: Path to ``stanford-ner.jar``; defaults to
                ``STANFORD_NER_JAR_PATH``.

        Returns:
            The list of ``(word, tag)`` tuples produced by the tagger.

        Raises:
            ImportError: If nltk is not installed.
        """
        if StanfordNERTagger is None or word_tokenize is None:
            raise ImportError('nltk is required to tag words')
        # create tagger object
        tagger = StanfordNERTagger(
            classifier_path or STANFORD_CLASSIFIER_PATH,
            jar_path or STANFORD_NER_JAR_PATH,
            encoding='utf-8',
        )
        return tagger.tag(word_tokenize(text))

    @staticmethod
    def get_coherent_names(tagged_words):
        """Group consecutive tokens that belong to the same entity.

        A chunk is a maximal run of neighbouring tokens that carry the
        same tag other than ``"O"``.

        Args:
            tagged_words: Iterable of ``(token, tag)`` pairs.

        Returns:
            A list of chunks, each chunk being a list of ``(token, tag)``
            tuples in their original order.
        """
        chunks = []
        current_chunk = []
        for token, tag in tagged_words:
            # Close the running chunk at an untagged token, and also when
            # the entity type changes: otherwise two directly adjacent
            # entities of different types would be merged and labelled
            # with the tag of the first one.
            if current_chunk and (tag == "O" or tag != current_chunk[-1][1]):
                chunks.append(current_chunk)
                current_chunk = []
            if tag != "O":
                current_chunk.append((token, tag))
        # put the final chunk into the result (if any)
        if current_chunk:
            chunks.append(current_chunk)
        return chunks

    @staticmethod
    def plot_barchart():
        """Show a demo plot of company mention counts (hard-coded data).

        Raises:
            ImportError: If matplotlib is not installed.
        """
        if plt is None:
            raise ImportError('matplotlib is required to plot')
        organizations = ['org1', 'org2', 'org3', 'org4', 'org5', 'org6']
        num_mentions = [5, 2, 33, 12, 6, 10]
        plt.plot(organizations, num_mentions, 'ro', ms=10)
        plt.xlabel('companies')
        plt.ylabel('count')
        plt.title('Company mentions in articles')
        plt.grid(True)
        plt.show()

    @staticmethod
    def find_companies(text):
        """Return the organization names mentioned in *text*.

        Multi-word names are joined with single spaces. Names appear in
        the order in which they occur and are not de-duplicated.

        Args:
            text: The raw text to analyse.

        Returns:
            A list of organization name strings.
        """
        # Only fall back to the default Java location when the caller has
        # not configured one already.
        os.environ.setdefault('JAVAHOME', DEFAULT_JAVA_HOME)
        # create list of (word, tag) tuples
        tagged_words = NER.tag_words(text)
        # put coherent names together and keep the organizations
        return [
            " ".join(token for token, _ in entity)
            for entity in NER.get_coherent_names(tagged_words)
            if entity[0][1] == 'ORGANIZATION'
        ]


if __name__ == '__main__':
    text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets \nmostly fell in light volumes on Tuesday as energy shares tracked \nfalls in global oil prices, while weaknesses in banking shares \namid concerns about loans to an ailing steel firm sent the Thai \nindex to a one-week closing low. \nBangkok's SET index shed nearly 1 percent after four \nsessions of gains. The index closed at 1,379.32, its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl, the most actively \ntraded by turnover, dropped 2.8 percent to a near one-month low, \nreflecting potential impact of loans to Sahaviriya Steel \nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be lower than 130 percent, the \ndesired level we think and hence the need for more provisioning \nin the following quarters,\" the broker said in a report. \nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its creditors, dropped 1 percent. The steel firm \nand its three creditors agreed on Monday to consider options to \nrestructure debt worth over 50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their slides for a third \nsession, Singapore gave up early gains and Indonesia \nhit a near one-week low, all with trading volumes below \nthe 30-day average ahead of a public holiday on Thursday. \nAmong top losers in the region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell \namid uncertainty over global demand. \nFor Asian Companies click.'''
    print(NER.find_companies(text))