now works with whole dataset
This commit is contained in:
		
							parent
							
								
									6bbd125c05
								
							
						
					
					
						commit
						759db3c0cf
					
				
							
								
								
									
										133
									
								
								NER.py
									
									
									
									
									
								
							
							
						
						
									
										133
									
								
								NER.py
									
									
									
									
									
								
							@ -5,15 +5,26 @@ Named Entity Recognition (NER)
 | 
				
			|||||||
Stanford NER takes a text as input and returns a list of entities
 | 
					Stanford NER takes a text as input and returns a list of entities
 | 
				
			||||||
like persons, organizations and countries, e.g.
 | 
					like persons, organizations and countries, e.g.
 | 
				
			||||||
'''
 | 
					'''
 | 
				
			||||||
 | 
					 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import matplotlib.pyplot as plt
 | 
					import matplotlib.pyplot as plt
 | 
				
			||||||
from nltk.tag import StanfordNERTagger
 | 
					from nltk.tag import StanfordNERTagger
 | 
				
			||||||
from nltk.tokenize import word_tokenize
 | 
					from nltk.tokenize import word_tokenize
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from CsvHandler import CsvHandler
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class NER:
 | 
					class NER:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # legal entity types
 | 
				
			||||||
 | 
					    company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP',
 | 
				
			||||||
 | 
					                      'Plc', 'LLC', 'LBO', 'IPO', 'HQ',
 | 
				
			||||||
 | 
					                      'CIO', 'NGO', 'AB']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # entities that are not companies
 | 
				
			||||||
 | 
					    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 
 | 
				
			||||||
 | 
					            'Cnn', 'European Commission', 'EU', 'Staff', 'Min', 'Read',
 | 
				
			||||||
 | 
					            'Thomson Reuters Trust Principles']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def tag_words(text):
 | 
					    def tag_words(text):
 | 
				
			||||||
        stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
 | 
					        stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
 | 
				
			||||||
        stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
 | 
					        stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
 | 
				
			||||||
@ -21,80 +32,96 @@ class NER:
 | 
				
			|||||||
        st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
 | 
					        st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        tokenized_text = word_tokenize(text)
 | 
					        tokenized_text = word_tokenize(text)
 | 
				
			||||||
 | 
					        # list of tuples (word, tag)
 | 
				
			||||||
        tagged_words = st.tag(tokenized_text)
 | 
					        tagged_words = st.tag(tokenized_text)
 | 
				
			||||||
        # returns list of tuples (word, tag)
 | 
					 | 
				
			||||||
        return tagged_words
 | 
					        return tagged_words
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_coherent_names(tagged_words):
 | 
					    def get_coherent_tags(tagged_words):
 | 
				
			||||||
        continuous_chunk = []
 | 
					        continuous_chunks = []
 | 
				
			||||||
        current_chunk = []
 | 
					        current_chunks = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for token, tag in tagged_words:
 | 
					        for token, tag in tagged_words:
 | 
				
			||||||
            if tag != "O":
 | 
					            if tag == "ORGANIZATION" and token not in NER.company_abbrevs:
 | 
				
			||||||
                current_chunk.append((token, tag))
 | 
					                current_chunks.append((token, tag))
 | 
				
			||||||
            else:
 | 
					            elif current_chunks:
 | 
				
			||||||
                # if current chunk is not empty
 | 
					                # put the final current_chunk into the continuous_chunk
 | 
				
			||||||
                if current_chunk: 
 | 
					                continuous_chunks.append(current_chunks)
 | 
				
			||||||
                    continuous_chunk.append(current_chunk)
 | 
					                current_chunks = []
 | 
				
			||||||
                    current_chunk = []
 | 
					        return continuous_chunks
 | 
				
			||||||
        # put the final current_chunk into the continuous_chunk (if any)
 | 
					 | 
				
			||||||
        if current_chunk:
 | 
					 | 
				
			||||||
            continuous_chunk.append(current_chunk)
 | 
					 | 
				
			||||||
        return continuous_chunk
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def plot_barchart():
 | 
					    def plot_barchart(count_names):
 | 
				
			||||||
        organizations = ['org1', 'org2', 'org3', 'org4', 'org5', 'org6']
 | 
					        '''pyplot diagram of company names distribution
 | 
				
			||||||
        num_mentions = [5, 2, 33, 12, 6, 10]
 | 
					        in input news articles
 | 
				
			||||||
        #n, bins, patches = plt.hist(num_mentions, 6, normed=1, facecolor='green')
 | 
					        x-axis:different company names (numbered consecutively)
 | 
				
			||||||
        plt.plot(organizations, num_mentions, 'ro', ms = 10)
 | 
					        y-axis:counts of company name
 | 
				
			||||||
        plt.xlabel('companies')
 | 
					        '''
 | 
				
			||||||
        plt.ylabel('count')
 | 
					        plt.plot(range(len(count_names)), count_names, 'ro', ms = 5)
 | 
				
			||||||
        plt.title('Company mentions in articles')
 | 
					        plt.xlabel('Company Names')
 | 
				
			||||||
 | 
					        plt.ylabel('Article Count')
 | 
				
			||||||
 | 
					        plt.title('Counts of News Articles with Company Name')
 | 
				
			||||||
        plt.grid(True)
 | 
					        plt.grid(True)
 | 
				
			||||||
        plt.show()
 | 
					        plt.show()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def find_companies(text):
 | 
					    def find_companies(text):
 | 
				
			||||||
        #set paths
 | 
					        '''param: article text where organizations must be identified
 | 
				
			||||||
 | 
					        returns: list of identified organisations as strings
 | 
				
			||||||
 | 
					        '''
 | 
				
			||||||
 | 
					        # set paths
 | 
				
			||||||
        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
 | 
					        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
 | 
				
			||||||
        os.environ['JAVAHOME'] = java_path
 | 
					        os.environ['JAVAHOME'] = java_path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        seen = set()
 | 
				
			||||||
        organizations = []
 | 
					        organizations = []
 | 
				
			||||||
        # create list of (word, tag) tuples
 | 
					        # create list of (word, tag) tuples
 | 
				
			||||||
        tagged_words = NER.tag_words(text)
 | 
					        tagged_words = NER.tag_words(text)
 | 
				
			||||||
        # put coherent names together
 | 
					        # put coherent names together
 | 
				
			||||||
        nes = NER.get_coherent_names(tagged_words)
 | 
					        nes = NER.get_coherent_tags(tagged_words)
 | 
				
			||||||
        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
 | 
					        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
 | 
				
			||||||
        #print(nes_coherent)
 | 
					        #print(nes_coherent)
 | 
				
			||||||
        for tuple in nes_coherent:
 | 
					        for tuple in nes_coherent:
 | 
				
			||||||
            if tuple[1] == 'ORGANIZATION':
 | 
					            # check if company and not already in list
 | 
				
			||||||
 | 
					            if (tuple[0] not in NER.misc) and (tuple[0] not in seen):
 | 
				
			||||||
                organizations.append(tuple[0])
 | 
					                organizations.append(tuple[0])
 | 
				
			||||||
 | 
					                seen.add(tuple[0])
 | 
				
			||||||
        return organizations
 | 
					        return organizations
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def count_companies(texts):
 | 
				
			||||||
 | 
					        '''param: list of all article texts
 | 
				
			||||||
 | 
					        returns: list of company counts as ints
 | 
				
			||||||
 | 
					        '''
 | 
				
			||||||
 | 
					        # dictionary of companies with their count
 | 
				
			||||||
 | 
					        dict_com = {}
 | 
				
			||||||
 | 
					        for text in texts:
 | 
				
			||||||
 | 
					            # list of found companies in article
 | 
				
			||||||
 | 
					            coms = NER.find_companies(text)
 | 
				
			||||||
 | 
					            for com in coms:
 | 
				
			||||||
 | 
					                if com in dict_com.keys():
 | 
				
			||||||
 | 
					                    dict_com[com] += 1
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    dict_com[com] = 1
 | 
				
			||||||
 | 
					        # print outlier (value 38)
 | 
				
			||||||
 | 
					        print(max(dict_com, key=dict_com.get))
 | 
				
			||||||
 | 
					        return dict_com.values()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == '__main__':
 | 
					if __name__ == '__main__':
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    #plot_barchart()
 | 
					    filepath = 'classification_labelled_corrected.csv'
 | 
				
			||||||
    text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
 | 
					    df = CsvHandler.read_csv(filepath)
 | 
				
			||||||
                    \nmostly fell in light volumes on Tuesday as energy shares
 | 
					
 | 
				
			||||||
                    tracked \nfalls in global oil prices, while weaknesses in banking shares
 | 
					    # articles with label==1
 | 
				
			||||||
                    \namid concerns about loans to an ailing steel firm sent the Thai
 | 
					    df_hits = df[df['Label'] == 1]
 | 
				
			||||||
                    \nindex to a one-week closing low. \nBangkok's SET index shed nearly
 | 
					
 | 
				
			||||||
                    1 percent after four \nsessions of gains. The index closed at 1,379.32,
 | 
					    texts = df_hits['Title'] + ' ' + df_hits['Text']
 | 
				
			||||||
                    its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
 | 
					
 | 
				
			||||||
                    the most actively \ntraded by turnover, dropped 2.8 percent to a near
 | 
					    # # read through to check
 | 
				
			||||||
                    one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
 | 
					    # for text in texts[5:10]:
 | 
				
			||||||
                    \nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
 | 
					        # print(text)
 | 
				
			||||||
                    downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
 | 
					        # print()
 | 
				
			||||||
                    to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
 | 
					        # print(NER.find_companies(text))
 | 
				
			||||||
                    lower than 130 percent, the \ndesired level we think and hence the need for
 | 
					        # print()
 | 
				
			||||||
                    more provisioning \nin the following quarters,\" the broker said in a report.
 | 
					
 | 
				
			||||||
                    \nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
 | 
					    # count names in hit articles
 | 
				
			||||||
                    creditors, dropped 1 percent. The steel firm \nand its three creditors
 | 
					    count_names = NER.count_companies(texts)
 | 
				
			||||||
                    agreed on Monday to consider options to \nrestructure debt worth over
 | 
					
 | 
				
			||||||
                    50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
 | 
					    # plot diagram
 | 
				
			||||||
                    slides for a third \nsession, Singapore gave up early gains and Indonesia
 | 
					    NER.plot_barchart(count_names)
 | 
				
			||||||
                    \nhit a near one-week low, all with trading volumes below \nthe 30-day
 | 
					 | 
				
			||||||
                    average ahead of a public holiday on Thursday. \nAmong top losers in the
 | 
					 | 
				
			||||||
                    region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
 | 
					 | 
				
			||||||
                    Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
 | 
					 | 
				
			||||||
                    \namid uncertainty over global demand. \nFor Asian Companies click.'''
 | 
					 | 
				
			||||||
    print(NER.find_companies(text))
 | 
					 | 
				
			||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user