thesis-anne/NER.py

'''
Named Entity Recognition (NER)
==============================
Stanford NER takes a text as input and returns a list of recognized
entities such as persons, organizations and locations (e.g. countries),
one (word, tag) tuple per token, e.g. ('Reuters', 'ORGANIZATION').
'''
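
# Note: the paths below assume that the Stanford NER distribution
# (stanford-ner-2018-02-27) lies next to this script and that a local Java
# runtime is installed; adjust stanford_classifier, stanford_ner_path and
# JAVAHOME in this file if your setup differs.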
import os

import matplotlib.pyplot as plt
import numpy as np
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

from CsvHandler import CsvHandler


class NER:

    # TODO: complete lists:
    # some legal entity types
    company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', 'Plc', 'LLC',
                       'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB']

    # some entities that are not companies
    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
            'European Commission', 'EU', 'Staff', 'Min', 'Read',
            'Thomson Reuters Trust Principles', 'New York Stock Exchange',
            'NYSE']

    @staticmethod
    def tag_words(text):
        # path to Stanford NER
        stanford_classifier = 'stanford-ner-2018-02-27' \
                              '\\classifiers' \
                              '\\english.all.3class.distsim.crf.ser.gz'
        stanford_ner_path = 'stanford-ner-2018-02-27' \
                            '\\stanford-ner.jar'
        # create tagger object
        st = StanfordNERTagger(stanford_classifier, stanford_ner_path,
                               encoding='utf-8')
        tokenized_text = word_tokenize(text)
        # list of tuples (word, tag)
        tagged_words = st.tag(tokenized_text)
        return tagged_words
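
    # Illustrative sketch of the tagger output (not taken from a real run):
    # for the tokens of "Reuters reported from Berlin", st.tag() yields pairs
    # such as [('Reuters', 'ORGANIZATION'), ('reported', 'O'), ('from', 'O'),
    # ('Berlin', 'LOCATION')]; tokens outside any entity are tagged 'O'.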

    @staticmethod
    def get_coherent_tags(tagged_words):
        continuous_chunks = []
        current_chunks = []
        for token, tag in tagged_words:
            if tag == "ORGANIZATION" and token not in NER.company_abbrevs:
                current_chunks.append((token, tag))
            elif current_chunks:
                # put the finished chunk into continuous_chunks
                continuous_chunks.append(current_chunks)
                current_chunks = []
        # keep a chunk that ends with the last token of the text
        if current_chunks:
            continuous_chunks.append(current_chunks)
        return continuous_chunks
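
    # Illustrative sketch: [('Deutsche', 'ORGANIZATION'),
    # ('Bank', 'ORGANIZATION'), ('said', 'O')] is grouped into one chunk
    # [('Deutsche', 'ORGANIZATION'), ('Bank', 'ORGANIZATION')], so consecutive
    # ORGANIZATION tokens are later joined into the single name 'Deutsche Bank'.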

    @staticmethod
    def plot_histogram(count_names):
        '''pyplot histogram of the company-name mention distribution
        (absolute counts, not a normalized density)
        x-axis: number of mentions of a company
        y-axis: number of companies with that many mentions
        '''
        # sort list in descending order
        count_names.sort(reverse=True)
        # convert list to array
        names = np.asarray(count_names)
        plt.title('Company mentions in News Articles')
        plt.xlabel('Count of articles that mention a company')
        # number of companies with this number of mentions
        plt.ylabel('Number of companies with this number of articles')
        num_bins = 50
        n, bins, patches = plt.hist(names, num_bins,
                                    facecolor='blue', alpha=0.5)
        # plt.grid(True)
        plt.show()
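
    # Usage sketch with made-up counts: NER.plot_histogram([12, 5, 3, 1, 1, 1])
    # opens a matplotlib window with a 50-bin histogram of these values.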

    @staticmethod
    def find_companies(text):
        '''param: article text in which organizations are to be identified
        returns: list of identified organizations as strings
        '''
        # set paths
        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
        os.environ['JAVAHOME'] = java_path
        seen = set()
        organizations = []
        # create list of (word, tag) tuples
        tagged_words = NER.tag_words(text)
        # put coherent names together
        nes = NER.get_coherent_tags(tagged_words)
        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1])
                        for ne in nes]
        # print(nes_coherent)
        for name, tag in nes_coherent:
            # keep the name if it is a company and not already in the list
            if (name not in NER.misc) and (name not in seen):
                organizations.append(name)
                seen.add(name)
        return organizations
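
    # Illustrative sketch: for a text mentioning "Deutsche Bank AG" and
    # "Reuters", find_companies returns ['Deutsche Bank']; 'AG' is dropped via
    # company_abbrevs, 'Reuters' is skipped via misc, and repeated names are
    # filtered through the seen set.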

    @staticmethod
    def count_companies(texts):
        '''param: list of all article texts
        returns: list of company counts as ints
        '''
        # dictionary of companies with their count
        dict_com = {}
        for text in texts:
            # list of companies found in the article
            coms = NER.find_companies(text)
            for com in coms:
                if com in dict_com:
                    dict_com[com] += 1
                else:
                    dict_com[com] = 1
        # # print outlier
        # print(max(dict_com, key=dict_com.get))
        return list(dict_com.values())
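
    # Illustrative sketch: if two articles mention 'Siemens' and one of them
    # also mentions 'BASF', count_companies returns [2, 1] (counts only; the
    # company names themselves are discarded).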


if __name__ == '__main__':
    filepath = 'classification_labelled_corrected.csv'
    df = CsvHandler.read_csv(filepath)
    # only articles with label == 1
    df_hits = df[df['Label'] == 1]
    texts = df_hits['Title'] + ' ' + df_hits['Text']
    # # read a few articles for checking
    # for text in texts[10:20]:
    #     print(text)
    #     print()
    #     print(NER.find_companies(text))
    #     print()
    # count names in hit articles
    count_names = NER.count_companies(texts)
    # plot diagram
    NER.plot_histogram(count_names)
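
# Assumptions about the input (not verified here): CsvHandler.read_csv is
# expected to return a pandas DataFrame with at least the columns 'Label',
# 'Title' and 'Text', where Label == 1 marks the relevant ("hit") articles.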