'''
Named Entity Recognition (NER)
==============================

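Stanford NER takes a text as input and returns a list of entities
such as persons, organizations and countries. This module uses it to
extract organization names from news articles and to plot in how
many articles each name occurs.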
'''
import os

import matplotlib.pyplot as plt
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

from CsvHandler import CsvHandler

class NER:
    '''Extracts organization names from article texts with Stanford NER.

    All methods are static and are meant to be called on the class,
    e.g. NER.find_companies(text).
    '''

    # machine-specific locations of the Stanford NER model and jar;
    # override these attributes (or pass paths to tag_words) when the
    # files live somewhere else
    stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
    stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'

    # Java runtime directory exported as JAVAHOME before tagging
    java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"

    # legal-form suffixes and other abbreviations that are tagged as
    # ORGANIZATION but must not become part of a company name
    company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP',
                      'Plc', 'LLC', 'LBO', 'IPO', 'HQ',
                      'CIO', 'NGO', 'AB']

    # entities that are not companies
    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 
            'Cnn', 'European Commission', 'EU', 'Staff', 'Min', 'Read',
            'Thomson Reuters Trust Principles']

    @staticmethod
    def tag_words(text, stanford_classifier=None, stanford_ner_path=None):
        '''param text: text to be tokenized and tagged
        param stanford_classifier: path of the classifier model,
            defaults to NER.stanford_classifier
        param stanford_ner_path: path of stanford-ner.jar,
            defaults to NER.stanford_ner_path
        returns: list of tuples (word, tag)
        '''
        if stanford_classifier is None:
            stanford_classifier = NER.stanford_classifier
        if stanford_ner_path is None:
            stanford_ner_path = NER.stanford_ner_path
        # create tagger object
        st = StanfordNERTagger(stanford_classifier, stanford_ner_path,
                               encoding='utf-8')

        tokenized_text = word_tokenize(text)
        # list of tuples (word, tag)
        return st.tag(tokenized_text)

    @staticmethod
    def get_coherent_tags(tagged_words):
        '''param: list of tuples (word, tag)
        returns: list of chunks, each chunk being the list of
        consecutive (word, tag) tuples tagged ORGANIZATION

        A token that is not tagged ORGANIZATION, or that is listed in
        company_abbrevs, ends the current chunk and is itself dropped.
        '''
        continuous_chunks = []
        current_chunk = []

        for token, tag in tagged_words:
            if tag == "ORGANIZATION" and token not in NER.company_abbrevs:
                current_chunk.append((token, tag))
            elif current_chunk:
                # chunk is complete, start collecting the next one
                continuous_chunks.append(current_chunk)
                current_chunk = []
        # a name at the very end of the text has no following token
        # that would close it, so flush it explicitly
        if current_chunk:
            continuous_chunks.append(current_chunk)
        return continuous_chunks

    @staticmethod
    def plot_barchart(count_names):
        '''pyplot diagram of company names distribution
        in input news articles
        x-axis:different company names (numbered consecutively)
        y-axis:counts of company name

        Despite the method name the counts are drawn as red dot
        markers ('ro'), not as bars.
        '''
        # materialize once so that dict views and other iterables are
        # handed to pyplot as a plain sequence
        counts = list(count_names)
        plt.plot(range(len(counts)), counts, 'ro', ms = 5)
        plt.xlabel('Company Names')
        plt.ylabel('Article Count')
        plt.title('Counts of News Articles with Company Name')
        plt.grid(True)
        plt.show()

    @staticmethod
    def find_companies(text):
        '''param: article text where organizations must be identified
        returns: list of identified organisations as strings, without
        duplicates, in order of first appearance
        '''
        # the Stanford tagger is started through Java
        os.environ['JAVAHOME'] = NER.java_path

        seen = set()
        organizations = []
        # create list of (word, tag) tuples
        tagged_words = NER.tag_words(text)
        # put coherent names together
        for chunk in NER.get_coherent_tags(tagged_words):
            name = " ".join(token for token, _ in chunk)
            # skip non-companies and names already in the list
            if name in NER.misc or name in seen:
                continue
            seen.add(name)
            organizations.append(name)
        return organizations

    @staticmethod
    def count_companies(texts):
        '''param: list of all article texts
        returns: list of company counts as ints; each count is the
        number of articles in which the company name was found
        '''
        # dictionary of companies with their count
        dict_com = {}
        for text in texts:
            # every company is listed at most once per article
            for com in NER.find_companies(text):
                dict_com[com] = dict_com.get(com, 0) + 1
        # print the most frequent company name (the outlier);
        # max() would raise on an empty dictionary
        if dict_com:
            print(max(dict_com, key=dict_com.get))
        return list(dict_com.values())

if __name__ == '__main__':

    # load the labelled articles
    labelled = CsvHandler.read_csv('classification_labelled_corrected.csv')

    # keep only the rows whose 'Label' equals 1
    hits = labelled[labelled['Label'] == 1]

    # title and body of every hit, joined by a single space
    texts = hits['Title'] + ' ' + hits['Text']

    # for manual checking: print a few texts and the companies found
    # for text in texts[5:10]:
        # print(text)
        # print()
        # print(NER.find_companies(text))
        # print()

    # count the company names in the hit articles and plot the counts
    NER.plot_barchart(NER.count_companies(texts))