# -*- coding: utf-8 -*-
'''
JsonHandler
===========

JsonHandler reads articles from JSON files, extracts the relevant
information and writes it to csv files.
'''
import csv
import glob
import json

import numpy as np
import pandas as pd


class JsonHandler:

    # one string for every month of the year
    months = ['01', '02', '03', '04', '05', '06',
              '07', '08', '09', '10', '11', '12']

    @staticmethod
    def select_randoms(df, n):
        '''Select n random samples from the dataset.

        params: df  DataFrame to select items from,
                n   number of items to select randomly.
        return: new DataFrame with only the selected items.
        '''
        # initialize random seed => reproducible sequence
        np.random.seed(5)
        # add new column 'Random' holding one random number per row
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by the random numbers
        df = df.sort_values('Random')
        # return the first n elements of the randomly sorted dataset
        return df.iloc[0:n]

    @staticmethod
    def create_labeling_dataset():
        '''Select random articles from the monthly csv files for labeling.'''
        # number of articles to select from each month:
        # 10,000 / 12 = 833.33
        # => pick one more from every third month
        every_third_month = ['03', '06', '09', '12']

        for m in JsonHandler.months:
            df = pd.read_csv('all_{}.csv'.format(m),
                             delimiter='|',
                             header=0,
                             index_col=None,
                             engine='python',
                             quotechar='\'',
                             quoting=csv.QUOTE_MINIMAL,
                             encoding='utf-8')

            n_select = 834 if m in every_third_month else 833

            # append the selection; write the header only for the first month
            JsonHandler.select_randoms(df, n_select)\
                       .to_csv('labeling_dataset.csv',
                               header=(m == '01'),
                               mode='a',
                               encoding='utf-8',
                               quoting=csv.QUOTE_MINIMAL,
                               quotechar='\'')

    @staticmethod
    def write_articles_to_csv_files():
        '''Read the JSON files, select articles and write them to csv.'''
        # reliable sources (site_sections)
        site_sections = []
        # read list of reliable sources from 'sections.txt'
        with open('sections.txt', 'r') as s_list:
            site_sections = s_list.read().split('\n')

        # article counter
        a = 0
        for m in JsonHandler.months:
            # one output file per month
            output_file = 'all_{}.csv'.format(m)
            # path of the input JSON files of that month
            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
                   '\\news_[0-9]*.json'.format(m)
            files = glob.glob(path)

            # file counter
            n = 0
            # write a separate csv file for every month
            with open(output_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter='|',
                                    quotechar='\'',
                                    quoting=csv.QUOTE_NONNUMERIC)
                # write header / column names
                writer.writerow(['Uuid',         #0
                                 'Title',        #1
                                 'Text',         #2
                                 'Site',         #3
                                 'SiteSection',  #4
                                 'Url',          #5
                                 'Timestamp'])   #6

                # write articles
                for file in files:
                    n += 1
                    # read every JSON file
                    with open(file, encoding='utf-8') as f:
                        # the JSON content is converted to a dict
                        dict = json.load(f)

                        # check if comment or post
                        if ((dict['ord_in_thread'] != 0) or
                                # check if not english
                                (dict['language'] != 'english') or
                                # check if spam
                                (dict['thread']['spam_score'] > 0.3) or
                                # check if reliable source
                                (dict['thread']['site_section'] not in site_sections) or
                                # check if text parsed correctly
                                ('Further company coverage:' in dict['text']) or
                                ('subscription' in dict['text']) or
                                ('subscribe' in dict['text']) or
                                (len(dict['text']) < 300)):
                            continue
                        else:
                            try:
                                # replace newlines and delimiter chars
                                # and write the article to csv
                                writer.writerow([dict['thread']['uuid'],
                                                 dict['thread']['title']
                                                     .replace('|', '-'),
                                                 dict['text']
                                                     .replace('\n', '')
                                                     .replace('\r', '')
                                                     .replace('|', '-'),
                                                 dict['thread']['site'],
                                                 dict['thread']['site_section']
                                                     .replace('\n', '')
                                                     .replace('\r', ''),
                                                 dict['url'],
                                                 dict['published']])
                                a += 1
                            # handle undefined characters (videos and other spam)
                            except UnicodeEncodeError:
                                print('# filtered out: {} (UnicodeEncodeError)'
                                      .format(dict['thread']['site_section']))
            print('# saved articles in file {}, now {} in total'
                  .format(output_file, a))
            print('#')
        print('# saved {} articles in total'.format(a))
        print('#')


if __name__ == '__main__':
    JsonHandler.write_articles_to_csv_files()
    #JsonHandler.create_labeling_dataset()