# -*- coding: utf-8 -*-
'''
File Handler
============

FileHandler reads articles from JSON files, extracts relevant
information and writes it to a csv file.
'''
import csv
import glob
import json

import numpy as np
import pandas as pd


class FileHandler:
    '''Reads crawled news articles (JSON), filters out unusable ones and
    stores the relevant columns in per-month csv files, from which a
    random subset can be drawn for interactive labeling.
    '''

    # two-digit strings for every month of the year
    months = ['01', '02', '03', '04', '05', '06',
              '07', '08', '09', '10', '11', '12']

    @staticmethod
    def select_randoms(df, n):
        '''Select n random samples from the dataset.

        Note: mutates ``df`` by attaching a helper column 'Random';
        the caller is expected to delete it afterwards.

        :param df: DataFrame to select items from
        :param n:  number of items to select randomly
        :return:   new DataFrame with only the selected items
        '''
        # initialize random => reproducible sequence
        np.random.seed(5)
        # add new column 'Random' with one random score per row
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by random numbers
        df = df.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return df.iloc[0:n]

    @staticmethod
    def create_labeling_dataset():
        '''Draw ~10000 articles evenly from the 12 monthly csv files and
        append them to a single csv dataset for interactive labeling.
        '''
        # output file
        o_file = 'data\\interactive_labeling_dataset.csv'
        # create file and write header
        with open(o_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter='|', quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(['Uuid',         #0
                             'Title',        #1
                             'Text',         #2
                             'Site',         #3
                             'SiteSection',  #4
                             'Url',          #5
                             'Timestamp'])   #6
        for m in FileHandler.months:
            df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
                             delimiter='|',
                             header=0,
                             index_col=None,
                             engine='python',
                             quoting=csv.QUOTE_NONNUMERIC,
                             quotechar='\'')
            # number of articles to select from each month
            # (10000/12 = 833.33 => one extra in every third month).
            # BUGFIX: this must be recomputed per month — the previous
            # one-way assignment made every month after March select 834,
            # yielding 10006 articles instead of 10000.
            n_select = 834 if m in ('03', '06', '09', '12') else 833
            random_articles = FileHandler.select_randoms(df, n_select)
            # drop the helper column added by select_randoms
            del random_articles['Random']
            random_articles.to_csv(o_file,
                                   header=False,
                                   index=False,
                                   sep='|',
                                   mode='a',
                                   encoding='utf-8',
                                   quoting=csv.QUOTE_NONNUMERIC,
                                   quotechar='\'')

    @staticmethod
    def write_articles_to_csv_files():
        '''Read JSON files, select articles and write them to csv.

        One csv file is written per month; an article is kept only if it
        is an original english post from a whitelisted site_section, has
        a low spam score and a sufficiently long, correctly parsed text.
        '''
        # reliable sources (site_sections), read from 'sections.txt'
        with open('data\\sections.txt', 'r') as s_list:
            site_sections = s_list.read().split('\n')
        # article counter
        a = 0
        for m in FileHandler.months:
            # 1 output file per month
            output_file = 'data\\articles\\all_{}.csv'.format(m)
            # path of input JSON files per month
            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
                   '\\news_[0-9]*.json'.format(m)
            files = glob.glob(path)
            # write separate csv file for every month
            with open(output_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile, delimiter='|', quotechar='\'',
                                    quoting=csv.QUOTE_NONNUMERIC)
                # write header / column names
                writer.writerow(['Uuid',         #0
                                 'Title',        #1
                                 'Text',         #2
                                 'Site',         #3
                                 'SiteSection',  #4
                                 'Url',          #5
                                 'Timestamp'])   #6
                # write articles
                for file in files:
                    # read every JSON file; each file holds one article
                    # converted to a plain dict
                    with open(file, encoding='utf-8') as f:
                        article = json.load(f)
                    # skip comments/posts, non-english articles, spam,
                    # unreliable sources and badly parsed texts.
                    # BUGFIX: the subscription filter used
                    # ('subscription' or 'subscribe') in text, which
                    # evaluates to 'subscription' in text and never
                    # tested for 'subscribe'.
                    if (article['ord_in_thread'] != 0
                            or article['language'] != 'english'
                            or article['thread']['spam_score'] > 0.3
                            or article['thread']['site_section']
                            not in site_sections
                            or 'Further company coverage:' in article['text']
                            or 'subscription' in article['text']
                            or 'subscribe' in article['text']
                            or len(article['text']) < 200):
                        continue
                    try:
                        # replace whitespaces and delimiter chars
                        # and write to csv
                        writer.writerow([article['thread']['uuid'],
                                         article['thread']['title']
                                         .replace('|', '-'),
                                         article['text']
                                         .replace('\n', '')
                                         .replace('\r', '')
                                         .replace('|', '-'),
                                         article['thread']['site'],
                                         article['thread']['site_section']
                                         .replace('\n', '')
                                         .replace('\r', ''),
                                         article['url'],
                                         article['published']])
                        a += 1
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out: {} (UnicodeEncodeError)'
                              .format(article['thread']['site_section']))
            print('# saved articles in file {}, now {} in total'
                  .format(output_file, a))
            print('#')
        print('# saved {} articles in total'.format(a))
        print('#')

    @staticmethod
    def join_all_csv_files():
        '''Concatenate all monthly csv files into one dataset.

        NOTE(review): the original definition had an empty body, which
        is a SyntaxError; kept as an explicit stub until implemented.
        '''
        raise NotImplementedError


if __name__ == '__main__':
    # FileHandler.write_articles_to_csv_files()
    # FileHandler.create_labeling_dataset()
    pass