# -*- coding: utf-8 -*-

'''
JSON Handler
============

JSON Handler reads articles from JSON files, extracts the relevant
information and writes it to a CSV file.
'''

import csv
import glob
import json

import numpy as np
import pandas as pd


class JsonHandler:

    @staticmethod
    def select_randoms(df, n):
        '''Selects n random samples from the dataset.

        params: df  DataFrame to select items from
                n   number of items to select randomly
        returns a new DataFrame containing only the selected items
        '''
        # seed the random generator => reproducible sequence
        np.random.seed(5)
        # add a new column 'Random' filled with random numbers
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort the DataFrame by the random numbers
        df = df.sort_values('Random')
        # return the first n elements of the randomly sorted dataset
        return df.iloc[0:n]

    @staticmethod
    def create_csv(file_name):
        '''Creates a new CSV file (one per month).
        Each row will contain one news article.
        '''
        with open(file_name, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # write header / column names
            writer.writerow(['Uuid',         # 0
                             'Title',        # 1
                             'Text',         # 2
                             'Site',         # 3
                             'SiteSection',  # 4
                             'Url',          # 5
                             'Timestamp'])   # 6

    @staticmethod
    def write_articles_to_csv(file_name):
        '''Reads every JSON file matching the path below, filters out
        comments, posts, spam and unknown sources, and appends the
        remaining articles to the CSV file.
        '''
        # path of the JSON files
        path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'
        files = glob.glob(path)

        # reliable sources (site sections)
        site_sections = ['http://feeds.reuters.com/reuters/financialsNews',
                         'http://feeds.reuters.com/reuters/INbusinessNews',
                         'http://feeds.reuters.com/reuters/businessNews',
                         'http://feeds.reuters.com/reuters/companyNews',
                         'http://www.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/mergersNews',
                         'http://rss.cnn.com/rss/money_topstories.rss',
                         'http://rss.cnn.com/rss/money_latest.rss',
                         'http://www.economist.com/sections/business-finance/rss.xml',
                         'http://rss.cnn.com/rss/edition_business.rss',
                         'http://in.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/technologyNews',
                         'http://feeds.reuters.com/reuters/technologysectorNews',
                         'https://www.ft.com/companies/us',
                         'http://feeds.reuters.com/reuters/UKScienceNews',
                         'http://in.reuters.com/news/technology',
                         'http://in.reuters.com/finance/economy',
                         'https://www.bloomberg.com/middleeast',
                         'http://in.reuters.com/news/top-news']

        # file counter
        n = 0
        # article counter
        a = 0

        # read every matching JSON file and append its article to the CSV file
        with open(file_name, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            for file in files:
                n += 1
                with open(file, encoding='utf-8') as f:
                    # the JSON document is converted to a dict
                    # (named 'data' to avoid shadowing the builtin 'dict')
                    data = json.load(f)
                    # skip comments and posts; keep only English, low-spam
                    # articles from the whitelisted site sections above
                    if ((data['ord_in_thread'] != 0) or
                            (data['language'] != 'english') or
                            (data['thread']['spam_score'] > 0.3) or
                            (data['thread']['site_section'] not in site_sections)):
                        continue
                    # pick only the relevant information of the article
                    # and put it in a list
                    article = [data['thread']['uuid'],          # 0:'Uuid'
                               data['thread']['title'],         # 1:'Title'
                               data['text'],                    # 2:'Text'
                               data['thread']['site'],          # 3:'Site'
                               data['thread']['site_section'],  # 4:'SiteSection'
                               data['url'],                     # 5:'Url'
                               data['published']]               # 6:'Timestamp'
                    # remove newlines and the delimiter character
                    article[1] = article[1].replace('|', '-')   # in 'Title'
                    article[2] = article[2].replace('\n', ' ')\
                                           .replace('\r', ' ')\
                                           .replace('|', '-')   # in 'Text'
                    try:
                        writer.writerow(article)
                        a += 1
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out site_section: {} (UnicodeEncodeError)'
                              .format(data['thread']['site_section']))
        print()
        print('# saved {} articles in file {}'.format(a, file_name))


if __name__ == '__main__':
    file_name = 'test.csv'
    JsonHandler.create_csv(file_name)
    JsonHandler.write_articles_to_csv(file_name)
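
# A minimal usage sketch for select_randoms(), which is otherwise unused in
# this module. It assumes the CSV produced above exists and that pandas reads
# it back with the same dialect the writer used (the read_csv arguments below
# mirror the writer settings):
#
#     df = pd.read_csv(file_name, delimiter='|', quotechar='\'',
#                      quoting=csv.QUOTE_NONNUMERIC)
#     sample = JsonHandler.select_randoms(df, 10)
#     print(sample['Title'])
#
# Because select_randoms() shuffles via a seeded random column, the same
# 10 articles are returned on every run.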