thesis-anne/JsonHandler.py

'''
Json Handler
============

JsonHandler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''

# -*- coding: utf-8 -*-

import csv
import glob
import json
import os

import numpy as np
import pandas as pd

class JsonHandler:
    '''Collect news articles from JSON files and sample a labeling dataset.

    The class only serves as a namespace: it keeps no instance state and
    all methods are static, so they are called as JsonHandler.method().
    '''

    # string for every month of the year
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
              '11', '12']

    # default root folder of the raw JSON dataset (one sub folder per month)
    DATASET_DIR = ('C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'
                   '\\new_dataset')

    # column names of the monthly csv files
    _CSV_HEADER = ['Uuid',         #0
                   'Title',        #1
                   'Text',         #2
                   'Site',         #3
                   'SiteSection',  #4
                   'Url',          #5
                   'Timestamp']    #6

    @staticmethod
    def select_randoms(df, n):
        '''select n random samples from dataset.
        params: df DataFrame to select items from,
        n number of items to select randomly,
        return new DataFrame with only selected items. The result carries
        an additional column 'Random' holding the sort key; the DataFrame
        passed in is left unchanged.
        '''
        # initialize random => reproducible sequence
        # (this reseeds numpy's global random generator)
        np.random.seed(5)
        # add new column 'Random' to a copy, so the caller's DataFrame
        # is not modified
        shuffled = df.assign(Random=np.random.randn(len(df)))
        # sort DataFrame by random numbers
        shuffled = shuffled.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return shuffled.iloc[0:n]

    @staticmethod
    def create_labeling_dataset():
        '''sample articles from every monthly csv file for labeling.
        Reads 'all_<month>.csv' for all months from the working directory
        and appends the random selection of each month to
        'labeling_dataset.csv'.
        '''
        output_file = 'labeling_dataset.csv'
        # number of articles to select from each month:
        # 10.000 / 12 = 833,33
        # => 833 per month, except every third month:
        every_third_month = ['03', '06', '09', '12']
        for m in JsonHandler.months:
            df = pd.read_csv('all_{}.csv'.format(m),
                             delimiter='|',
                             header=0,
                             index_col=None,
                             engine='python',
                             quotechar='\'',
                             quoting=csv.QUOTE_MINIMAL,
                             encoding='utf-8')
            # pick one more from every third month; decided anew for each
            # month so the increased number does not stick
            n_select = 834 if m in every_third_month else 833
            # the file is written in append mode: only emit the column
            # names if nothing has been written to it yet
            write_header = (not os.path.exists(output_file)
                            or os.path.getsize(output_file) == 0)
            JsonHandler.select_randoms(df, n_select).to_csv(
                output_file,
                header=write_header,
                mode='a',
                encoding='utf-8',
                quoting=csv.QUOTE_MINIMAL,
                quotechar='\'')

    @staticmethod
    def _is_relevant(article, site_sections):
        '''return True if the article (dict) passes all filter criteria.'''
        text = article['text']
        thread = article['thread']
        # check if comment or post
        if article['ord_in_thread'] != 0:
            return False
        # check if not english
        if article['language'] != 'english':
            return False
        # check if spam
        if thread['spam_score'] > 0.3:
            return False
        # check if reliable source
        if thread['site_section'] not in site_sections:
            return False
        # check if text parsed correctly
        if 'Further company coverage:' in text:
            return False
        # both keywords have to be tested separately
        if any(word in text for word in ('subscription', 'subscribe')):
            return False
        return len(text) >= 300

    @staticmethod
    def _to_csv_row(article):
        '''return the csv columns of an article (dict) as list.
        Line breaks and delimiter chars are removed / replaced.
        '''
        thread = article['thread']
        return [thread['uuid'],
                thread['title'].replace('|', '-'),
                article['text'].replace('\n', '')
                               .replace('\r', '')
                               .replace('|', '-'),
                thread['site'],
                thread['site_section'].replace('\n', '')
                                      .replace('\r', ''),
                article['url'],
                article['published']]

    @staticmethod
    def write_articles_to_csv_files(dataset_dir=None):
        '''read JSON files, select articles and write them to csv.
        params: dataset_dir root folder containing one sub folder
        '2017_<month>_ccc517fd45024a87c12318299efc50a4' per month,
        defaults to JsonHandler.DATASET_DIR.
        The list of reliable site sections is read from 'sections.txt'
        and one file 'all_<month>.csv' per month is written, both
        relative to the working directory.
        '''
        if dataset_dir is None:
            dataset_dir = JsonHandler.DATASET_DIR

        # reliable sources (site_sections)
        # read list from 'sections.txt' file
        with open('sections.txt', 'r') as s_list:
            site_sections = set(s_list.read().split('\n'))
        # a trailing newline / blank line must not whitelist articles
        # without site section
        site_sections.discard('')

        # article counter
        a = 0
        for m in JsonHandler.months:
            # 1 output file per month
            output_file = 'all_{}.csv'.format(m)
            # path of input JSON files per month
            path = os.path.join(
                dataset_dir,
                '2017_{}_ccc517fd45024a87c12318299efc50a4'.format(m),
                'news_[0-9]*.json')
            files = glob.glob(path)

            # write separate csv file for every month
            # NOTE(review): the file is written with the platform default
            # encoding while create_labeling_dataset reads it as utf-8;
            # the UnicodeEncodeError filter below depends on that default
            # - confirm which encoding is intended.
            with open(output_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter='|',
                                    quotechar='\'',
                                    quoting=csv.QUOTE_NONNUMERIC)

                # write header / column names
                writer.writerow(JsonHandler._CSV_HEADER)
                # write articles
                for file_name in files:
                    # read every JSON file, Json is converted to dict
                    with open(file_name, encoding='utf-8') as f:
                        article = json.load(f)
                    if not JsonHandler._is_relevant(article, site_sections):
                        continue
                    try:
                        # replace whitespaces and delimiter chars
                        # and write to csv
                        writer.writerow(JsonHandler._to_csv_row(article))
                        a += 1
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out: {} (UnicodeEncodeError)'
                              .format(article['thread']['site_section']))
            print('# saved articles in file {}, now {} in total'
                  .format(output_file, a))
            print('#')
        print('# saved {} articles in total'.format(a))
        print('#')

# script entry point: only runs when the file is executed directly,
# not when it is imported as a module
if __name__ == '__main__':
    # read the JSON files and write one csv file per month
    JsonHandler.write_articles_to_csv_files()
    # currently disabled; this call reads the monthly csv files written above
    #JsonHandler.create_labeling_dataset()
refactoring 2018-10-22 08:17:52 +00:00			`'''`
			`Json Handler`
			`============`

			`JsonHandler reads articles from JSON files,`
			`extracts relevant information and`
			`writes it to a csv file.`
			`'''`

			`# -- coding: utf-8 --`

			`import csv`
			`import glob`
			`import json`

			`import numpy as np`
			`import pandas as pd`

			`class JsonHandler:`

			`# string for every month of the year`
			`months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',`
			`'11', '12']`

			`def select_randoms(df, n):`
			`'''select n random samples from dataset.`
			`params: df DataFrame to select items from,`
			`n number of items to select randomly,`
			`return new DataFrame with only selected items.`
			`'''`

			`# initialize random => reproducible sequence`
			`np.random.seed(5)`
			`# add new column 'Random'`
			`df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)`
			`# sort DataFrame by random numbers`
			`df = df.sort_values('Random')`
			`# return first n elements of randomly sorted dataset`
			`return df.iloc[0:n]`

			`def create_labeling_dataset():`
			`# number of articles to select from each month:`
			`# 10.000 / 12 = 833,33`
			`n_select = 833`
			`# except every third month:`
			`every_third_month = ['03', '06', '09', '12']`
			`for m in JsonHandler.month:`
			`df = pandas.read_csv('all_{}.csv'.format(m),`
			`delimiter='\|',`
			`header=0,`
			`index_col=None,`
			`engine='python',`
			`quotechar='\'',`
			`quoting=0,`
			`encoding='utf-8')`
			`# pick one more from every third article`
			`if m in every_third_month:`
			`n_select = 834`
			`JsonHandler.select_randoms(df, n_select).to_csv('labeling_dataset.csv',`
			`header=True,`
			`mode='a',`
			`encoding='python',`
			`quoting=QUOTE_MINIMAL,`
			`quotechar='\'')`

			`def write_articles_to_csv_files():`
			`'''read JSON files, select articles and write them to csv.`
			`'''`
			`# reliable sources (site_sections)`
			`site_sections = []`
			`# read list from 'sections.txt' file`
			`with open('sections.txt', 'r') as s_list:`
			`site_sections = s_list.read().split('\n')`

			`# article counter`
			`a = 0`
			`for m in JsonHandler.months:`
			`# 1 output file per month`
			`output_file = 'all_{}.csv'.format(m)`
			`# path of input JSON files per month`
			`path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\`
			`'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\`
			`'\\news_[0-9]*.json'.format(m)`
			`files = glob.glob(path)`

			`# file counter`
			`n = 0`
			`# write separate csv file for every month`
			`with open(output_file, 'w', newline='') as csvfile:`
			`writer = csv.writer(csvfile,`
			`delimiter='\|',`
			`quotechar='\'',`
			`quoting=csv.QUOTE_NONNUMERIC)`

			`# write header / column names`
			`writer.writerow(['Uuid', #0`
			`'Title', #1`
			`'Text', #2`
			`'Site', #3`
			`'SiteSection', #4`
			`'Url', #5`
			`'Timestamp']) #6`
			`# write articles`
			`for file in files:`
			`n += 1`
			`# read every JSON file`
			`with open(file, encoding='utf-8') as f:`
			`# Json is converted to dict`
			`dict = json.load(f)`
			`# check if comment or post`
			`if ((dict['ord_in_thread'] != 0) or`
			`# check if not english`
			`(dict['language'] != 'english') or`
			`# check if spam`
			`(dict['thread']['spam_score'] > 0.3) or`
			`# check if reliable source`
			`(dict['thread']['site_section'] not in site_sections) or`
			`# check if text parsed correctly`
			`('Further company coverage:' in dict['text']) or`
			`(('subscription' or 'subscribe') in dict['text']) or`
			`(len(dict['text']) < 300)):`
			`continue`
			`else:`
			`try:`
			`# replace whitespaces and delimiter chars`
			`# and write to csv`
			`writer.writerow([dict['thread']['uuid'],`
			`dict['thread']['title']\`
			`.replace('\|', '-'),`
			`dict['text']\`
			`.replace('\n', '')\`
			`.replace('\r', '')\`
			`.replace('\|', '-'),`
			`dict['thread']['site'],`
			`dict['thread']['site_section']\`
			`.replace('\n', '')\`
			`.replace('\r', ''),`
			`dict['url'],`
			`dict['published']])`
			`a += 1`
			`# handle undefined characters (videos and other spam)`
			`except UnicodeEncodeError:`
			`print('# filtered out: {} (UnicodeEncodeError)'`
			`.format(dict['thread']['site_section']))`
			`print('# saved articles in file {}, now {} in total'.format(output_file, a))`
			`print('#')`
			`print('# saved {} articles in total'.format(a))`
			`print('#')`

			`if __name__ == '__main__':`
			`JsonHandler.write_articles_to_csv_files()`
			`#JsonHandler.create_labeling_dataset()`