thesis-anne/FileHandler.py

# -*- coding: utf-8 -*-
'''
File Handler
============

FileHandler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''
import csv
import glob
import json

import numpy as np
import pandas as pd
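
# Typical usage (a sketch of the assumed workflow; both calls also appear,
# commented out, in the __main__ block at the bottom of this file):
#
#   FileHandler.write_articles_to_csv_files()  # JSON dumps -> data\articles\all_<month>.csv
#   FileHandler.create_labeling_dataset()      # sample ~10000 articles for labeling
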
class FileHandler:

    # strings for every month of the year
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
              '11', '12']

    def select_randoms(df, n):
        '''select n random samples from dataset.
        params: df DataFrame to select items from,
                n number of items to select randomly,
        return new DataFrame with only selected items.
        '''
        # initialize random => reproducible sequence
        np.random.seed(5)
        # add new column 'Random'
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by random numbers
        df = df.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return df.iloc[0:n]
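
    # Example of the assumed call pattern (mirrors create_labeling_dataset
    # below): the returned frame still carries the helper 'Random' column,
    # which the caller is expected to drop:
    #
    #   sample = FileHandler.select_randoms(df, 833)
    #   del sample['Random']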

    def create_labeling_dataset():
        # output file
        o_file = 'data\\interactive_labeling_dataset.csv'
        # create file and write header
        with open(o_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(['Uuid',         #0
                             'Title',        #1
                             'Text',         #2
                             'Site',         #3
                             'SiteSection',  #4
                             'Url',          #5
                             'Timestamp'])   #6
        for m in FileHandler.months:
            df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
                             delimiter='|',
                             header=0,
                             index_col=None,
                             engine='python',
                             quoting=csv.QUOTE_NONNUMERIC,
                             quotechar='\'')
            # number of articles to select per month (10000/12 = 833.33):
            # pick one more from every third month, so 8*833 + 4*834 = 10000
            n_select = 834 if m in ['03', '06', '09', '12'] else 833
            random_articles = FileHandler.select_randoms(df, n_select)
            del random_articles['Random']
            random_articles.to_csv(o_file,
                                   header=False,
                                   index=False,
                                   sep='|',
                                   mode='a',
                                   encoding='utf-8',
                                   quoting=csv.QUOTE_NONNUMERIC,
                                   quotechar='\'')

    def write_articles_to_csv_files():
        '''read JSON files, select articles and write them to csv.
        '''
        # reliable sources (site_sections)
        site_sections = []
        # read list from 'sections.txt' file
        with open('data\\sections.txt', 'r') as s_list:
            site_sections = s_list.read().split('\n')
        # article counter
        a = 0
        for m in FileHandler.months:
            # 1 output file per month
            output_file = 'data\\articles\\all_{}.csv'.format(m)
            # path of input JSON files per month
            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
                   '\\news_[0-9]*.json'.format(m)
            files = glob.glob(path)
            # file counter
            n = 0
            # write separate csv file for every month
            with open(output_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter='|',
                                    quotechar='\'',
                                    quoting=csv.QUOTE_NONNUMERIC)
                # write header / column names
                writer.writerow(['Uuid',         #0
                                 'Title',        #1
                                 'Text',         #2
                                 'Site',         #3
                                 'SiteSection',  #4
                                 'Url',          #5
                                 'Timestamp'])   #6
                # write articles
                for file in files:
                    n += 1
                    # read every JSON file
                    with open(file, encoding='utf-8') as f:
                        # JSON is converted to a dict
                        dict = json.load(f)
                        # check if comment or post
                        if ((dict['ord_in_thread'] != 0) or
                                # check if not english
                                (dict['language'] != 'english') or
                                # check if spam
                                (dict['thread']['spam_score'] > 0.3) or
                                # check if reliable source
                                (dict['thread']['site_section']
                                 not in site_sections) or
                                # check if text parsed correctly
                                ('Further company coverage:' in dict['text']) or
                                ('subscription' in dict['text']) or
                                ('subscribe' in dict['text']) or
                                (len(dict['text']) < 200)):
                            continue
                        else:
                            try:
                                # replace whitespaces and delimiter chars
                                # and write to csv
                                writer.writerow(
                                    [dict['thread']['uuid'],
                                     dict['thread']['title']
                                     .replace('|', '-'),
                                     dict['text']
                                     .replace('\n', '')
                                     .replace('\r', '')
                                     .replace('|', '-'),
                                     dict['thread']['site'],
                                     dict['thread']['site_section']
                                     .replace('\n', '')
                                     .replace('\r', ''),
                                     dict['url'],
                                     dict['published']])
                                a += 1
                            # handle undefined characters (videos and other spam)
                            except UnicodeEncodeError:
                                print('# filtered out: {} (UnicodeEncodeError)'
                                      .format(dict['thread']['site_section']))
            print('# saved articles in file {}, now {} in total'
                  .format(output_file, a))
            print('#')
        print('# saved {} articles in total'.format(a))
        print('#')

    def join_all_csv_files():
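        # The original file declares this method without a body; the following
        # is only a minimal sketch of what the name suggests, assuming the
        # monthly files written by write_articles_to_csv_files() are simply
        # concatenated into one csv with the same column layout and the
        # (assumed) output path 'data\\articles\\all.csv'.
        o_file = 'data\\articles\\all.csv'
        frames = []
        for m in FileHandler.months:
            frames.append(pd.read_csv('data\\articles\\all_{}.csv'.format(m),
                                      delimiter='|',
                                      header=0,
                                      index_col=None,
                                      engine='python',
                                      quoting=csv.QUOTE_NONNUMERIC,
                                      quotechar='\''))
        pd.concat(frames, ignore_index=True).to_csv(o_file,
                                                    index=False,
                                                    sep='|',
                                                    encoding='utf-8',
                                                    quoting=csv.QUOTE_NONNUMERIC,
                                                    quotechar='\'')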


if __name__ == '__main__':
    # FileHandler.write_articles_to_csv_files()
    # FileHandler.create_labeling_dataset()
    pass