# thesis-anne/src/FileHandler.py

'''
File Handler
============

FileHandler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''

# -*- coding: utf-8 -*-

import csv
import glob
import json
import string

import numpy as np
import pandas as pd

class FileHandler:
    '''Helpers that turn raw JSON news articles into csv data sets.

    Every csv file handled here uses '|' as delimiter, a single quote as
    quote character and quotes all non-numeric fields.
    The methods use no instance state and are meant to be called on the
    class itself, e.g. FileHandler.remove_duplicates().
    '''

    # strings for every month of the year
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
              '11', '12']

    # column names of the article csv files
    _HEADER = ['Uuid',        #0
               'Title',       #1
               'Text',        #2
               'Site',        #3
               'SiteSection', #4
               'Url',         #5
               'Timestamp']   #6

    # months that contribute one article more (8 * 833 + 4 * 834 = 10000)
    _EXTRA_MONTHS = ('03', '06', '09', '12')

    # sampled data set that is cleaned and deduplicated in place
    _DATASET_FILE = '..\\data\\cleaned_data_set_without_header.csv'

    # one csv file with all selected articles per month
    _MONTH_FILE = '..\\data\\articles\\all_{}.csv'

    # glob pattern of the JSON input files of one month
    _JSON_GLOB = ('C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'
                  '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'
                  '\\news_[0-9]*.json')

    # characters that survive clean_articles()
    _PRINTABLE = frozenset(string.printable)

    @staticmethod
    def select_randoms(df, n):
        '''select n random samples from dataset.
        params: df DataFrame to select items from (left unmodified),
        n number of items to select randomly,
        return new DataFrame with only the selected items and an
        additional column 'Random' holding the sort keys.
        '''
        # private generator with fixed seed => reproducible sequence that
        # equals np.random.seed(5) + np.random.randn(), but leaves the
        # global random state of the caller untouched
        rng = np.random.RandomState(5)
        # add column 'Random' to a copy and sort the copy by it
        shuffled = df.assign(Random=rng.randn(len(df))).sort_values('Random')
        # return first n elements of randomly sorted dataset
        return shuffled.iloc[0:n]

    @staticmethod
    def _samples_for_month(m):
        '''return number of articles to sample from month m ('01'..'12').
        '''
        # 10000 / 12 = 833,33 => every third month delivers one more
        return 834 if m in FileHandler._EXTRA_MONTHS else 833

    @staticmethod
    def _read_dataset(path):
        '''read a header-less article csv file into a DataFrame.
        '''
        return pd.read_csv(path,
                           delimiter='|',
                           header=None,
                           index_col=None,
                           engine='python',
                           quoting=csv.QUOTE_NONNUMERIC,
                           quotechar='\'')

    @staticmethod
    def _write_dataset(df, path, mode='w'):
        '''write df to path without header and index.
        params: mode 'w' replaces the file, 'a' appends to it.
        '''
        df.to_csv(path,
                  header=False,
                  index=False,
                  sep='|',
                  mode=mode,
                  encoding='utf-8',
                  quoting=csv.QUOTE_NONNUMERIC,
                  quotechar='\'')

    @staticmethod
    def _strip_non_printable(value):
        '''return value without characters missing in string.printable.
        Non-string values (e.g. NaN of an empty cell) are returned as is.
        '''
        if not isinstance(value, str):
            return value
        return ''.join(ch for ch in value if ch in FileHandler._PRINTABLE)

    @staticmethod
    def create_labeling_dataset():
        '''sample articles from every monthly csv file (10000 in total)
        and collect them in one data set file.
        '''
        # output file
        o_file = FileHandler._DATASET_FILE
        # create (or truncate) file and write header
        # NOTE(review): clean_articles() and remove_duplicates() read this
        # file with header=None, so this row is treated as an article
        # there - confirm whether the header is really wanted.
        with open(o_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(FileHandler._HEADER)
        for m in FileHandler.months:
            df = pd.read_csv(FileHandler._MONTH_FILE.format(m),
                             delimiter='|',
                             header=0,
                             index_col=None,
                             engine='python',
                             quoting=csv.QUOTE_NONNUMERIC,
                             quotechar='\'')
            # the quota is determined per month; a value carried over from
            # the previous month would select too many articles
            n_select = FileHandler._samples_for_month(m)
            random_articles = FileHandler.select_randoms(df, n_select)
            # the sort key is not part of the data set
            random_articles = random_articles.drop(columns='Random')
            FileHandler._write_dataset(random_articles, o_file, mode='a')

    @staticmethod
    def clean_articles(path=_DATASET_FILE):
        '''clean articles in data set: filter out all non-printable characters
        of headline and text.
        params: path csv file that is read and replaced by its cleaned
        version.
        '''
        # read data set
        df = FileHandler._read_dataset(path)
        # filter headline (column 1) and text (column 2); assigning whole
        # columns is required, writing to df.iloc[i][k] only changes a copy
        for column in (1, 2):
            df[column] = df[column].map(FileHandler._strip_non_printable)
        print(df)
        # save cleaned dataframe (replaces the previous content)
        FileHandler._write_dataset(df, path)

    @staticmethod
    def remove_duplicates(path=_DATASET_FILE):
        '''remove articles with exactly same headline,
        only the first article of each headline is kept.
        params: path csv file that is read and replaced by its
        deduplicated version.
        '''
        # read data set
        df = FileHandler._read_dataset(path)
        print(type(df))

        # rows whose headline (column 1) already occurred further up;
        # missing headlines are never considered equal to each other
        repeated = df.duplicated(subset=[1], keep='first') & df[1].notna()
        df = df[~repeated]

        # save cleaned dataframe (replaces the previous content)
        FileHandler._write_dataset(df, path)

    @staticmethod
    def _is_relevant(article, site_sections):
        '''return True if the parsed JSON article passes all filters.
        params: article dict loaded from one JSON file,
        site_sections collection of accepted site sections.
        '''
        text = article['text']
        thread = article['thread']
        return not (
            # check if comment or post
            (article['ord_in_thread'] != 0) or
            # check if not english
            (article['language'] != 'english') or
            # check if spam
            (thread['spam_score'] > 0.3) or
            # check if reliable source
            (thread['site_section'] not in site_sections) or
            # check if text parsed correctly
            ('Further company coverage:' in text) or
            ('subscription' in text) or
            ('subscribe' in text) or
            (len(text) < 200))

    @staticmethod
    def _to_row(article):
        '''return the csv row of an article, whitespaces and delimiter
        chars are replaced.
        '''
        thread = article['thread']
        return [thread['uuid'],
                thread['title'].replace('|', '-'),
                article['text'].replace('\n', '')
                               .replace('\r', '')
                               .replace('|', '-'),
                thread['site'],
                thread['site_section'].replace('\n', '')
                                      .replace('\r', ''),
                article['url'],
                article['published']]

    @staticmethod
    def write_articles_to_csv_files():
        '''read JSON files, select articles and write them to csv,
        one csv file per month.
        '''
        # reliable sources (site_sections), read from 'sections.txt' file;
        # a set keeps the membership test per article cheap
        with open('..\\data\\sections.txt', 'r') as s_list:
            site_sections = set(s_list.read().split('\n'))

        # article counter
        a = 0
        for m in FileHandler.months:
            # 1 output file per month
            output_file = FileHandler._MONTH_FILE.format(m)
            # input JSON files of this month
            files = glob.glob(FileHandler._JSON_GLOB.format(m))

            # write separate csv file for every month
            with open(output_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter='|',
                                    quotechar='\'',
                                    quoting=csv.QUOTE_NONNUMERIC)

                # write header / column names
                writer.writerow(FileHandler._HEADER)
                # write articles
                for file in files:
                    # read every JSON file, Json is converted to dict
                    with open(file, encoding='utf-8') as f:
                        article = json.load(f)
                    if not FileHandler._is_relevant(article, site_sections):
                        continue
                    try:
                        writer.writerow(FileHandler._to_row(article))
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out: {} (UnicodeEncodeError)'
                                    .format(article['thread']['site_section']))
                    else:
                        a += 1
            print('# saved articles in file {}, now {} in total'.format(output_file, a))
            print('#')
        print('# saved {} articles in total'.format(a))
        print('#')

if __name__ == '__main__':
    # Steps of the data set pipeline; only the uncommented one is executed
    # when this file is run as a script.
    #
    # FileHandler.write_articles_to_csv_files()
    # FileHandler.create_labeling_dataset()
    FileHandler.remove_duplicates()