thesis-anne/JSONHandler.py

'''
JSON Handler
============

JSON Handler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''

# -*- coding: utf-8 -*-

import csv
import glob
import json

import numpy as np
import pandas as pd

class JsonHandler:
    '''Collects news articles from JSON files into a '|'-delimited csv file.

    All methods are static: they can be called on the class
    (JsonHandler.create_csv(...)) as well as on an instance.
    '''

    # default glob pattern of the JSON files to read
    DEFAULT_PATH = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'

    # reliable sources (site_sections); articles from other sections are dropped
    SITE_SECTIONS = ('http://feeds.reuters.com/reuters/financialsNews',
                     'http://feeds.reuters.com/reuters/INbusinessNews',
                     'http://feeds.reuters.com/reuters/businessNews',
                     'http://feeds.reuters.com/reuters/companyNews',
                     'http://www.reuters.com/finance/deals',
                     'http://feeds.reuters.com/reuters/mergersNews',
                     'http://rss.cnn.com/rss/money_topstories.rss',
                     'http://rss.cnn.com/rss/money_latest.rss',
                     'http://www.economist.com/sections/business-finance/rss.xml',
                     'http://rss.cnn.com/rss/edition_business.rss',
                     'http://in.reuters.com/finance/deals',
                     'http://feeds.reuters.com/reuters/technologyNews',
                     'http://feeds.reuters.com/reuters/technologysectorNews',
                     'https://www.ft.com/companies/us',
                     'http://feeds.reuters.com/reuters/UKScienceNews',
                     'http://in.reuters.com/news/technology',
                     'http://in.reuters.com/finance/economy',
                     'https://www.bloomberg.com/middleeast',
                     'http://in.reuters.com/news/top-news')

    # header / column names, in the order the article fields are written
    HEADER = ['Uuid',        #0
              'Title',       #1
              'Text',        #2
              'Site',        #3
              'SiteSection', #4
              'Url',         #5
              'Timestamp']   #6

    @staticmethod
    def _csv_writer(csvfile):
        '''returns a csv writer with the dialect shared by header and rows:
        '|' as delimiter, single quote as quotechar,
        every non-numeric field quoted.
        '''
        return csv.writer(csvfile,
                          delimiter='|',
                          quotechar='\'',
                          quoting=csv.QUOTE_NONNUMERIC)

    @staticmethod
    def select_randoms(df, n):
        '''selects n random samples from dataset.
        params: df DataFrame to select items from (left unmodified),
        n number of items to select randomly,
        returns new DataFrame with only selected items
        and an additional column 'Random' holding the sort keys
        '''
        # private generator with fixed seed => reproducible sequence,
        # without reseeding numpy's global random state
        rng = np.random.RandomState(5)
        # add column 'Random' to a copy, so the caller's DataFrame is untouched
        shuffled = df.assign(Random=rng.randn(len(df)))
        # sort DataFrame by random numbers
        shuffled = shuffled.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return shuffled.iloc[0:n]

    @staticmethod
    def create_csv(file_name):
        '''creates (or overwrites) csv file file_name
        and writes the header row into it.
        Each following row is meant to contain a news article.
        '''
        with open(file_name, 'w', newline='') as csvfile:
            JsonHandler._csv_writer(csvfile).writerow(JsonHandler.HEADER)

    @staticmethod
    def write_articles_to_csv(file_name, path=None, site_sections=None):
        '''appends all relevant articles found in JSON files to a csv file.
        params: file_name csv file to append to (see create_csv),
        path glob pattern of the JSON files, defaults to DEFAULT_PATH,
        site_sections collection of accepted site sections,
        defaults to SITE_SECTIONS.
        Prints the number of saved articles when done.
        '''
        if path is None:
            path = JsonHandler.DEFAULT_PATH
        if site_sections is None:
            site_sections = JsonHandler.SITE_SECTIONS

        # article counter
        saved = 0
        # the csv file is opened without explicit encoding, i.e. with the
        # platform default; rows it cannot encode are skipped further below
        with open(file_name, 'a', newline='') as csvfile:
            writer = JsonHandler._csv_writer(csvfile)
            # read every JSON file matching the pattern
            for json_file in glob.glob(path):
                with open(json_file, encoding='utf-8') as f:
                    # Json is converted to dict
                    post = json.load(f)

                # leave out comments or posts and non-english texts
                if post['ord_in_thread'] != 0 or post['language'] != 'english':
                    continue
                thread = post['thread']
                # leave out likely spam and sources not in the whitelist
                if (thread['spam_score'] > 0.3 or
                        thread['site_section'] not in site_sections):
                    continue

                # pick only relevant information of article,
                # in the order of HEADER;
                # delimiter char is replaced in title and text,
                # newlines are replaced in text only
                article = [thread['uuid'],                   # 0:'Uuid'
                           thread['title'].replace('|', '-'),# 1:'Title'
                           post['text'].replace('\n', ' ')
                                       .replace('\r', ' ')
                                       .replace('|', '-'),   # 2:'Text'
                           thread['site'],                   # 3:'Site'
                           thread['site_section'],           # 4:'SiteSection'
                           post['url'],                      # 5:'Url'
                           post['published']]                # 6:'Timestamp'

                try:
                    writer.writerow(article)
                # handle undefined characters (videos and other spam)
                except UnicodeEncodeError:
                    print('# filtered out site_section: {} (UnicodeEncodeError)'
                                .format(thread['site_section']))
                else:
                    saved += 1
        print()
        print('# saved {} articles in file {}'.format(saved, file_name))

if __name__ == '__main__':
    # script entry point: start a fresh csv file containing only the header,
    # then append the selected articles to it
    output_csv = 'test.csv'
    JsonHandler.create_csv(output_csv)
    JsonHandler.write_articles_to_csv(output_csv)
removed csvHandler.py 2018-10-18 11:57:46 +00:00			`'''`
			`JSON Handler`
			`============`

			`JSON Handler reads articles from JSON files,`
			`extracts relevant information and`
			`writes it to a csv file.`
			`'''`

			`# -*- coding: utf-8 -*-`

			`import csv`
			`import glob`
			`import json`

			`import numpy as np`
			`import pandas as pd`

			`class JsonHandler:`

			`def select_randoms(df, n):`
			`'''selects n random samples from dataset.`
			`params: df DataFrame to select items from,`
			`n number of items to select randomly,`
			`returns new DataFrame with only selected items`
			`'''`
			`# initialize random => reproducible sequence`
			`np.random.seed(5)`
			`# add new column 'Random'`
			`df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)`
			`# sort DataFrame by random numbers`
			`df = df.sort_values('Random')`
			`# return first n elements of randomly sorted dataset`
			`return df.iloc[0:n]`

			`def create_csv(file_name):`
			`# create new csv file for each month.`
			`# each row contains an news article.`

			`with open(file_name, 'w', newline='') as csvfile:`
			`writer = csv.writer(csvfile,`
			`delimiter='|',`
			`quotechar='\'',`
			`quoting=csv.QUOTE_NONNUMERIC)`
			`# write header / column names`
			`writer.writerow(['Uuid', #0`
			`'Title', #1`
			`'Text', #2`
			`'Site', #3`
			`'SiteSection', #4`
			`'Url', #5`
			`'Timestamp']) #6`

			`def write_articles_to_csv(file_name):`
			`# path of JSON files`
			`path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'`
			`files = glob.glob(path)`

			`# reliable sources (site_sections)`
			`site_sections = ['http://feeds.reuters.com/reuters/financialsNews',`
			`'http://feeds.reuters.com/reuters/INbusinessNews',`
			`'http://feeds.reuters.com/reuters/businessNews',`
			`'http://feeds.reuters.com/reuters/companyNews',`
			`'http://www.reuters.com/finance/deals',`
			`'http://feeds.reuters.com/reuters/mergersNews',`
			`'http://rss.cnn.com/rss/money_topstories.rss',`
			`'http://rss.cnn.com/rss/money_latest.rss',`
			`'http://www.economist.com/sections/business-finance/rss.xml',`
			`'http://rss.cnn.com/rss/edition_business.rss',`
			`'http://in.reuters.com/finance/deals',`
			`'http://feeds.reuters.com/reuters/technologyNews',`
			`'http://feeds.reuters.com/reuters/technologysectorNews',`
			`'https://www.ft.com/companies/us',`
			`'http://feeds.reuters.com/reuters/UKScienceNews',`
			`'http://in.reuters.com/news/technology',`
			`'http://in.reuters.com/finance/economy',`
			`'https://www.bloomberg.com/middleeast',`
			`'http://in.reuters.com/news/top-news']`

			`# file counter`
			`n = 0`
			`# article counter`
			`a = 0`
			`# read every JSON file in current folder`
			`with open(file_name, 'a', newline='') as csvfile:`
			`writer = csv.writer(csvfile,`
			`delimiter='|',`
			`quotechar='\'',`
			`quoting=csv.QUOTE_NONNUMERIC)`
			`for file in files:`
			`n += 1`
			`with open(file, encoding='utf-8') as f:`
			`# Json is converted to dict`
			`dict = json.load(f)`
			`#print(n)`
			`# leave out comments or posts, take only reuters as source`
			`if ((dict['ord_in_thread'] != 0) or`
			`(dict['language'] != 'english') or`
			`(dict['thread']['spam_score'] > 0.3) or`
			`(dict['thread']['site_section'] not in site_sections)):`
			`continue`
			`# pick only relevant information of article`
			`# and put in in list`
			`article = [dict['thread']['uuid'], # 0:'Uuid'`
			`dict['thread']['title'], # 1:'Title'`
			`dict['text'], # 2:'Text'`
			`dict['thread']['site'], # 3:'Site'`
			`dict['thread']['site_section'], # 4:'SiteSection'`
			`dict['url'], # 5:'Url'`
			`dict['published']] # 6:'Timestamp'`

			`# remove newlines and delimiter char`
			`article[1] = article[1].replace('|', '-') # in 'Title'`
			`article[2] = article[2].replace('\n', ' ').replace('\r', ' ').replace('|', '-') # in 'Text'`

			`try:`
			`writer.writerow(article)`
			`a += 1`
			`# handle undefined characters (videos and other spam)`
			`except UnicodeEncodeError:`
			`print('# filtered out site_section: {} (UnicodeEncodeError)'`
			`.format(dict['thread']['site_section']))`
			`print()`
			`print('# saved {} articles in file {}'.format(a, file_name))`

			`if __name__ == '__main__':`
			`file_name = 'test.csv'`
			`JsonHandler.create_csv(file_name)`
			`JsonHandler.write_articles_to_csv(file_name)`