# -*- coding: utf-8 -*-
'''
JsonHandler
===========

JsonHandler reads articles from JSON files, extracts the relevant
information and writes it to csv files.
'''
import csv
import glob
import json

import numpy as np
import pandas as pd


class JsonHandler:

    # one string for every month of the year
    months = ['01', '02', '03', '04', '05', '06',
              '07', '08', '09', '10', '11', '12']

    @staticmethod
    def select_randoms(df, n):
        '''Select n random samples from the dataset.

        params: df  DataFrame to select items from,
                n   number of items to select randomly.
        return: new DataFrame with only the selected items.
        '''
        # initialize random seed => reproducible sequence
        np.random.seed(5)
        # add new column 'Random' holding one random number per row
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by the random numbers
        df = df.sort_values('Random')
        # return the first n elements of the randomly sorted dataset
        return df.iloc[0:n]

    @staticmethod
    def create_labeling_dataset():
        '''Select random articles from the monthly csv files for labeling.'''
        # number of articles to select from each month:
        # 10,000 / 12 = 833.33
        # => pick one more from every third month
        every_third_month = ['03', '06', '09', '12']

        for m in JsonHandler.months:
            df = pd.read_csv('all_{}.csv'.format(m),
                             delimiter='|',
                             header=0,
                             index_col=None,
                             engine='python',
                             quotechar='\'',
                             quoting=csv.QUOTE_MINIMAL,
                             encoding='utf-8')

            n_select = 834 if m in every_third_month else 833

            # append the selection; write the header only for the first month
            JsonHandler.select_randoms(df, n_select)\
                       .to_csv('labeling_dataset.csv',
                               header=(m == '01'),
                               mode='a',
                               encoding='utf-8',
                               quoting=csv.QUOTE_MINIMAL,
                               quotechar='\'')

    @staticmethod
    def write_articles_to_csv_files():
        '''Read the JSON files, select articles and write them to csv.'''
        # reliable sources (site_sections)
        site_sections = []
        # read list of reliable sources from 'sections.txt'
        with open('sections.txt', 'r') as s_list:
            site_sections = s_list.read().split('\n')

        # article counter
        a = 0
        for m in JsonHandler.months:
            # one output file per month
            output_file = 'all_{}.csv'.format(m)
            # path of the input JSON files of that month
            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
                   '\\news_[0-9]*.json'.format(m)
            files = glob.glob(path)

            # file counter
            n = 0
            # write a separate csv file for every month
            with open(output_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter='|',
                                    quotechar='\'',
                                    quoting=csv.QUOTE_NONNUMERIC)
                # write header / column names
                writer.writerow(['Uuid',         #0
                                 'Title',        #1
                                 'Text',         #2
                                 'Site',         #3
                                 'SiteSection',  #4
                                 'Url',          #5
                                 'Timestamp'])   #6

                # write articles
                for file in files:
                    n += 1
                    # read every JSON file
                    with open(file, encoding='utf-8') as f:
                        # the JSON content is converted to a dict
                        dict = json.load(f)

                        # check if comment or post
                        if ((dict['ord_in_thread'] != 0) or
                                # check if not english
                                (dict['language'] != 'english') or
                                # check if spam
                                (dict['thread']['spam_score'] > 0.3) or
                                # check if reliable source
                                (dict['thread']['site_section'] not in site_sections) or
                                # check if text parsed correctly
                                ('Further company coverage:' in dict['text']) or
                                ('subscription' in dict['text']) or
                                ('subscribe' in dict['text']) or
                                (len(dict['text']) < 300)):
                            continue
                        else:
                            try:
                                # replace newlines and delimiter chars
                                # and write the article to csv
                                writer.writerow([dict['thread']['uuid'],
                                                 dict['thread']['title']
                                                     .replace('|', '-'),
                                                 dict['text']
                                                     .replace('\n', '')
                                                     .replace('\r', '')
                                                     .replace('|', '-'),
                                                 dict['thread']['site'],
                                                 dict['thread']['site_section']
                                                     .replace('\n', '')
                                                     .replace('\r', ''),
                                                 dict['url'],
                                                 dict['published']])
                                a += 1
                            # handle undefined characters (videos and other spam)
                            except UnicodeEncodeError:
                                print('# filtered out: {} (UnicodeEncodeError)'
                                      .format(dict['thread']['site_section']))
            print('# saved articles in file {}, now {} in total'
                  .format(output_file, a))
            print('#')
        print('# saved {} articles in total'.format(a))
        print('#')


if __name__ == '__main__':
    JsonHandler.write_articles_to_csv_files()
    #JsonHandler.create_labeling_dataset()