# -*- coding: utf-8 -*-
'''
File Handler
============

FileHandler reads articles from JSON files, extracts relevant
information and writes it to a csv file.
'''
import csv
import glob
import json

import numpy as np
import pandas as pd


class FileHandler:
    '''Reads crawled news articles (JSON), filters out unusable ones and
    stores the relevant columns in per-month csv files, from which a
    random subset can be drawn for interactive labeling.
    '''

    # two-digit strings for every month of the year
    months = ['01', '02', '03', '04', '05', '06',
              '07', '08', '09', '10', '11', '12']

    @staticmethod
    def select_randoms(df, n):
        '''Select n random samples from the dataset.

        Note: mutates ``df`` by attaching a helper column 'Random';
        the caller is expected to delete it afterwards.

        :param df: DataFrame to select items from
        :param n:  number of items to select randomly
        :return:   new DataFrame with only the selected items
        '''
        # initialize random => reproducible sequence
        np.random.seed(5)
        # add new column 'Random' with one random score per row
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by random numbers
        df = df.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return df.iloc[0:n]

    @staticmethod
    def create_labeling_dataset():
        '''Draw ~10000 articles evenly from the 12 monthly csv files and
        append them to a single csv dataset for interactive labeling.
        '''
        # output file
        o_file = 'data\\interactive_labeling_dataset.csv'
        # create file and write header
        with open(o_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter='|', quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(['Uuid',         #0
                             'Title',        #1
                             'Text',         #2
                             'Site',         #3
                             'SiteSection',  #4
                             'Url',          #5
                             'Timestamp'])   #6
        for m in FileHandler.months:
            df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
                             delimiter='|',
                             header=0,
                             index_col=None,
                             engine='python',
                             quoting=csv.QUOTE_NONNUMERIC,
                             quotechar='\'')
            # number of articles to select from each month
            # (10000/12 = 833.33 => one extra in every third month).
            # BUGFIX: this must be recomputed per month — the previous
            # one-way assignment made every month after March select 834,
            # yielding 10006 articles instead of 10000.
            n_select = 834 if m in ('03', '06', '09', '12') else 833
            random_articles = FileHandler.select_randoms(df, n_select)
            # drop the helper column added by select_randoms
            del random_articles['Random']
            random_articles.to_csv(o_file,
                                   header=False,
                                   index=False,
                                   sep='|',
                                   mode='a',
                                   encoding='utf-8',
                                   quoting=csv.QUOTE_NONNUMERIC,
                                   quotechar='\'')

    @staticmethod
    def write_articles_to_csv_files():
        '''Read JSON files, select articles and write them to csv.

        One csv file is written per month; an article is kept only if it
        is an original english post from a whitelisted site_section, has
        a low spam score and a sufficiently long, correctly parsed text.
        '''
        # reliable sources (site_sections), read from 'sections.txt'
        with open('data\\sections.txt', 'r') as s_list:
            site_sections = s_list.read().split('\n')
        # article counter
        a = 0
        for m in FileHandler.months:
            # 1 output file per month
            output_file = 'data\\articles\\all_{}.csv'.format(m)
            # path of input JSON files per month
            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
                   '\\news_[0-9]*.json'.format(m)
            files = glob.glob(path)
            # write separate csv file for every month
            with open(output_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile, delimiter='|', quotechar='\'',
                                    quoting=csv.QUOTE_NONNUMERIC)
                # write header / column names
                writer.writerow(['Uuid',         #0
                                 'Title',        #1
                                 'Text',         #2
                                 'Site',         #3
                                 'SiteSection',  #4
                                 'Url',          #5
                                 'Timestamp'])   #6
                # write articles
                for file in files:
                    # read every JSON file; each file holds one article
                    # converted to a plain dict
                    with open(file, encoding='utf-8') as f:
                        article = json.load(f)
                    # skip comments/posts, non-english articles, spam,
                    # unreliable sources and badly parsed texts.
                    # BUGFIX: the subscription filter used
                    # ('subscription' or 'subscribe') in text, which
                    # evaluates to 'subscription' in text and never
                    # tested for 'subscribe'.
                    if (article['ord_in_thread'] != 0
                            or article['language'] != 'english'
                            or article['thread']['spam_score'] > 0.3
                            or article['thread']['site_section']
                            not in site_sections
                            or 'Further company coverage:' in article['text']
                            or 'subscription' in article['text']
                            or 'subscribe' in article['text']
                            or len(article['text']) < 200):
                        continue
                    try:
                        # replace whitespaces and delimiter chars
                        # and write to csv
                        writer.writerow([article['thread']['uuid'],
                                         article['thread']['title']
                                         .replace('|', '-'),
                                         article['text']
                                         .replace('\n', '')
                                         .replace('\r', '')
                                         .replace('|', '-'),
                                         article['thread']['site'],
                                         article['thread']['site_section']
                                         .replace('\n', '')
                                         .replace('\r', ''),
                                         article['url'],
                                         article['published']])
                        a += 1
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out: {} (UnicodeEncodeError)'
                              .format(article['thread']['site_section']))
            print('# saved articles in file {}, now {} in total'
                  .format(output_file, a))
            print('#')
        print('# saved {} articles in total'.format(a))
        print('#')

    @staticmethod
    def join_all_csv_files():
        '''Concatenate all monthly csv files into one dataset.

        NOTE(review): the original definition had an empty body, which
        is a SyntaxError; kept as an explicit stub until implemented.
        '''
        raise NotImplementedError


if __name__ == '__main__':
    # FileHandler.write_articles_to_csv_files()
    # FileHandler.create_labeling_dataset()
    pass