# -*- coding: utf-8 -*-

'''
JSON Handler
============

JSON Handler reads articles from JSON files, extracts the relevant
information and writes it to a CSV file.
'''

import csv
import glob
import json

import numpy as np
import pandas as pd


class JsonHandler:

    @staticmethod
    def select_randoms(df, n):
        '''Selects n random samples from the dataset.

        params: df  DataFrame to select items from
                n   number of items to select randomly
        returns a new DataFrame containing only the selected items
        '''
        # seed the random generator => reproducible sequence
        np.random.seed(5)
        # add a new column 'Random' filled with random numbers
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort the DataFrame by the random numbers
        df = df.sort_values('Random')
        # return the first n elements of the randomly sorted dataset
        return df.iloc[0:n]

    @staticmethod
    def create_csv(file_name):
        '''Creates a new CSV file (one per month).
        Each row will contain one news article.
        '''
        with open(file_name, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # write header / column names
            writer.writerow(['Uuid',         # 0
                             'Title',        # 1
                             'Text',         # 2
                             'Site',         # 3
                             'SiteSection',  # 4
                             'Url',          # 5
                             'Timestamp'])   # 6

    @staticmethod
    def write_articles_to_csv(file_name):
        '''Reads every JSON file matching the path below, filters out
        comments, posts, spam and unknown sources, and appends the
        remaining articles to the CSV file.
        '''
        # path of the JSON files
        path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'
        files = glob.glob(path)

        # reliable sources (site sections)
        site_sections = ['http://feeds.reuters.com/reuters/financialsNews',
                         'http://feeds.reuters.com/reuters/INbusinessNews',
                         'http://feeds.reuters.com/reuters/businessNews',
                         'http://feeds.reuters.com/reuters/companyNews',
                         'http://www.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/mergersNews',
                         'http://rss.cnn.com/rss/money_topstories.rss',
                         'http://rss.cnn.com/rss/money_latest.rss',
                         'http://www.economist.com/sections/business-finance/rss.xml',
                         'http://rss.cnn.com/rss/edition_business.rss',
                         'http://in.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/technologyNews',
                         'http://feeds.reuters.com/reuters/technologysectorNews',
                         'https://www.ft.com/companies/us',
                         'http://feeds.reuters.com/reuters/UKScienceNews',
                         'http://in.reuters.com/news/technology',
                         'http://in.reuters.com/finance/economy',
                         'https://www.bloomberg.com/middleeast',
                         'http://in.reuters.com/news/top-news']

        # file counter
        n = 0
        # article counter
        a = 0

        # read every matching JSON file and append its article to the CSV file
        with open(file_name, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            for file in files:
                n += 1
                with open(file, encoding='utf-8') as f:
                    # the JSON document is converted to a dict
                    # (named 'data' to avoid shadowing the builtin 'dict')
                    data = json.load(f)
                    # skip comments and posts; keep only English, low-spam
                    # articles from the whitelisted site sections above
                    if ((data['ord_in_thread'] != 0) or
                            (data['language'] != 'english') or
                            (data['thread']['spam_score'] > 0.3) or
                            (data['thread']['site_section'] not in site_sections)):
                        continue
                    # pick only the relevant information of the article
                    # and put it in a list
                    article = [data['thread']['uuid'],          # 0:'Uuid'
                               data['thread']['title'],         # 1:'Title'
                               data['text'],                    # 2:'Text'
                               data['thread']['site'],          # 3:'Site'
                               data['thread']['site_section'],  # 4:'SiteSection'
                               data['url'],                     # 5:'Url'
                               data['published']]               # 6:'Timestamp'
                    # remove newlines and the delimiter character
                    article[1] = article[1].replace('|', '-')   # in 'Title'
                    article[2] = article[2].replace('\n', ' ')\
                                           .replace('\r', ' ')\
                                           .replace('|', '-')   # in 'Text'
                    try:
                        writer.writerow(article)
                        a += 1
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out site_section: {} (UnicodeEncodeError)'
                              .format(data['thread']['site_section']))
        print()
        print('# saved {} articles in file {}'.format(a, file_name))


if __name__ == '__main__':
    file_name = 'test.csv'
    JsonHandler.create_csv(file_name)
    JsonHandler.write_articles_to_csv(file_name)
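
# A minimal usage sketch for select_randoms(), which is otherwise unused in
# this module. It assumes the CSV produced above exists and that pandas reads
# it back with the same dialect the writer used (the read_csv arguments below
# mirror the writer settings):
#
#     df = pd.read_csv(file_name, delimiter='|', quotechar='\'',
#                      quoting=csv.QUOTE_NONNUMERIC)
#     sample = JsonHandler.select_randoms(df, 10)
#     print(sample['Title'])
#
# Because select_randoms() shuffles via a seeded random column, the same
# 10 articles are returned on every run.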