"""
Requester
=========

Retrieves JSON article batches from webhose.io and saves each
article's relevant fields to a pipe-delimited CSV file.
"""
# TODO: update your webhose query and
# insert personal webhose key

import csv
from datetime import datetime

import pandas as pd  # NOTE(review): imported but unused in this file
import webhoseio


class Requester:
    """Downloads M&A-related news articles via the webhose.io API.

    Output goes to ``Requester.filestring`` ('|'-delimited CSV, one
    article per row, header row first).
    """

    # Date stamp (e.g. '2018-09-18') used in the output filename.
    datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
    filestring = 'webhoseio_articles_{}.csv'.format(datestring)

    # personal API key
    # webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

    @staticmethod
    def save_articles_from_webhoseio():
        """Query webhose.io in batches and write the articles to CSV.

        Downloads ``totalResults // 100`` batches (webhose.io delivers at
        most 100 posts per request) and writes one row per article with
        columns Uuid, Title, Text, Site, SiteSection, Url, Timestamp.
        Articles whose text cannot be encoded in the platform default
        encoding are skipped — this doubles as a filter for videos and
        other spam, so the file is deliberately opened without an
        explicit ``encoding=``.
        """
        print('# retrieving articles from webhose.io')
        print('# ...')

        # webhose.io query: M&A-related headlines from major
        # English-language news sites, first-crawl posts only.
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
                 "acquisition OR \"take over\" OR \"take-over\" OR "
                 "\"takeover\" OR deal OR transaction OR buy OR sell OR "
                 "approval OR approve OR \"business combination\" OR "
                 "\"combined company\") "
                 "is_first:true "
                 "site_type:news "
                 "site:(nytimes.com OR reuters.com OR bloomberg.com OR "
                 "cnn.com OR economist.com OR theguardian.com) "
                 "language:english "
                 "has_video:false",
            "ts": "1537264167885",
            "sort": "crawled"}

        output = webhoseio.query("filterWebContent", query_params)
        sum_posts = output['totalResults']
        print('# total sum of posts: ' + str(sum_posts))

        # 100 articles per batch (download)
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))

        with open(Requester.filestring, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter='|', quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # write header / column names
            writer.writerow(['Uuid',         # 0
                             'Title',        # 1
                             'Text',         # 2
                             'Site',         # 3
                             'SiteSection',  # 4
                             'Url',          # 5
                             'Timestamp'])   # 6

            for _ in range(num_downloads):
                # BUGFIX: iterate over the posts actually delivered rather
                # than assuming exactly 100 per batch — a short batch
                # previously raised an uncaught IndexError via
                # output['posts'][i].
                for post in output['posts']:
                    try:
                        # Strip the CSV delimiter '|' and raw line breaks
                        # so every article stays on one well-formed row.
                        writer.writerow(
                            [post['thread']['uuid'],                 # 0: Uuid
                             post['thread']['title']
                             .replace('|', '-'),                     # 1: Title
                             post['text'].replace('\n', ' ')
                             .replace('\r', ' ')
                             .replace('|', '-'),                     # 2: Text
                             post['thread']['site'],                 # 3: Site
                             post['thread']['site_section']
                             .replace('\r', ' '),                    # 4: SiteSection
                             post['url'],                            # 5: Url
                             post['published']])                     # 6: Timestamp
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out {} (UnicodeEncodeError)'
                              .format(post['thread']['site_section']))

                # Get the next batch of 100 posts
                output = webhoseio.get_next()


if __name__ == '__main__':
    print('# starting requester')
    print('# ...')
    Requester.save_articles_from_webhoseio()
    print('# saved articles in file {}'.format(Requester.filestring))
    print('# ending requester')