'''
Requester
=========

Retrieves JSON results from webhose.io and saves the articles'
relevant information in a csv file.
'''
# TODO: insert personal webhose key

import re
from datetime import datetime

import pandas as pd
import webhoseio

from CsvHandler import CsvHandler


class Requester:

    @staticmethod
    def save_articles_from_webhoseio():
        '''
        Create a DataFrame of articles with Timestamp, Title, Text and
        SiteSection, then save it to the csv target file.
        '''
        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
        filestring = 'download_articles_{}.csv'.format(datestring)

        print('# retrieving articles from webhose.io')

        # personal API key
        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

        # webhose.io query
        # suboptimal: usage of search terms :-(
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
                 "acquisition OR \"take over\" OR \"take-over\" OR "
                 "\"takeover\" OR deal OR transaction OR buy OR sell OR "
                 "approval OR approve OR \"business combination\" OR "
                 "\"combined company\") "
                 "is_first:true "
                 "site_type:news "
                 "site:reuters.com "
                 "language:english "
                 "has_video:false",
            "ts": "1527411742661",
            "sort": "crawled"}

        output = webhoseio.query("filterWebContent", query_params)
        sum_posts = output['totalResults']
        print('# total sum of posts: ' + str(sum_posts))

        # webhose.io delivers the posts in batches of 100
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
        print('# sorting out other sources than reuters')

        # two-dimensional list of all articles
        list_articles = []

        for n in range(num_downloads):
            # save the next batch of (up to) 100 articles
            for post in output['posts']:
                # keep only posts whose site section belongs to reuters
                if not re.search(r'reuters', post['thread']['site_section']):
                    continue
                article = []
                article.append(post['published'])
                article.append(post['title'].replace('|', ' '))
                # remove line breaks and separators
                text = (post['text'].replace('\n', ' ')
                        .replace('\r', ' ').replace('|', ' '))
                article.append(text)
                # remove '\r' at the end of some urls
                section = post['thread']['site_section'].replace('\r', '')
                article.append(section)
                # add article to list
                list_articles.append(article)

            # get the next batch of 100 posts
            output = webhoseio.get_next()

        # create DataFrame
        df = pd.DataFrame(data=list_articles,
                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])

        # save csv
        CsvHandler.write_csv(df, filestring)
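
# Example usage (a minimal sketch): running this module directly triggers the
# download. Assumes CsvHandler.write_csv(df, filename) takes a DataFrame and a
# target file name, matching the call above; CsvHandler is defined elsewhere
# in this project.
if __name__ == '__main__':
    Requester.save_articles_from_webhoseio()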