2018-09-07 12:16:47 +00:00
Retrieves article data (JSON) from webhose.io and
saves each article's relevant information in a CSV file.
2018-10-18 10:11:11 +00:00
# TODO: update the webhose.io query and
# insert your personal webhose.io API key
2018-10-18 08:48:07 +00:00
2018-10-18 10:11:11 +00:00
import csv
2018-09-07 12:16:47 +00:00
from datetime import datetime
import pandas as pd
2018-09-17 12:47:50 +00:00
import webhoseio
2018-09-07 12:16:47 +00:00
class Requester:
# Groups the webhose.io download logic: the two class attributes below name
# the output CSV file, and save_articles_from_webhoseio() runs the query and
# writes the rows.
# NOTE(review): in this view of the file, timestamp lines are interleaved with
# the code, indentation is missing, and several statements are visibly
# incomplete (the csv.writer(...) call and the per-article writerow(...) call
# are never closed, and the `except` has no visible `try`). Restore the
# missing lines from version control before changing any logic here.
2018-09-17 12:47:50 +00:00
2018-10-18 10:11:11 +00:00
# Current local date formatted as YYYY-MM-DD; evaluated once, when the class
# body is executed, not on every download.
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
# Name of the output file (relative path), one file name per calendar day.
filestring = 'webhoseio_articles_{}.csv'.format(datestring)
# personal API key
# The config call below is commented out and holds a placeholder token.
# NOTE(review): presumably it has to be enabled with a real key before the
# query can succeed - confirm against the webhoseio client documentation.
# webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
2018-09-07 12:16:47 +00:00
# Query webhose.io for news articles whose thread title contains one of the
# merger/acquisition-related keywords and write them to the CSV file named by
# Requester.filestring, printing progress messages along the way.
# Takes no parameters (not even `self`).
# NOTE(review): presumably invoked as Requester.save_articles_from_webhoseio();
# the call site is not visible in this view.
def save_articles_from_webhoseio():
2018-10-18 10:11:11 +00:00
2018-09-07 12:16:47 +00:00
print('# retrieving articles from webhose.io')
2018-09-17 19:16:19 +00:00
print('# ...')
2018-09-17 12:47:50 +00:00
# webhose.io query
# "q" is one filter string built from adjacent literals: title keywords,
# is_first:true, site_type:news, a site:(...) list and language:english.
# "ts" is a hard-coded value. NOTE(review): read as epoch milliseconds it is
# 2018-09-18 (UTC); looks like the crawl start time - confirm against the
# webhose.io docs and update it together with the query.
# NOTE(review): no comma is visible between the last "q" literal and the
# "ts" key - check the original file.
2018-09-07 12:16:47 +00:00
query_params = {
"q": "thread.title:(merger OR merges OR merge OR merged OR "
"acquisition OR \"take over\" OR \"take-over\" OR "
"\"takeover\" OR deal OR transaction OR buy OR sell OR "
"approval OR approve OR \"business combination\" OR "
"\"combined company\") "
"is_first:true "
"site_type:news "
2018-10-18 10:11:11 +00:00
"site:(nytimes.com OR reuters.com OR bloomberg.com OR "
"cnn.com OR economist.com OR theguardian.com) "
2018-09-07 12:16:47 +00:00
"language:english "
2018-10-18 10:11:11 +00:00
"ts": "1537264167885",
2018-09-07 12:16:47 +00:00
"sort": "crawled"}
2018-09-17 12:47:50 +00:00
2018-09-07 12:16:47 +00:00
# First request; the response is indexed below by 'totalResults' and 'posts'.
output = webhoseio.query("filterWebContent", query_params)
2018-09-17 12:47:50 +00:00
2018-09-07 12:16:47 +00:00
sum_posts = output['totalResults']
print('# total sum of posts: ' + str(sum_posts))
2018-09-17 12:47:50 +00:00
2018-09-07 12:16:47 +00:00
# 100 articles per batch (download)
# Only full batches are counted: the division result is truncated, so a final
# partial batch (fewer than 100 remaining posts) is never fetched or written.
2018-09-17 12:47:50 +00:00
num_downloads = int(sum_posts / 100)
2018-09-07 12:16:47 +00:00
print('# collecting first {} articles'.format(num_downloads * 100))
2018-09-17 12:47:50 +00:00
2018-10-18 10:11:11 +00:00
# The file is opened for writing (truncating any existing file of the same
# name) with newline='' as the csv module requires. No encoding is passed, so
# the platform default applies; the UnicodeEncodeError handler further down
# deals with characters that encoding cannot represent.
with open(Requester.filestring, 'w', newline='') as csvfile:
# NOTE(review): the remaining csv.writer arguments are not visible here.
writer = csv.writer(csvfile,
# write header / column names
writer.writerow(['Uuid', #0
'Title', #1
'Text', #2
'Site', #3
'SiteSection', #4
'Url', #5
'Timestamp']) #6
# One iteration per full batch of 100 posts.
for n in range(num_downloads):
# save next 100 articles
# NOTE(review): the index always runs 0..99, which assumes every response
# holds at least 100 posts - confirm, otherwise this raises IndexError.
for i in range(100):
# write article as row to csv
# Line breaks are replaced by spaces and '|' by '-' in the text fields.
# NOTE(review): presumably '|' is the CSV delimiter - the delimiter argument
# is not visible. The expressions for columns 0, 3, 5 and 6 and the start of
# columns 1 and 4 are missing from this view.
writer.writerow(# 0:'Uuid'
# 1:'Title'
.replace('|', '-'),
# 2:'Text'
output['posts'][i]['text'].replace('\n', ' ')\
.replace('\r', ' ').replace('|', '-'),
# 3:'Site'
# 4:'SiteSection'
.replace('\r', ' '),
# 5:'Url'
# 6:'Timestamp'
# handle undefined characters (videos and other spam)
# A row that cannot be encoded is skipped and reported on stdout instead of
# aborting the run. NOTE(review): the matching `try` and the .format(...)
# argument of the message are not visible here.
except UnicodeEncodeError:
print('# filtered out {} (UnicodeEncodeError)'
# Get the next batch of 100 posts
output = webhoseio.get_next()
# Script entry point: prints start/end markers and the name of the output file.
# NOTE(review): no call to Requester.save_articles_from_webhoseio() is visible
# between the start and "saved" messages - as shown, nothing is downloaded.
# Verify that the call exists in the original file.
if __name__ == '__main__':
print('# starting requester')
print('# ...')
print('# saved articles in file {}'.format(Requester.filestring))
print('# ending requester')