'''
Requester
=========

retrieves JSON files from webhose.io
saves articles' relevant information in csv file
'''

# TODO: update your webhose query and
#       insert personal webhose key

import csv
from datetime import datetime

import webhoseio


class Requester:

    datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
    filestring = 'webhoseio_articles_{}.csv'.format(datestring)

    # personal API key
    # webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

    @staticmethod
    def save_articles_from_webhoseio():
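        """Retrieve matching posts from webhose.io in batches of 100 and
        write selected fields of each article to Requester.filestring
        as a '|'-delimited csv (see query_params below for the filters)."""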

        print('# retrieving articles from webhose.io')
        print('# ...')

        # webhose.io query
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
                 "acquisition OR \"take over\" OR \"take-over\" OR "
                 "\"takeover\" OR deal OR transaction OR buy OR sell OR "
                 "approval OR approve OR \"business combination\" OR "
                 "\"combined company\") "
                 "is_first:true "
                 "site_type:news "
                 "site:(nytimes.com OR reuters.com OR bloomberg.com OR "
                 "cnn.com OR economist.com OR theguardian.com) "
                 "language:english "
                 "has_video:false",
            "ts": "1537264167885",
            "sort": "crawled"}
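
        # query notes (per webhose.io's filter syntax): thread.title matches
        # the keywords against article titles only, is_first:true keeps just
        # the lead post of each thread, site_type/site/language narrow the
        # sources, has_video:false drops video posts, "ts" is the crawl-time
        # cursor in milliseconds, and "sort" orders results by crawl date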
        output = webhoseio.query("filterWebContent", query_params)

        sum_posts = output['totalResults']
        print('# total sum of posts: ' + str(sum_posts))

        # 100 articles per batch (download)
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
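
        # note: the integer division above floors the batch count, so any
        # remainder beyond num_downloads * 100 matching posts is skipped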

        with open(Requester.filestring, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # write header / column names
            writer.writerow(['Uuid',         # 0
                             'Title',        # 1
                             'Text',         # 2
                             'Site',         # 3
                             'SiteSection',  # 4
                             'Url',          # 5
                             'Timestamp'])   # 6
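
            # '|' doubles as the csv delimiter, so the .replace('|', '-')
            # calls below keep stray pipes in titles/text from breaking rows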

            for n in range(num_downloads):
                # save next 100 articles
                for i in range(100):
                    try:
                        # write article as row to csv
                        writer.writerow(
                            # 0:'Uuid'
                            [output['posts'][i]['thread']['uuid'],
                             # 1:'Title'
                             output['posts'][i]['thread']['title']
                             .replace('|', '-'),
                             # 2:'Text'
                             output['posts'][i]['text'].replace('\n', ' ')
                             .replace('\r', ' ').replace('|', '-'),
                             # 3:'Site'
                             output['posts'][i]['thread']['site'],
                             # 4:'SiteSection'
                             output['posts'][i]['thread']['site_section']
                             .replace('\r', ' '),
                             # 5:'Url'
                             output['posts'][i]['url'],
                             # 6:'Timestamp'
                             output['posts'][i]['published']])
                    # skip rows containing characters the output file's
                    # default encoding cannot represent (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out {} (UnicodeEncodeError)'
                              .format(output['posts'][i]['thread']['site_section']))

                # get the next batch of 100 posts (get_next() follows the
                # paging cursor returned with the previous response)
                output = webhoseio.get_next()


if __name__ == '__main__':

    print('# starting requester')
    print('# ...')
    Requester.save_articles_from_webhoseio()
    print('# saved articles in file {}'.format(Requester.filestring))
    print('# ending requester')
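
# usage sketch: the csv written above can be read back, e.g. with pandas,
# by mirroring the writer's delimiter and quote character:
#
#   import pandas as pd
#   df = pd.read_csv(Requester.filestring, sep='|', quotechar='\'')
#   print(df[['Title', 'Site', 'Timestamp']].head())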