# thesis-anne/Requester.py
'''
Requester
=========
retrieves JSON results from the webhose.io API and
saves each article's relevant fields in a CSV file
'''
# TODO: update the webhose.io query below and
# insert your personal webhose.io API key
import csv
from datetime import datetime
import pandas as pd
import webhoseio
class Requester:
    """Retrieve articles from webhose.io and save their relevant fields to a CSV file."""

    # Date-stamped output file name, e.g. 'webhoseio_articles_2018-09-18.csv'.
    datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
    filestring = 'webhoseio_articles_{}.csv'.format(datestring)

    # personal API key
    # webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

    @staticmethod
    def save_articles_from_webhoseio():
        """Query webhose.io for M&A-related news articles and write them to a CSV.

        Downloads articles in batches (webhose.io serves at most 100 posts
        per request) and writes one '|'-separated row per article with:
        uuid, title, text, site, site section, url, published timestamp.
        The output file name is ``Requester.filestring``.
        """
        print('# retrieving articles from webhose.io')
        print('# ...')
        # webhose.io query: M&A-related headlines from major English news sites
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
                 "acquisition OR \"take over\" OR \"take-over\" OR "
                 "\"takeover\" OR deal OR transaction OR buy OR sell OR "
                 "approval OR approve OR \"business combination\" OR "
                 "\"combined company\") "
                 "is_first:true "
                 "site_type:news "
                 "site:(nytimes.com OR reuters.com OR bloomberg.com OR "
                 "cnn.com OR economist.com OR theguardian.com) "
                 "language:english "
                 "has_video:false",
            "ts": "1537264167885",
            "sort": "crawled"}
        output = webhoseio.query("filterWebContent", query_params)
        sum_posts = output['totalResults']
        print('# total sum of posts: ' + str(sum_posts))
        # 100 articles per batch (download); the remainder past the last
        # full hundred is deliberately not fetched (original behavior)
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
        with open(Requester.filestring, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # write header / column names
            writer.writerow(['Uuid',         # 0
                             'Title',        # 1
                             'Text',         # 2
                             'Site',         # 3
                             'SiteSection',  # 4
                             'Url',          # 5
                             'Timestamp'])   # 6
            for _ in range(num_downloads):
                # Iterate over the posts actually returned: a batch may
                # contain fewer than 100 items (the original indexed
                # 0..99 unconditionally, risking an IndexError).
                for post in output['posts']:
                    try:
                        # write article as row to csv; '|' is the CSV
                        # delimiter, so it is replaced in free-text fields
                        writer.writerow(
                            [post['thread']['uuid'],                     # 0: Uuid
                             post['thread']['title'].replace('|', '-'),  # 1: Title
                             post['text'].replace('\n', ' ')
                                 .replace('\r', ' ')
                                 .replace('|', '-'),                     # 2: Text
                             post['thread']['site'],                     # 3: Site
                             post['thread']['site_section']
                                 .replace('\r', ' '),                    # 4: SiteSection
                             post['url'],                                # 5: Url
                             post['published']])                         # 6: Timestamp
                    # handle undefined characters (videos and other spam):
                    # skip articles the locale encoding cannot represent
                    except UnicodeEncodeError:
                        print('# filtered out {} (UnicodeEncodeError)'
                              .format(post['thread']['site_section']))
                # Get the next batch of 100 posts
                output = webhoseio.get_next()
# Script entry point: run the requester and report the output file.
if __name__ == '__main__':
    print('# starting requester')
    print('# ...')
    Requester.save_articles_from_webhoseio()
    print('# saved articles in file {}'.format(Requester.filestring))
    print('# ending requester')