thesis-anne/Requester.py

'''
Requester
=========
Retrieves JSON articles from the webhose.io API and saves each
article's relevant fields (Timestamp, Title, Text, SiteSection)
to a csv file.
'''
# TODO: insert your personal webhose.io API key below
import re
from datetime import datetime
import pandas as pd
import webhoseio

from CsvHandler import CsvHandler


class Requester:

    @staticmethod
    def save_articles_from_webhoseio():
        ''' create a DataFrame of articles with
        Timestamp, Title, Text and SiteSection
        and save it to the target csv file
        '''
        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
        filestring = 'download_articles_{}.csv'.format(datestring)

        # print status messages
        print('# retrieving articles from webhose.io')
        print('# ...')

        # personal API key
        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

        # webhose.io query
        # suboptimal: relies on a fixed list of search terms :-(
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
                 "acquisition OR \"take over\" OR \"take-over\" OR "
                 "\"takeover\" OR deal OR transaction OR buy OR sell OR "
                 "approval OR approve OR \"business combination\" OR "
                 "\"combined company\") "
                 "is_first:true "
                 "site_type:news "
                 "site:reuters.com "
                 "language:english "
                 "has_video:false",
            # only posts crawled after this point in time
            # (Unix timestamp in milliseconds)
            "ts": "1527411742661",
            "sort": "crawled"}

        output = webhoseio.query("filterWebContent", query_params)

        sum_posts = output['totalResults']
        print('# total sum of posts: ' + str(sum_posts))
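
        # the matching posts arrive in batches of at most 100; each
        # response carries a pagination cursor that webhoseio.get_next()
        # follows to fetch the subsequent batch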

        # 100 articles per batch (download)
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
        print('# sorting out other sources than reuters')
        print('# ...')
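
        # e.g. 1234 total results give num_downloads == 12, so only the
        # first 1200 posts are fetched and the trailing partial batch
        # is skipped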

        # two-dimensional list of all articles
        list_articles = []
        for n in range(num_downloads):
            # save the next batch of up to 100 articles
            for i in range(len(output['posts'])):
                # keep only posts whose site section is really on reuters
                if not re.search(r'reuters',
                                 output['posts'][i]['thread']['site_section']):
                    continue
                article = []
                article.append(output['posts'][i]['published'])
                article.append(output['posts'][i]['title'].replace('|', ' '))
                # replace line breaks and the '|' separator with spaces
                text = output['posts'][i]['text'].replace('\n', ' ')\
                    .replace('\r', ' ').replace('|', ' ')
                section = output['posts'][i]['thread']['site_section']
                article.append(text)
                # remove '\r' at the end of some urls
                section = section.replace('\r', '')
                article.append(section)
                # add article to list
                list_articles.append(article)

            # get the next batch of 100 posts
            output = webhoseio.get_next()

        # create DataFrame
        df = pd.DataFrame(data=list_articles,
                          columns=['Timestamp', 'Title', 'Text',
                                   'SiteSection'])
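
        # note: '|' was stripped from Title and Text above, presumably
        # because CsvHandler writes '|'-separated values (an assumption
        # about that module); adjust the cleaning step if its delimiter
        # differs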
        # save csv
        CsvHandler.write_csv(df, filestring)


if __name__ == '__main__':
    print('# starting requester')
    print('# ...')
    Requester.save_articles_from_webhoseio()
    print('# ending requester')