'''
Requester
=========

Retrieves JSON data from webhose.io and saves the articles'
relevant information in a csv file.
'''

# TODO: insert personal webhose.io API key

import re
from datetime import datetime

import pandas as pd
import webhoseio

from CsvHandler import CsvHandler


class Requester:

    @staticmethod
    def save_articles_from_webhoseio():
        '''Create a DataFrame of articles with the columns
        Timestamp, Title, Text, SiteSection
        and save it in the csv target file.
        '''
        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
        filestring = 'download_articles_{}.csv'.format(datestring)

        # print message
        print('# retrieving articles from webhose.io')
        print('# ...')

        # personal API key
        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
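        # note: config() stores the token used by the query() calls below;
        # the placeholder above must be replaced with a real webhose.io key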

        # webhose.io query
        # suboptimal: usage of search terms :-(
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
                 "acquisition OR \"take over\" OR \"take-over\" OR "
                 "\"takeover\" OR deal OR transaction OR buy OR sell OR "
                 "approval OR approve OR \"business combination\" OR "
                 "\"combined company\") "
                 "is_first:true "
                 "site_type:news "
                 "site:reuters.com "
                 "language:english "
                 "has_video:false",
            "ts": "1527411742661",
            "sort": "crawled"}
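        # query syntax notes (per webhose.io's docs, roughly): thread.title
        # restricts the keyword match to article titles, is_first:true
        # returns only the first version of a post, and ts is a Unix
        # timestamp in milliseconds marking the earliest crawl time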

        output = webhoseio.query("filterWebContent", query_params)

        sum_posts = output['totalResults']
        print('# total sum of posts: ' + str(sum_posts))

        # 100 articles per batch (download)
        num_downloads = int(sum_posts / 100)
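        # (int() floors here, so a trailing partial batch of fewer than
        # 100 posts is simply skipped)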
        print('# collecting first {} articles'.format(num_downloads * 100))
        print('# sorting out other sources than reuters')
        print('# ...')

        # two-dimensional list of all articles
        list_articles = []

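        # each pass of the outer loop handles one batch of 100 posts;
        # get_next() below follows the 'next' link of the previous response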
        for n in range(num_downloads):
            # save next 100 articles
            for i in range(100):
                # skip posts whose source is not 'reuters'
                if not re.search(r'reuters',
                                 output['posts'][i]['thread']['site_section']):
                    continue

                article = []
                article.append(output['posts'][i]['published'])
                article.append(output['posts'][i]['title'].replace('|', ' '))

                # replace line breaks and '|' separators with spaces
                text = output['posts'][i]['text'].replace('\n', ' ')\
                    .replace('\r', ' ').replace('|', ' ')
                article.append(text)

                # remove '\r' at the end of some urls
                section = output['posts'][i]['thread']['site_section']
                section = section.replace('\r', '')
                article.append(section)

                # add article to list
                list_articles.append(article)

            # get the next batch of 100 posts
            output = webhoseio.get_next()

        # create DataFrame
        df = pd.DataFrame(data=list_articles,
                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])

        # save csv
        CsvHandler.write_csv(df, filestring)


if __name__ == '__main__':
    print('# starting requester')
    print('# ...')
    Requester.save_articles_from_webhoseio()
    print('# ending requester')