refactoring
parent cd3a90101f
commit a0471f3087
Requester.py (124 changed lines)
@@ -6,38 +6,30 @@ retrieves JSON files from webhose.io
 saves articles' relevant information in csv file
 '''
 
-# toDo: add Uuid, URL, Site and change order to:
-# Title, Text, Site, SiteSection, Url, Timestamp
+# toDo: update your webhose query and
+# insert personal webhose key
 
-# toDo: insert personal webhose key
-
-import re
+import csv
 from datetime import datetime
 
-import pandas as pd
 import webhoseio
 
-from CsvHandler import CsvHandler
-
 class Requester:
 
-    def save_articles_from_webhoseio():
-        ''' create DataFrame of articles with
-        Timestamp, Title, Text, SiteSection
-        and then save it in csv target file
-        '''
-        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
-        filestring = 'download_articles_{}.csv'.format(datestring)
+    datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
+    filestring = 'webhoseio_articles_{}.csv'.format(datestring)
 
-        # print message
+    # personal API key
+    # webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
+
+    def save_articles_from_webhoseio():
+
         print('# retrieving articles from webhose.io')
         print('# ...')
 
         # personal API key
-        webhoseio.config(token="6a8c073f-8ba1-47a6-96e8-7603f228948d")
+        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
 
         # webhose.io query
+        # suboptimal: usage of search terms :-(
         query_params = {
             "q": "thread.title:(merger OR merges OR merge OR merged OR "
                  "acquisition OR \"take over\" OR \"take-over\" OR "
@@ -46,10 +38,11 @@ class Requester:
                  "\"combined company\") "
                  "is_first:true "
                  "site_type:news "
-                 "site:reuters.com "
+                 "site:(nytimes.com OR reuters.com OR bloomberg.com OR "
+                 "cnn.com OR economist.com OR theguardian.com) "
                  "language:english "
                  "has_video:false",
-            "ts": "1527411742661",
+            "ts": "1537264167885",
             "sort": "crawled"}
 
         output = webhoseio.query("filterWebContent", query_params)
@@ -60,48 +53,55 @@ class Requester:
         # 100 articles per batch (download)
         num_downloads = int(sum_posts / 100)
         print('# collecting first {} articles'.format(num_downloads * 100))
-        print('# sorting out other sources than reuters')
         print('# ...')
 
-        # twodimensional list of all articles
-        list_articles = []
-        for n in range(num_downloads):
-            # save next 100 articles
-            for i in range(100):
-                # check if correct source 'reuters'
-                if not re.search(r'reuters',
-                                 output['posts'][i]['thread']['site_section']):
-                    continue
-                else:
-                    article = []
-                    article.append(output['posts'][i]['published'])  # Timestamp
-                    article.append(output['posts'][i]['title'].replace('|', ' '))  # Title
-                    # remove white spaces and separators
-                    text = output['posts'][i]['text'].replace('\n', ' ')\
-                        .replace('\r', ' ').replace('|', ' ')  # Text
-                    section = output['posts'][i]['thread']['site_section']  # SiteSection
-                    article.append(text)
-                    # remove '\r' at end of some urls
-                    section = section.replace('\r', '')
-                    article.append(section)
-                    # add article to list
-                    list_articles.append(article)
-            ## DORIS: WHY DON'T YOU WRITE IT DIRECTLY TO A CSV INSTEAD OF CONVERTING IT AGAIN?
-
-            # Get the next batch of 100 posts
-            output = webhoseio.get_next()
-
-        # create DataFrame
-        df = pd.DataFrame(data=list_articles,
-                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
-        # save csv
-        CsvHandler.write_csv(df, filestring)
+        with open(Requester.filestring, 'w', newline='') as csvfile:
+            writer = csv.writer(csvfile,
+                                delimiter='|',
+                                quotechar='\'',
+                                quoting=csv.QUOTE_NONNUMERIC)
+            # write header / column names
+            writer.writerow(['Uuid',         # 0
+                             'Title',        # 1
+                             'Text',         # 2
+                             'Site',         # 3
+                             'SiteSection',  # 4
+                             'Url',          # 5
+                             'Timestamp'])   # 6
+
+            for n in range(num_downloads):
+                # save next 100 articles
+                for i in range(100):
+                    try:
+                        # write article as row to csv
+                        writer.writerow(  # 0:'Uuid'
+                            [output['posts'][i]['thread']['uuid'],
+                             # 1:'Title'
+                             output['posts'][i]['thread']['title']
+                             .replace('|', '-'),
+                             # 2:'Text'
+                             output['posts'][i]['text'].replace('\n', ' ')
+                             .replace('\r', ' ').replace('|', '-'),
+                             # 3:'Site'
+                             output['posts'][i]['thread']['site'],
+                             # 4:'SiteSection'
+                             output['posts'][i]['thread']['site_section']
+                             .replace('\r', ' '),
+                             # 5:'Url'
+                             output['posts'][i]['url'],
+                             # 6:'Timestamp'
+                             output['posts'][i]['published']])
+                    # handle undefined characters (videos and other spam)
+                    except UnicodeEncodeError:
+                        print('# filtered out {} (UnicodeEncodeError)'
+                              .format(output['posts'][i]['thread']['site_section']))
+
+                # Get the next batch of 100 posts
+                output = webhoseio.get_next()
 
 
-if __name__ == '__main__':
-    print('# starting requester')
-    print('# ...')
-    save_articles_from_webhoseio()
-    print('# ending requester')
+if __name__ == '__main__':
+
+    print('# starting requester')
+    print('# ...')
+    Requester.save_articles_from_webhoseio()
+    print('# saved articles in file {}'.format(Requester.filestring))
+    print('# ending requester')
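
For reference, the download loop above boils down to the webhose.io batching pattern below. A minimal sketch, assuming a valid key in place of the placeholder token and assuming the response exposes 'totalResults'; only config(), query() and get_next() are confirmed by the code in this commit, and the API serves at most 100 posts per response.

import webhoseio

# assumption: replace the placeholder with a personal webhose.io key
webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

# same endpoint as in Requester; deliberately trivial query
output = webhoseio.query("filterWebContent",
                         {"q": "language:english", "sort": "crawled"})

num_downloads = int(output['totalResults'] / 100)  # 100 posts per batch
for n in range(num_downloads):
    for post in output['posts']:
        print(post['thread']['title'])
    # fetch the next batch of up to 100 posts
    output = webhoseio.get_next()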
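
On the try/except UnicodeEncodeError in the new loop: the exception is raised by writerow() when open() falls back to a narrow platform default encoding (for example cp1252 on Windows) and a post contains characters that codec cannot represent. A sketch of the usual alternative, passing an explicit encoding so no rows have to be dropped; the filename and row values are made up for illustration, and this change is not part of the commit.

import csv

# assumption: 'demo.csv' and the row contents are illustrative only;
# with encoding='utf-8', writerow() cannot raise UnicodeEncodeError here
with open('demo.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='|', quotechar='\'',
                        quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(['Société Générale', 'merger talks'])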
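
Because the writer uses a pipe delimiter, a single-quote quotechar and QUOTE_NONNUMERIC, any consumer has to read the file back with the same dialect. A sketch using pandas (the module the old code imported), assuming the file written by Requester already exists:

import pandas as pd

from Requester import Requester

# the read dialect must mirror the writer: pipe-delimited, single-quote quoting
df = pd.read_csv(Requester.filestring, sep='|', quotechar='\'')
print(df[['Uuid', 'Title', 'Timestamp']].head())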