refactoring

Anne Lorenz 2018-10-18 12:11:11 +02:00
parent cd3a90101f
commit a0471f3087
1 changed file with 62 additions and 62 deletions


@@ -6,38 +6,30 @@ retrieves JSON files from webhose.io
saves articles' relevant information in csv file
'''
# toDo: add Uuid, URL, Site and change order to:
# Title, Text, Site, SiteSection, Url, Timestamp
# toDo: update your webhose query and
# insert personal webhose key
# toDo: insert personal webhose key
import re
import csv
from datetime import datetime
import pandas as pd
import webhoseio
from CsvHandler import CsvHandler
class Requester:
def save_articles_from_webhoseio():
''' create DataFrame of articles with
Timestamp, Title, Text, SiteSection
and then save it in csv target file
'''
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
filestring = 'download_articles_{}.csv'.format(datestring)
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
filestring = 'webhoseio_articles_{}.csv'.format(datestring)
# print message
# personal API key
# webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
webhoseio.config(token="6a8c073f-8ba1-47a6-96e8-7603f228948d")
def save_articles_from_webhoseio():
print('# retrieving articles from webhose.io')
print('# ...')
# personal API key
webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
# webhose.io query
# suboptimal: usage of search terms :-(
query_params = {
"q": "thread.title:(merger OR merges OR merge OR merged OR "
"acquisition OR \"take over\" OR \"take-over\" OR "
@@ -46,10 +38,11 @@
"\"combined company\") "
"is_first:true "
"site_type:news "
"site:reuters.com "
"site:(nytimes.com OR reuters.com OR bloomberg.com OR "
"cnn.com OR economist.com OR theguardian.com) "
"language:english "
"has_video:false",
"ts": "1527411742661",
"ts": "1537264167885",
"sort": "crawled"}
output = webhoseio.query("filterWebContent", query_params)
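Aside: the webhose.io client above follows a query-then-page pattern, which the download loop in the next hunk relies on. A minimal sketch of that pattern, assuming the same webhoseio package shown in the diff, a placeholder token, a shortened query, and that an empty 'posts' list marks the last batch:

import webhoseio

webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")  # placeholder key
query_params = {"q": "thread.title:merger site_type:news language:english",
                "sort": "crawled"}
# first batch of up to 100 posts
output = webhoseio.query("filterWebContent", query_params)
while output['posts']:
    for post in output['posts']:
        print(post['thread']['site'], post['title'])
    # fetch the next batch of up to 100 posts
    output = webhoseio.get_next()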
@@ -60,48 +53,55 @@
# 100 articles per batch (download)
num_downloads = int(sum_posts / 100)
print('# collecting first {} articles'.format(num_downloads * 100))
print('# sorting out other sources than reuters')
print('# ...')
# twodimensional list of all articles
list_articles = []
with open(Requester.filestring, 'w', newline='') as csvfile:
writer = csv.writer(csvfile,
delimiter='|',
quotechar='\'',
quoting=csv.QUOTE_NONNUMERIC)
# write header / column names
writer.writerow(['Uuid', #0
'Title', #1
'Text', #2
'Site', #3
'SiteSection', #4
'Url', #5
'Timestamp']) #6
for n in range(num_downloads):
# save next 100 articles
for i in range(100):
# check if correct source 'reuters'
if not re.search(r'reuters',
output['posts'][i]['thread']['site_section']):
continue
else:
article = []
article.append(output['posts'][i]['published']) # Timestamp
article.append(output['posts'][i]['title'].replace('|', ' ')) # Title
# remove white spaces and separators
text = output['posts'][i]['text'].replace('\n', ' ')\
.replace('\r', ' ').replace('|', ' ') # Text
section = output['posts'][i]['thread']['site_section'] # SiteSection
article.append(text)
# remove '\r' at end of some urls
section = section.replace('\r', '')
article.append(section)
# add article to list
list_articles.append(article)
## DORIS: WHY DON'T YOU WRITE IT DIRECTLY TO A CSV INSTEAD OF CONVERTING IT AGAIN?
for n in range(num_downloads):
# save next 100 articles
for i in range(100):
try:
# write article as row to csv
writer.writerow(# 0:'Uuid'
[output['posts'][i]['thread']['uuid'],
# 1:'Title'
output['posts'][i]['thread']['title']
.replace('|', '-'),
# 2:'Text'
output['posts'][i]['text'].replace('\n', ' ')\
.replace('\r', ' ').replace('|', '-'),
# 3:'Site'
output['posts'][i]['thread']['site'],
# 4:'SiteSection'
output['posts'][i]['thread']['site_section']
.replace('\r', ' '),
# 5:'Url'
output['posts'][i]['url'],
# 6:'Timestamp'
output['posts'][i]['published']])
# handle undefined characters (videos and other spam)
except UnicodeEncodeError:
print('# filtered out {} (UnicodeEncodeError)'
.format(output['posts'][i]['thread']['site_section']))
# Get the next batch of 100 posts
output = webhoseio.get_next()
# create DataFrame
df = pd.DataFrame(data=list_articles,
columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
# save csv
CsvHandler.write_csv(df, filestring)
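Aside: the pipe-delimited file produced by the new csv.writer block can be loaded back for a quick check. A minimal sketch, assuming pandas (imported in the old version) and an example filename following the webhoseio_articles_{date}.csv pattern:

import pandas as pd

# same delimiter and quote character as the csv.writer above
df = pd.read_csv('webhoseio_articles_2018-10-18.csv', sep='|', quotechar="'")
print(df[['Title', 'Site', 'Url']].head())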
if __name__ == '__main__':
# Get the next batch of 100 posts
output = webhoseio.get_next()
print('# starting requester')
print('# ...')
save_articles_from_webhoseio()
print('# ending requester')
if __name__ == '__main__':
print('# starting requester')
print('# ...')
Requester.save_articles_from_webhoseio()
print('# saved articles in file {}'.format(Requester.filestring))
print('# ending requester')
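Aside: the UnicodeEncodeError handled in the new download loop usually comes from characters that the platform's default file encoding cannot represent. A sketch of an alternative, not part of this commit, that opens the target file as UTF-8 explicitly:

import csv

# assumption: an explicit UTF-8 encoding lets writerow() handle characters
# outside the platform's default codepage instead of raising UnicodeEncodeError
with open('webhoseio_articles_example.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter='|', quotechar="'",
                        quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(['Uuid', 'Title', 'Text', 'Site', 'SiteSection', 'Url', 'Timestamp'])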