refactoring

Anne Lorenz 2018-10-18 12:11:11 +02:00
parent cd3a90101f
commit a0471f3087
1 changed file with 62 additions and 62 deletions


@@ -6,38 +6,30 @@ retrieves JSON files from webhose.io
 saves articles' relevant information in csv file
 '''
 
-# toDo: add Uuid, URL, Site and change order to:
-# Title, Text, Site, SiteSection, Url, Timestamp
-# toDo: insert personal webhose key
+# toDo: update your webhose query and
+# insert personal webhose key
 
-import re
+import csv
 from datetime import datetime
 import pandas as pd
 import webhoseio
-from CsvHandler import CsvHandler
 
 class Requester:
 
-    def save_articles_from_webhoseio():
-        ''' create DataFrame of articles with
-        Timestamp, Title, Text, SiteSection
-        and then save it in csv target file
-        '''
-        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
-        filestring = 'download_articles_{}.csv'.format(datestring)
-
-        # print message
+    datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
+    filestring = 'webhoseio_articles_{}.csv'.format(datestring)
+
+    # personal API key
+    # webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
+    webhoseio.config(token="6a8c073f-8ba1-47a6-96e8-7603f228948d")
+
+    def save_articles_from_webhoseio():
         print('# retrieving articles from webhose.io')
         print('# ...')
-
-        # personal API key
-        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
 
         # webhose.io query
+        # suboptimal: usage of search terms :-(
         query_params = {
             "q": "thread.title:(merger OR merges OR merge OR merged OR "
                  "acquisition OR \"take over\" OR \"take-over\" OR "
@@ -46,10 +38,11 @@ class Requester:
                  "\"combined company\") "
                  "is_first:true "
                  "site_type:news "
-                 "site:reuters.com "
+                 "site:(nytimes.com OR reuters.com OR bloomberg.com OR "
+                 "cnn.com OR economist.com OR theguardian.com) "
                  "language:english "
                  "has_video:false",
-            "ts": "1527411742661",
+            "ts": "1537264167885",
             "sort": "crawled"}
 
         output = webhoseio.query("filterWebContent", query_params)
@@ -60,48 +53,55 @@ class Requester:
         # 100 articles per batch (download)
         num_downloads = int(sum_posts / 100)
         print('# collecting first {} articles'.format(num_downloads * 100))
-        print('# sorting out other sources than reuters')
-        print('# ...')
 
-        # twodimensional list of all articles
-        list_articles = []
-
-        for n in range(num_downloads):
-            # save next 100 articles
-            for i in range(100):
-                # check if correct source 'reuters'
-                if not re.search(r'reuters',
-                                 output['posts'][i]['thread']['site_section']):
-                    continue
-                else:
-                    article = []
-                    article.append(output['posts'][i]['published']) # Timestamp
-                    article.append(output['posts'][i]['title'].replace('|', ' ')) # Title
-                    # remove white spaces and separators
-                    text = output['posts'][i]['text'].replace('\n', ' ')\
-                           .replace('\r', ' ').replace('|', ' ') # Text
-                    section = output['posts'][i]['thread']['site_section'] # SiteSection
-                    article.append(text)
-                    # remove '\r' at end of some urls
-                    section = section.replace('\r', '')
-                    article.append(section)
-                    # add article to list
-                    list_articles.append(article)
-            ## DORIS: WHY DON'T YOU WRITE IT DIRECTLY TO A CSV INSTEAD OF CONVERTING AGAIN?
-            # Get the next batch of 100 posts
-            output = webhoseio.get_next()
+        with open(Requester.filestring, 'w', newline='') as csvfile:
+            writer = csv.writer(csvfile,
+                                delimiter='|',
+                                quotechar='\'',
+                                quoting=csv.QUOTE_NONNUMERIC)
+            # write header / column names
+            writer.writerow(['Uuid',        #0
+                             'Title',       #1
+                             'Text',        #2
+                             'Site',        #3
+                             'SiteSection', #4
+                             'Url',         #5
+                             'Timestamp'])  #6
+
+            for n in range(num_downloads):
+                # save next 100 articles
+                for i in range(100):
+                    try:
+                        # write article as row to csv
+                        writer.writerow(# 0:'Uuid'
+                                        [output['posts'][i]['thread']['uuid'],
+                                         # 1:'Title'
+                                         output['posts'][i]['thread']['title']
+                                         .replace('|', '-'),
+                                         # 2:'Text'
+                                         output['posts'][i]['text'].replace('\n', ' ')\
+                                         .replace('\r', ' ').replace('|', '-'),
+                                         # 3:'Site'
+                                         output['posts'][i]['thread']['site'],
+                                         # 4:'SiteSection'
+                                         output['posts'][i]['thread']['site_section']
+                                         .replace('\r', ' '),
+                                         # 5:'Url'
+                                         output['posts'][i]['url'],
+                                         # 6:'Timestamp'
+                                         output['posts'][i]['published']])
+                    # handle undefined characters (videos and other spam)
+                    except UnicodeEncodeError:
+                        print('# filtered out {} (UnicodeEncodeError)'
+                              .format(output['posts'][i]['thread']['site_section']))
+
+                # Get the next batch of 100 posts
+                output = webhoseio.get_next()
 
-        # create DataFrame
-        df = pd.DataFrame(data=list_articles,
-                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
-        # save csv
-        CsvHandler.write_csv(df, filestring)
 
-if __name__ == '__main__':
-    print('# starting requester')
-    print('# ...')
-    save_articles_from_webhoseio()
-    print('# ending requester')
+if __name__ == '__main__':
+    print('# starting requester')
+    print('# ...')
+    Requester.save_articles_from_webhoseio()
+    print('# saved articles in file {}'.format(Requester.filestring))
+    print('# ending requester')