From a0471f3087ac20843e99ff6f456534e882a1c100 Mon Sep 17 00:00:00 2001
From: Anne Lorenz
Date: Thu, 18 Oct 2018 12:11:11 +0200
Subject: [PATCH] refactoring

---
 Requester.py | 124 +++++++++++++++++++++++++--------------------------
 1 file changed, 62 insertions(+), 62 deletions(-)

diff --git a/Requester.py b/Requester.py
index b6db7bb..45ca62e 100644
--- a/Requester.py
+++ b/Requester.py
@@ -6,38 +6,30 @@
 retrieves JSON files from webhose.io
 saves articles' relevant information in csv file
 '''
 
-# toDo: add Uuid, URL, Site and change order to:
-# Title, Text, Site, SiteSection, Url, Timestamp
+# toDo: update your webhose query and
+# insert personal webhose key
 
-# toDo: insert personal webhose key
-
-import re
+import csv
 from datetime import datetime
 
 import pandas as pd
 import webhoseio
 
-from CsvHandler import CsvHandler
-
 class Requester:
 
-    def save_articles_from_webhoseio():
-        ''' create DataFrame of articles with
-        Timestamp, Title, Text, SiteSection
-        and then save it in csv target file
-        '''
-        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
-        filestring = 'download_articles_{}.csv'.format(datestring)
+    datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
+    filestring = 'webhoseio_articles_{}.csv'.format(datestring)
 
-        # print message
+    # personal API key
+    # webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
+    webhoseio.config(token="6a8c073f-8ba1-47a6-96e8-7603f228948d")
+
+    def save_articles_from_webhoseio():
+        print('# retrieving articles from webhose.io')
         print('# ...')
 
-        # personal API key
-        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
-
         # webhose.io query
-        # suboptimal: usage of search terms :-(
         query_params = {
             "q": "thread.title:(merger OR merges OR merge OR merged OR "
             "acquisition OR \"take over\" OR \"take-over\" OR "
@@ -46,10 +38,11 @@ class Requester:
             "\"combined company\") "
             "is_first:true "
             "site_type:news "
-            "site:reuters.com "
+            "site:(nytimes.com OR reuters.com OR bloomberg.com OR "
+            "cnn.com OR economist.com OR theguardian.com) "
             "language:english "
             "has_video:false",
-            "ts": "1527411742661",
+            "ts": "1537264167885",
            "sort": "crawled"}
 
        output = webhoseio.query("filterWebContent", query_params)
@@ -60,48 +53,55 @@
         # 100 articles per batch (download)
         num_downloads = int(sum_posts / 100)
         print('# collecting first {} articles'.format(num_downloads * 100))
-        print('# sorting out other sources than reuters')
-        print('# ...')
 
-        # twodimensional list of all articles
-        list_articles = []
+        with open(Requester.filestring, 'w', newline='') as csvfile:
+            writer = csv.writer(csvfile,
+                                delimiter='|',
+                                quotechar='\'',
+                                quoting=csv.QUOTE_NONNUMERIC)
+            # write header / column names
+            writer.writerow(['Uuid',        #0
+                             'Title',       #1
+                             'Text',        #2
+                             'Site',        #3
+                             'SiteSection', #4
+                             'Url',         #5
+                             'Timestamp'])  #6
 
-        for n in range(num_downloads):
-            # save next 100 articles
-            for i in range(100):
-                # check if correct source 'reuters'
-                if not re.search(r'reuters',
-                                 output['posts'][i]['thread']['site_section']):
-                    continue
-                else:
-                    article = []
-                    article.append(output['posts'][i]['published']) # Timestamp
-                    article.append(output['posts'][i]['title'].replace('|', ' ')) # Title
-                    # remove white spaces and separators
-                    text = output['posts'][i]['text'].replace('\n', ' ')\
-                        .replace('\r', ' ').replace('|', ' ') # Text
-                    section = output['posts'][i]['thread']['site_section'] # SiteSection
-                    article.append(text)
-                    # remove '\r' at end of some urls
-                    section = section.replace('\r', '')
-                    article.append(section)
-                    # add article to list
-                    list_articles.append(article)
-                    ## DORIS: WHY DON'T YOU WRITE IT DIRECTLY TO A CSV INSTEAD OF CONVERTING AGAIN?
+            for n in range(num_downloads):
+                # save next 100 articles
+                for i in range(100):
+                    try:
+                        # write article as row to csv
+                        writer.writerow(# 0:'Uuid'
+                                        [output['posts'][i]['thread']['uuid'],
+                                         # 1:'Title'
+                                         output['posts'][i]['thread']['title']
+                                         .replace('|', '-'),
+                                         # 2:'Text'
+                                         output['posts'][i]['text'].replace('\n', ' ')\
+                                         .replace('\r', ' ').replace('|', '-'),
+                                         # 3:'Site'
+                                         output['posts'][i]['thread']['site'],
+                                         # 4:'SiteSection'
+                                         output['posts'][i]['thread']['site_section']
+                                         .replace('\r', ' '),
+                                         # 5:'Url'
+                                         output['posts'][i]['url'],
+                                         # 6:'Timestamp'
+                                         output['posts'][i]['published']])
+                    # handle undefined characters (videos and other spam)
+                    except UnicodeEncodeError:
+                        print('# filtered out {} (UnicodeEncodeError)'
+                              .format(output['posts'][i]['thread']['site_section']))
 
-            # Get the next batch of 100 posts
-            output = webhoseio.get_next()
-
-
-        # create DataFrame
-        df = pd.DataFrame(data=list_articles,
-                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
-        # save csv
-        CsvHandler.write_csv(df, filestring)
-
-    if __name__ == '__main__':
+                # Get the next batch of 100 posts
+                output = webhoseio.get_next()
 
-        print('# starting requester')
-        print('# ...')
-        save_articles_from_webhoseio()
-        print('# ending requester')
\ No newline at end of file
+if __name__ == '__main__':
+
+    print('# starting requester')
+    print('# ...')
+    Requester.save_articles_from_webhoseio()
+    print('# saved articles in file {}'.format(Requester.filestring))
+    print('# ending requester')
\ No newline at end of file
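
Review note on the download loop: the new inner loop indexes output['posts'][i] over a fixed range(100). Since num_downloads is floored from sum_posts this should only ever see full batches, but iterating the returned list directly would also tolerate a short final batch. A minimal sketch of that variant, using only the webhoseio calls and fields already present in the patch (writer and num_downloads as defined above):

    for n in range(num_downloads):
        # iterate the posts actually returned instead of assuming exactly 100
        for post in output['posts']:
            try:
                writer.writerow([post['thread']['uuid'],
                                 post['thread']['title'].replace('|', '-'),
                                 post['text'].replace('\n', ' ')
                                             .replace('\r', ' ').replace('|', '-'),
                                 post['thread']['site'],
                                 post['thread']['site_section'].replace('\r', ' '),
                                 post['url'],
                                 post['published']])
            except UnicodeEncodeError:
                # same filter as in the patch: skip rows that cannot be encoded
                print('# filtered out {} (UnicodeEncodeError)'
                      .format(post['thread']['site_section']))
        # fetch the next batch of up to 100 posts
        output = webhoseio.get_next()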
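
A second note on the csv dialect: rows are written with delimiter '|', quotechar '\'' and QUOTE_NONNUMERIC, so any consumer must mirror those settings; pandas, which the module still imports, can do that on read. A minimal round-trip sketch, assuming a file produced by this script on the commit date (the concrete file name is illustrative):

    import pandas as pd

    # sep and quotechar must match the csv.writer settings in the patch
    df = pd.read_csv('webhoseio_articles_2018-10-18.csv',
                     sep='|', quotechar='\'')
    print(df[['Uuid', 'Site', 'Title']].head())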