refactoring

2018-10-18 12:11:11 +02:00 · 2018-10-18 12:11:11 +02:00 · a0471f3087
commit a0471f3087
parent cd3a90101f
1 changed files with 62 additions and 62 deletions
--- a/Requester.py
+++ b/Requester.py
@@ -6,38 +6,30 @@ retrieves JSON files from webhose.io
 saves articles' relevant information in csv file
 '''
-# toDo: add Uuid, URL, Site and change order to:
+# toDo: update your webhose query and 
-# Title, Text, Site, SiteSection, Url, Timestamp
+# insert personal webhose key
-# toDo: insert personal webhose key
+import csv
 import re
 from datetime import datetime
 import pandas as pd
 import webhoseio
 from CsvHandler import CsvHandler
 class Requester:
-    def save_articles_from_webhoseio():
+    datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
-        ''' create DataFrame of articles with
+    filestring = 'webhoseio_articles_{}.csv'.format(datestring)
        Timestamp, Title, Text, SiteSection
        and then save it in csv target file
        '''
        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
        filestring = 'download_articles_{}.csv'.format(datestring)
-        # print message
+    # personal API key
    # webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
    webhoseio.config(token="6a8c073f-8ba1-47a6-96e8-7603f228948d")
    def save_articles_from_webhoseio():
        print('# retrieving articles from webhose.io')
        print('# ...')
        # personal API key
        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
        # webhose.io query
        # suboptimal: usage of search terms :-(
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
                 "acquisition OR \"take over\" OR \"take-over\" OR "
@@ -46,10 +38,11 @@ class Requester:
                 "\"combined company\") "
                 "is_first:true "
                 "site_type:news "
-                 "site:reuters.com "
+                 "site:(nytimes.com OR reuters.com OR bloomberg.com OR "
                    "cnn.com OR economist.com OR theguardian.com) "
                 "language:english "
                 "has_video:false",
-            "ts": "1527411742661",
+            "ts": "1537264167885",
            "sort": "crawled"}
        output = webhoseio.query("filterWebContent", query_params)
@@ -60,48 +53,55 @@ class Requester:
        # 100 articles per batch (download)
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
        print('# sorting out other sources than reuters')
        print('# ...')
-        # twodimensional list of all articles
+        with open(Requester.filestring, 'w', newline='') as csvfile:
-        list_articles = []
+            writer = csv.writer(csvfile, 
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # write header / column names
            writer.writerow(['Uuid',        #0
                             'Title',       #1
                             'Text',        #2
                             'Site',        #3
                             'SiteSection', #4
                             'Url',         #5
                             'Timestamp'])  #6
-        for n in range(num_downloads):
+            for n in range(num_downloads):
-            # save next 100 articles
+                # save next 100 articles
-            for i in range(100):
+                for i in range(100):
-                # check if correct source 'reuters'
+                    try:
-                if not re.search(r'reuters',
+                        # write article as row to csv
-                                 output['posts'][i]['thread']['site_section']):
+                        writer.writerow(# 0:'Uuid'
-                    continue
+                                        [output['posts'][i]['thread']['uuid'],
-                else:
+                                        # 1:'Title'
-                    article = []
+                                        output['posts'][i]['thread']['title']
-                    article.append(output['posts'][i]['published']) # Timestamp
+                                                .replace('|', '-'),
-                    article.append(output['posts'][i]['title'].replace('|', ' ')) # Title
+                                        # 2:'Text'
-                    # remove white spaces and separators
+                                        output['posts'][i]['text'].replace('\n', ' ')\
-                    text = output['posts'][i]['text'].replace('\n', ' ')\
+                                                .replace('\r', ' ').replace('|', '-'),
-                           .replace('\r', ' ').replace('|', ' ') # Text
+                                        # 3:'Site'
-                    section = output['posts'][i]['thread']['site_section'] # SiteSection
+                                        output['posts'][i]['thread']['site'],
-                    article.append(text)
+                                        # 4:'SiteSection'
-                    # remove '\r' at end of some urls
+                                        output['posts'][i]['thread']['site_section']
-                    section = section.replace('\r', '')
+                                                .replace('\r', ' '),
-                    article.append(section)
+                                        # 5:'Url'
-                    # add article to list
+                                        output['posts'][i]['url'],
-                    list_articles.append(article)
+                                        # 6:'Timestamp'
-                    ## DORIS: WARUM SCHREIBST DU ES NICHT DIREKT IN EINE CSV, SONDERN KONVERTIERST NOCHMAL?
+                                        output['posts'][i]['published']])
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out {} (UnicodeEncodeError)'
                                .format(output['posts'][i]['thread']['site_section']))
-            # Get the next batch of 100 posts
+                # Get the next batch of 100 posts
-            output = webhoseio.get_next()
+                output = webhoseio.get_next()
        # create DataFrame
        df = pd.DataFrame(data=list_articles,
                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
        # save csv
        CsvHandler.write_csv(df, filestring)
     if __name__ == '__main__':
-        print('# starting requester')
+if __name__ == '__main__':
-        print('# ...')
+
-        save_articles_from_webhoseio()
+    print('# starting requester')
-        print('# ending requester')
+    print('# ...')
    Requester.save_articles_from_webhoseio()
    print('# saved articles in file {}'.format(Requester.filestring))
    print('# ending requester')