refactoring
parent cd3a90101f
commit a0471f3087
Requester.py (124 changed lines)
@@ -6,38 +6,30 @@ retrieves JSON files from webhose.io
 saves articles' relevant information in csv file
 '''
 
-# toDo: add Uuid, URL, Site and change order to:
-# Title, Text, Site, SiteSection, Url, Timestamp
+# toDo: update your webhose query and
+# insert personal webhose key
 
-# toDo: insert personal webhose key
-
-import re
+import csv
 from datetime import datetime
 
-import pandas as pd
 import webhoseio
 
-from CsvHandler import CsvHandler
-
 class Requester:
 
-    def save_articles_from_webhoseio():
-        ''' create DataFrame of articles with
-        Timestamp, Title, Text, SiteSection
-        and then save it in csv target file
-        '''
-        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
-        filestring = 'download_articles_{}.csv'.format(datestring)
+    datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
+    filestring = 'webhoseio_articles_{}.csv'.format(datestring)
 
-        # print message
+    # personal API key
+    # webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
+
+    def save_articles_from_webhoseio():
+
         print('# retrieving articles from webhose.io')
         print('# ...')
 
         # personal API key
-        webhoseio.config(token="6a8c073f-8ba1-47a6-96e8-7603f228948d")
+        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
 
         # webhose.io query
+        # suboptimal: usage of search terms :-(
         query_params = {
             "q": "thread.title:(merger OR merges OR merge OR merged OR "
                  "acquisition OR \"take over\" OR \"take-over\" OR "
@@ -46,10 +38,11 @@ class Requester:
                  "\"combined company\") "
                  "is_first:true "
                  "site_type:news "
-                 "site:reuters.com "
+                 "site:(nytimes.com OR reuters.com OR bloomberg.com OR "
+                 "cnn.com OR economist.com OR theguardian.com) "
                  "language:english "
                  "has_video:false",
-            "ts": "1527411742661",
+            "ts": "1537264167885",
             "sort": "crawled"}
 
         output = webhoseio.query("filterWebContent", query_params)
@@ -60,48 +53,55 @@ class Requester:
         # 100 articles per batch (download)
         num_downloads = int(sum_posts / 100)
         print('# collecting first {} articles'.format(num_downloads * 100))
-        print('# sorting out other sources than reuters')
         print('# ...')
 
-        # twodimensional list of all articles
-        list_articles = []
-        for n in range(num_downloads):
-            # save next 100 articles
-            for i in range(100):
-                # check if correct source 'reuters'
-                if not re.search(r'reuters',
-                                 output['posts'][i]['thread']['site_section']):
-                    continue
-                else:
-                    article = []
-                    article.append(output['posts'][i]['published'])  # Timestamp
-                    article.append(output['posts'][i]['title'].replace('|', ' '))  # Title
-                    # remove white spaces and separators
-                    text = output['posts'][i]['text'].replace('\n', ' ')\
-                        .replace('\r', ' ').replace('|', ' ')  # Text
-                    section = output['posts'][i]['thread']['site_section']  # SiteSection
-                    article.append(text)
-                    # remove '\r' at end of some urls
-                    section = section.replace('\r', '')
-                    article.append(section)
-                    # add article to list
-                    list_articles.append(article)
-            ## DORIS: WHY DON'T YOU WRITE IT DIRECTLY TO A CSV INSTEAD OF CONVERTING IT AGAIN?
-
-            # Get the next batch of 100 posts
-            output = webhoseio.get_next()
-
-        # create DataFrame
-        df = pd.DataFrame(data=list_articles,
-                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
-        # save csv
-        CsvHandler.write_csv(df, filestring)
+        with open(Requester.filestring, 'w', newline='') as csvfile:
+            writer = csv.writer(csvfile,
+                                delimiter='|',
+                                quotechar='\'',
+                                quoting=csv.QUOTE_NONNUMERIC)
+            # write header / column names
+            writer.writerow(['Uuid',         # 0
+                             'Title',        # 1
+                             'Text',         # 2
+                             'Site',         # 3
+                             'SiteSection',  # 4
+                             'Url',          # 5
+                             'Timestamp'])   # 6
+
+            for n in range(num_downloads):
+                # save next 100 articles
+                for i in range(100):
+                    try:
+                        # write article as row to csv
+                        writer.writerow(  # 0:'Uuid'
+                            [output['posts'][i]['thread']['uuid'],
+                             # 1:'Title'
+                             output['posts'][i]['thread']['title']
+                             .replace('|', '-'),
+                             # 2:'Text'
+                             output['posts'][i]['text'].replace('\n', ' ')
+                             .replace('\r', ' ').replace('|', '-'),
+                             # 3:'Site'
+                             output['posts'][i]['thread']['site'],
+                             # 4:'SiteSection'
+                             output['posts'][i]['thread']['site_section']
+                             .replace('\r', ' '),
+                             # 5:'Url'
+                             output['posts'][i]['url'],
+                             # 6:'Timestamp'
+                             output['posts'][i]['published']])
+                    # handle undefined characters (videos and other spam)
+                    except UnicodeEncodeError:
+                        print('# filtered out {} (UnicodeEncodeError)'
+                              .format(output['posts'][i]['thread']['site_section']))
+
+                # Get the next batch of 100 posts
+                output = webhoseio.get_next()
 
 
-if __name__ == '__main__':
-    print('# starting requester')
-    print('# ...')
-    save_articles_from_webhoseio()
-    print('# ending requester')
+if __name__ == '__main__':
+
+    print('# starting requester')
+    print('# ...')
+    Requester.save_articles_from_webhoseio()
+    print('# saved articles in file {}'.format(Requester.filestring))
+    print('# ending requester')
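
For reference, the download loop above boils down to the webhose.io batching pattern below. A minimal sketch, assuming a valid key in place of the placeholder token and assuming the response exposes 'totalResults'; only config(), query() and get_next() are confirmed by the code in this commit, and the API serves at most 100 posts per response.

import webhoseio

# assumption: replace the placeholder with a personal webhose.io key
webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

# same endpoint as in Requester; deliberately trivial query
output = webhoseio.query("filterWebContent",
                         {"q": "language:english", "sort": "crawled"})

num_downloads = int(output['totalResults'] / 100)  # 100 posts per batch
for n in range(num_downloads):
    for post in output['posts']:
        print(post['thread']['title'])
    # fetch the next batch of up to 100 posts
    output = webhoseio.get_next()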
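
On the try/except UnicodeEncodeError in the new loop: the exception is raised by writerow() when open() falls back to a narrow platform default encoding (for example cp1252 on Windows) and a post contains characters that codec cannot represent. A sketch of the usual alternative, passing an explicit encoding so no rows have to be dropped; the filename and row values are made up for illustration, and this change is not part of the commit.

import csv

# assumption: 'demo.csv' and the row contents are illustrative only;
# with encoding='utf-8', writerow() cannot raise UnicodeEncodeError here
with open('demo.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='|', quotechar='\'',
                        quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(['Société Générale', 'merger talks'])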
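
Because the writer uses a pipe delimiter, a single-quote quotechar and QUOTE_NONNUMERIC, any consumer has to read the file back with the same dialect. A sketch using pandas (the module the old code imported), assuming the file written by Requester already exists:

import pandas as pd

from Requester import Requester

# the read dialect must mirror the writer: pipe-delimited, single-quote quoting
df = pd.read_csv(Requester.filestring, sep='|', quotechar='\'')
print(df[['Uuid', 'Title', 'Timestamp']].head())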