'''
Requester
=========

Retrieves JSON results from webhose.io and saves the articles'
relevant information in a csv file.
'''
# TODO: insert personal webhose key

import re
from datetime import datetime

import pandas as pd
import webhoseio

from CsvHandler import CsvHandler


class Requester:

    @staticmethod
    def save_articles_from_webhoseio():
        '''
        Create a DataFrame of articles with Timestamp, Title, Text and
        SiteSection, then save it to the csv target file.
        '''
        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
        filestring = 'download_articles_{}.csv'.format(datestring)

        print('# retrieving articles from webhose.io')

        # personal API key
        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

        # webhose.io query
        # suboptimal: usage of search terms :-(
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
                 "acquisition OR \"take over\" OR \"take-over\" OR "
                 "\"takeover\" OR deal OR transaction OR buy OR sell OR "
                 "approval OR approve OR \"business combination\" OR "
                 "\"combined company\") "
                 "is_first:true "
                 "site_type:news "
                 "site:reuters.com "
                 "language:english "
                 "has_video:false",
            "ts": "1527411742661",
            "sort": "crawled"}

        output = webhoseio.query("filterWebContent", query_params)
        sum_posts = output['totalResults']
        print('# total sum of posts: ' + str(sum_posts))

        # webhose.io delivers the posts in batches of 100
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
        print('# sorting out other sources than reuters')

        # two-dimensional list of all articles
        list_articles = []

        for n in range(num_downloads):
            # save the next batch of (up to) 100 articles
            for post in output['posts']:
                # keep only posts whose site section belongs to reuters
                if not re.search(r'reuters', post['thread']['site_section']):
                    continue
                article = []
                article.append(post['published'])
                article.append(post['title'].replace('|', ' '))
                # remove line breaks and separators
                text = (post['text'].replace('\n', ' ')
                        .replace('\r', ' ').replace('|', ' '))
                article.append(text)
                # remove '\r' at the end of some urls
                section = post['thread']['site_section'].replace('\r', '')
                article.append(section)
                # add article to list
                list_articles.append(article)

            # get the next batch of 100 posts
            output = webhoseio.get_next()

        # create DataFrame
        df = pd.DataFrame(data=list_articles,
                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])

        # save csv
        CsvHandler.write_csv(df, filestring)
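
# Example usage (a minimal sketch): running this module directly triggers the
# download. Assumes CsvHandler.write_csv(df, filename) takes a DataFrame and a
# target file name, matching the call above; CsvHandler is defined elsewhere
# in this project.
if __name__ == '__main__':
    Requester.save_articles_from_webhoseio()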