# -*- coding: utf-8 -*-
'''
JSON Handler
============

JSON Handler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''

import csv
import glob
import json

import numpy as np
import pandas as pd


class JsonHandler:

    @staticmethod
    def select_randoms(df, n):
        '''selects n random samples from the dataset.
        params:  df DataFrame to select items from,
                 n  number of items to select randomly,
        returns: new DataFrame with only the selected items
        '''
        # seed the random generator => reproducible sequence
        np.random.seed(5)
        # add a new column 'Random' filled with random numbers
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort the DataFrame by the random numbers
        df = df.sort_values('Random')
        # return the first n elements of the randomly sorted dataset
        return df.iloc[0:n]
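
    # Aside: a sketch of the idiomatic pandas alternative (an assumption,
    # not what select_randoms above uses). df.sample draws the same kind
    # of reproducible random subset without the helper 'Random' column:
    #
    #   sample = df.sample(n=n, random_state=5)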

    @staticmethod
    def create_csv(file_name):
        # create a new csv file for each month.
        # each row contains one news article.
        with open(file_name, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # write header / column names
            writer.writerow(['Uuid',         # 0
                             'Title',        # 1
                             'Text',         # 2
                             'Site',         # 3
                             'SiteSection',  # 4
                             'Url',          # 5
                             'Timestamp'])   # 6
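
    # Reading the file back requires the same csv dialect as the writer
    # above; a minimal sketch (the read-side parameters are an assumption
    # mirroring create_csv, not part of the original code):
    #
    #   df = pd.read_csv(file_name, delimiter='|', quotechar='\'')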

    @staticmethod
    def write_articles_to_csv(file_name):
        # path of the JSON files
        path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'
        files = glob.glob(path)

        # reliable sources (site_sections)
        site_sections = ['http://feeds.reuters.com/reuters/financialsNews',
                         'http://feeds.reuters.com/reuters/INbusinessNews',
                         'http://feeds.reuters.com/reuters/businessNews',
                         'http://feeds.reuters.com/reuters/companyNews',
                         'http://www.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/mergersNews',
                         'http://rss.cnn.com/rss/money_topstories.rss',
                         'http://rss.cnn.com/rss/money_latest.rss',
                         'http://www.economist.com/sections/business-finance/rss.xml',
                         'http://rss.cnn.com/rss/edition_business.rss',
                         'http://in.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/technologyNews',
                         'http://feeds.reuters.com/reuters/technologysectorNews',
                         'https://www.ft.com/companies/us',
                         'http://feeds.reuters.com/reuters/UKScienceNews',
                         'http://in.reuters.com/news/technology',
                         'http://in.reuters.com/finance/economy',
                         'https://www.bloomberg.com/middleeast',
                         'http://in.reuters.com/news/top-news']

        # file counter
        n = 0
        # article counter
        a = 0
        # read every JSON file in the current folder
        with open(file_name, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            for file in files:
                n += 1
                with open(file, encoding='utf-8') as f:
                    # the JSON content is converted to a dict
                    article_dict = json.load(f)
                    #print(n)
                    # leave out comments or posts, take only reuters as source
                    if ((article_dict['ord_in_thread'] != 0) or
                            (article_dict['language'] != 'english') or
                            (article_dict['thread']['spam_score'] > 0.3) or
                            (article_dict['thread']['site_section'] not in site_sections)):
                        continue
                    # pick only the relevant information of the article
                    # and put it in a list
                    article = [article_dict['thread']['uuid'],          # 0:'Uuid'
                               article_dict['thread']['title'],         # 1:'Title'
                               article_dict['text'],                    # 2:'Text'
                               article_dict['thread']['site'],          # 3:'Site'
                               article_dict['thread']['site_section'],  # 4:'SiteSection'
                               article_dict['url'],                     # 5:'Url'
                               article_dict['published']]               # 6:'Timestamp'

                    # remove newlines and the delimiter char
                    article[1] = article[1].replace('|', '-')  # in 'Title'
                    article[2] = article[2].replace('\n', ' ').replace('\r', ' ').replace('|', '-')  # in 'Text'

                    try:
                        writer.writerow(article)
                        a += 1
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out site_section: {} (UnicodeEncodeError)'
                              .format(article_dict['thread']['site_section']))
        print()
        print('# saved {} articles in file {}'.format(a, file_name))


if __name__ == '__main__':
    file_name = 'test.csv'
    JsonHandler.create_csv(file_name)
    JsonHandler.write_articles_to_csv(file_name)
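
    # select_randoms is never exercised above; a usage sketch, assuming
    # the csv written by write_articles_to_csv is read back with the
    # matching dialect (not part of the original workflow):
    #
    #   df = pd.read_csv(file_name, delimiter='|', quotechar='\'')
    #   sample = JsonHandler.select_randoms(df, 100)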