thesis-anne/FileHandler.py

# -*- coding: utf-8 -*-
'''
File Handler
============

FileHandler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''
import csv
import glob
import json

import numpy as np
import pandas as pd
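
# Typical usage (a sketch of the assumed workflow; both calls also appear,
# commented out, in the __main__ block at the bottom of this file):
#
#   FileHandler.write_articles_to_csv_files()  # JSON dumps -> data\articles\all_<month>.csv
#   FileHandler.create_labeling_dataset()      # sample ~10000 articles for labeling
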
class FileHandler:

    # strings for every month of the year
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
              '11', '12']

    def select_randoms(df, n):
        '''select n random samples from dataset.
        params: df DataFrame to select items from,
                n number of items to select randomly,
        return new DataFrame with only selected items.
        '''
        # initialize random => reproducible sequence
        np.random.seed(5)
        # add new column 'Random'
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by random numbers
        df = df.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return df.iloc[0:n]
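
    # Example of the assumed call pattern (mirrors create_labeling_dataset
    # below): the returned frame still carries the helper 'Random' column,
    # which the caller is expected to drop:
    #
    #   sample = FileHandler.select_randoms(df, 833)
    #   del sample['Random']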

    def create_labeling_dataset():
        # output file
        o_file = 'data\\interactive_labeling_dataset.csv'
        # create file and write header
        with open(o_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(['Uuid',         #0
                             'Title',        #1
                             'Text',         #2
                             'Site',         #3
                             'SiteSection',  #4
                             'Url',          #5
                             'Timestamp'])   #6
        for m in FileHandler.months:
            df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
                             delimiter='|',
                             header=0,
                             index_col=None,
                             engine='python',
                             quoting=csv.QUOTE_NONNUMERIC,
                             quotechar='\'')
            # number of articles to select per month (10000/12 = 833.33):
            # pick one more from every third month, so 8*833 + 4*834 = 10000
            n_select = 834 if m in ['03', '06', '09', '12'] else 833
            random_articles = FileHandler.select_randoms(df, n_select)
            del random_articles['Random']
            random_articles.to_csv(o_file,
                                   header=False,
                                   index=False,
                                   sep='|',
                                   mode='a',
                                   encoding='utf-8',
                                   quoting=csv.QUOTE_NONNUMERIC,
                                   quotechar='\'')

    def write_articles_to_csv_files():
        '''read JSON files, select articles and write them to csv.
        '''
        # reliable sources (site_sections)
        site_sections = []
        # read list from 'sections.txt' file
        with open('data\\sections.txt', 'r') as s_list:
            site_sections = s_list.read().split('\n')
        # article counter
        a = 0
        for m in FileHandler.months:
            # 1 output file per month
            output_file = 'data\\articles\\all_{}.csv'.format(m)
            # path of input JSON files per month
            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
                   '\\news_[0-9]*.json'.format(m)
            files = glob.glob(path)
            # file counter
            n = 0
            # write separate csv file for every month
            with open(output_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter='|',
                                    quotechar='\'',
                                    quoting=csv.QUOTE_NONNUMERIC)
                # write header / column names
                writer.writerow(['Uuid',         #0
                                 'Title',        #1
                                 'Text',         #2
                                 'Site',         #3
                                 'SiteSection',  #4
                                 'Url',          #5
                                 'Timestamp'])   #6
                # write articles
                for file in files:
                    n += 1
                    # read every JSON file
                    with open(file, encoding='utf-8') as f:
                        # JSON is converted to a dict
                        dict = json.load(f)
                        # check if comment or post
                        if ((dict['ord_in_thread'] != 0) or
                                # check if not english
                                (dict['language'] != 'english') or
                                # check if spam
                                (dict['thread']['spam_score'] > 0.3) or
                                # check if reliable source
                                (dict['thread']['site_section']
                                 not in site_sections) or
                                # check if text parsed correctly
                                ('Further company coverage:' in dict['text']) or
                                ('subscription' in dict['text']) or
                                ('subscribe' in dict['text']) or
                                (len(dict['text']) < 200)):
                            continue
                        else:
                            try:
                                # replace whitespaces and delimiter chars
                                # and write to csv
                                writer.writerow(
                                    [dict['thread']['uuid'],
                                     dict['thread']['title']
                                     .replace('|', '-'),
                                     dict['text']
                                     .replace('\n', '')
                                     .replace('\r', '')
                                     .replace('|', '-'),
                                     dict['thread']['site'],
                                     dict['thread']['site_section']
                                     .replace('\n', '')
                                     .replace('\r', ''),
                                     dict['url'],
                                     dict['published']])
                                a += 1
                            # handle undefined characters (videos and other spam)
                            except UnicodeEncodeError:
                                print('# filtered out: {} (UnicodeEncodeError)'
                                      .format(dict['thread']['site_section']))
            print('# saved articles in file {}, now {} in total'
                  .format(output_file, a))
            print('#')
        print('# saved {} articles in total'.format(a))
        print('#')

    def join_all_csv_files():
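        # The original file declares this method without a body; the following
        # is only a minimal sketch of what the name suggests, assuming the
        # monthly files written by write_articles_to_csv_files() are simply
        # concatenated into one csv with the same column layout and the
        # (assumed) output path 'data\\articles\\all.csv'.
        o_file = 'data\\articles\\all.csv'
        frames = []
        for m in FileHandler.months:
            frames.append(pd.read_csv('data\\articles\\all_{}.csv'.format(m),
                                      delimiter='|',
                                      header=0,
                                      index_col=None,
                                      engine='python',
                                      quoting=csv.QUOTE_NONNUMERIC,
                                      quotechar='\''))
        pd.concat(frames, ignore_index=True).to_csv(o_file,
                                                    index=False,
                                                    sep='|',
                                                    encoding='utf-8',
                                                    quoting=csv.QUOTE_NONNUMERIC,
                                                    quotechar='\'')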


if __name__ == '__main__':
    # FileHandler.write_articles_to_csv_files()
    # FileHandler.create_labeling_dataset()
    pass