# -*- coding: utf-8 -*-
'''
File Handler
============

FileHandler reads articles from JSON files,
extracts relevant information and
writes it to a CSV file.
'''
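
# Apparent processing order (inferred from the __main__ block and data flow):
#   1. write_articles_to_csv_files() -- JSON files -> one CSV per month
#   2. create_labeling_dataset()     -- sample 10000 articles from those CSVs
#   3. clean_articles()              -- strip non-printable characters
#   4. remove_duplicates()           -- drop articles with repeated headlines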

import csv
import glob
import json
import string

import numpy as np
import pandas as pd


class FileHandler:

    # strings for every month of the year
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
              '11', '12']
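
    # Equivalent sketch generating the same list programmatically:
    #
    #     months = ['{:02d}'.format(m) for m in range(1, 13)]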

    @staticmethod
    def select_randoms(df, n):
        '''select n random samples from the dataset.
        params: df DataFrame to select items from,
                n number of items to select randomly.
        return: new DataFrame with only the selected items.
        '''
        # seed the random generator => reproducible sequence
        np.random.seed(5)
        # add new column 'Random' filled with random numbers
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by the random numbers
        df = df.sort_values('Random')
        # return the first n elements of the randomly sorted dataset
        return df.iloc[0:n]
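
    # Note: pandas' built-in sampling would give a one-line alternative
    # (a different draw for the same seed, so results are not identical):
    #
    #     random_articles = df.sample(n=n, random_state=5)
    #
    # The explicit 'Random' column is kept here because
    # create_labeling_dataset() below deletes it after selection.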

    @staticmethod
    def create_labeling_dataset():
        '''select 10000 articles from the monthly CSV files and write
        them to a single CSV file for labeling.
        '''
        # output file
        o_file = '..\\data\\cleaned_data_set_without_header.csv'
        # create file and write header
        with open(o_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(['Uuid',        #0
                             'Title',       #1
                             'Text',        #2
                             'Site',        #3
                             'SiteSection', #4
                             'Url',         #5
                             'Timestamp'])  #6
        for m in FileHandler.months:
            df = pd.read_csv('..\\data\\articles\\all_{}.csv'.format(m),
                             delimiter='|',
                             header=0,
                             index_col=None,
                             engine='python',
                             quoting=csv.QUOTE_NONNUMERIC,
                             quotechar='\'')
            # number of articles to select from each month (10000/12 = 833.33),
            # so pick one extra from every third month:
            # 8 * 833 + 4 * 834 = 10000 articles in total
            n_select = 834 if m in ['03', '06', '09', '12'] else 833
            random_articles = FileHandler.select_randoms(df, n_select)
            del random_articles['Random']
            random_articles.to_csv(o_file,
                                   header=False,
                                   index=False,
                                   sep='|',
                                   mode='a',
                                   encoding='utf-8',
                                   quoting=csv.QUOTE_NONNUMERIC,
                                   quotechar='\'')
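
    # Hypothetical sanity check: the labeling set should contain exactly
    # 8 * 833 + 4 * 834 = 10000 articles.
    #
    #     df = pd.read_csv(o_file, delimiter='|', header=0, engine='python',
    #                      quoting=csv.QUOTE_NONNUMERIC, quotechar='\'')
    #     assert len(df) == 10000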

    @staticmethod
    def clean_articles():
        '''clean articles in the data set: filter out all non-printable
        characters from headline and text.
        '''
        # read data set
        file = '..\\data\\cleaned_data_set_without_header.csv'
        df = pd.read_csv(file,
                         delimiter='|',
                         header=None,
                         index_col=None,
                         engine='python',
                         quoting=csv.QUOTE_NONNUMERIC,
                         quotechar='\'')

        # for each article (row)
        for i in range(len(df)):
            # filter headline (column 1)
            df.iat[i, 1] = ''.join(x for x in df.iat[i, 1]
                                   if x in string.printable)
            # filter text (column 2)
            df.iat[i, 2] = ''.join(x for x in df.iat[i, 2]
                                   if x in string.printable)

        # save cleaned dataframe, overwriting the input file
        df.to_csv(file,
                  header=False,
                  index=False,
                  sep='|',
                  mode='w',
                  encoding='utf-8',
                  quoting=csv.QUOTE_NONNUMERIC,
                  quotechar='\'')
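
    # A vectorized sketch of the same cleaning step, avoiding the explicit
    # row loop (assumes columns 1 and 2 hold headline and text):
    #
    #     printable = set(string.printable)
    #     for col in (1, 2):
    #         df[col] = df[col].apply(
    #             lambda s: ''.join(c for c in s if c in printable))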

    @staticmethod
    def remove_duplicates():
        '''remove articles with exactly the same headline.
        '''
        # read data set
        file = '..\\data\\cleaned_data_set_without_header.csv'
        df = pd.read_csv(file,
                         delimiter='|',
                         header=None,
                         index_col=None,
                         engine='python',
                         quoting=csv.QUOTE_NONNUMERIC,
                         quotechar='\'')

        # keep only the first article for each headline (column 1)
        df = df.drop_duplicates(subset=[1], keep='first')

        # save cleaned dataframe, overwriting the input file
        df.to_csv(file,
                  header=False,
                  index=False,
                  sep='|',
                  mode='w',
                  encoding='utf-8',
                  quoting=csv.QUOTE_NONNUMERIC,
                  quotechar='\'')
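
    # For reference, the duplicated headlines themselves could be inspected
    # before dropping them:
    #
    #     dupes = df[df.duplicated(subset=[1], keep=False)]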

    @staticmethod
    def write_articles_to_csv_files():
        '''read JSON files, select articles and write them to csv.
        '''
        # reliable sources (site_sections)
        site_sections = []
        # read list from 'sections.txt' file
        with open('..\\data\\sections.txt', 'r') as s_list:
            site_sections = s_list.read().split('\n')

        # article counter
        a = 0
        for m in FileHandler.months:
            # 1 output file per month
            output_file = '..\\data\\articles\\all_{}.csv'.format(m)
            # path of input JSON files per month
            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
                   '\\news_[0-9]*.json'.format(m)
            files = glob.glob(path)

            # file counter
            n = 0
            # write separate csv file for every month
            with open(output_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter='|',
                                    quotechar='\'',
                                    quoting=csv.QUOTE_NONNUMERIC)

                # write header / column names
                writer.writerow(['Uuid',        #0
                                 'Title',       #1
                                 'Text',        #2
                                 'Site',        #3
                                 'SiteSection', #4
                                 'Url',         #5
                                 'Timestamp'])  #6
                # write articles
                for file in files:
                    n += 1
                    # read every JSON file
                    with open(file, encoding='utf-8') as f:
                        # JSON is converted to a dict
                        article = json.load(f)
                        # skip comments (not original posts), non-english
                        # articles, probable spam, articles from unreliable
                        # site sections, and incorrectly parsed or too
                        # short texts
                        if ((article['ord_in_thread'] != 0) or
                                (article['language'] != 'english') or
                                (article['thread']['spam_score'] > 0.3) or
                                (article['thread']['site_section']
                                 not in site_sections) or
                                ('Further company coverage:'
                                 in article['text']) or
                                ('subscription' in article['text']) or
                                ('subscribe' in article['text']) or
                                (len(article['text']) < 200)):
                            continue
                        try:
                            # replace whitespaces and delimiter chars
                            # and write to csv
                            writer.writerow(
                                [article['thread']['uuid'],
                                 article['thread']['title']
                                 .replace('|', '-'),
                                 article['text']
                                 .replace('\n', '')
                                 .replace('\r', '')
                                 .replace('|', '-'),
                                 article['thread']['site'],
                                 article['thread']['site_section']
                                 .replace('\n', '')
                                 .replace('\r', ''),
                                 article['url'],
                                 article['published']])
                            a += 1
                        # handle undefined characters
                        # (videos and other spam)
                        except UnicodeEncodeError:
                            print('# filtered out: {} (UnicodeEncodeError)'
                                  .format(article['thread']['site_section']))
            print('# saved articles in file {}, now {} in total'
                  .format(output_file, a))
        print('#')
        print('# saved {} articles in total'.format(a))
        print('#')
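
    # For reference, a minimal sketch of the JSON structure the filter above
    # assumes, with field names taken from the accesses in this method
    # (values are placeholders):
    #
    #     {
    #       "ord_in_thread": 0,
    #       "language": "english",
    #       "text": "...",
    #       "url": "...",
    #       "published": "...",
    #       "thread": {
    #         "uuid": "...",
    #         "title": "...",
    #         "site": "...",
    #         "site_section": "...",
    #         "spam_score": 0.0
    #       }
    #     }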


if __name__ == '__main__':
    # FileHandler.write_articles_to_csv_files()
    # FileHandler.create_labeling_dataset()
    FileHandler.remove_duplicates()