|
|
@ -24,9 +24,7 @@ class JsonHandler: |
|
|
|
n number of items to select randomly, |
|
|
|
returns new DataFrame with only selected items |
|
|
|
''' |
|
|
|
|
|
|
|
## df.sample(n=5, random_state=42) gives you 5 random values; is that what you are looking for? |
|
|
|
|
|
|
|
|
|
|
|
# initialize random => reproducible sequence |
|
|
|
np.random.seed(5) |
|
|
|
# add new column 'Random' |
|
|
@ -56,7 +54,9 @@ class JsonHandler: |
|
|
|
|
|
|
|
def write_articles_to_csv(file_name): |
|
|
|
# path of JSON files |
|
|
|
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json' |
|
|
|
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\ |
|
|
|
'\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4'\ |
|
|
|
'\\news_[0-9]*.json' |
|
|
|
files = glob.glob(path) |
|
|
|
|
|
|
|
# reliable sources (site_sections) |
|
|
@ -104,24 +104,25 @@ class JsonHandler: |
|
|
|
continue |
|
|
|
# pick only relevant information of article |
|
|
|
# and put it in a list |
|
|
|
article = [dict['thread']['uuid'], # 0:'Uuid' |
|
|
|
dict['thread']['title'], # 1:'Title' |
|
|
|
dict['text'], # 2:'Text' |
|
|
|
dict['thread']['site'], # 3:'Site' |
|
|
|
dict['thread']['site_section'], # 4:'SiteSection' |
|
|
|
dict['url'], # 5:'Url' |
|
|
|
dict['published']] # 6:'Timestamp' |
|
|
|
article = [dict['thread']['uuid'], # 0:'Uuid' |
|
|
|
dict['thread']['title'], # 1:'Title' |
|
|
|
dict['text'], # 2:'Text' |
|
|
|
dict['thread']['site'], # 3:'Site' |
|
|
|
dict['thread']['site_section'],# 4:'SiteSection' |
|
|
|
dict['url'], # 5:'Url' |
|
|
|
dict['published']] # 6:'Timestamp' |
|
|
|
|
|
|
|
# remove newlines and delimiter char |
|
|
|
article[1] = article[1].replace('|', '-') # in 'Title' |
|
|
|
article[2] = article[2].replace('\n', ' ').replace('\r', ' ').replace('|', '-') # in 'Text' |
|
|
|
# remove newlines and delimiter chars |
|
|
|
article[1] = article[1].replace('|', '-') |
|
|
|
article[2] = article[2].replace('\n', ' ')\ |
|
|
|
.replace('\r', ' ').replace('|', '-') |
|
|
|
|
|
|
|
try: |
|
|
|
writer.writerow(article) |
|
|
|
a += 1 |
|
|
|
# handle undefined characters (videos and other spam) |
|
|
|
except UnicodeEncodeError: |
|
|
|
print('# filtered out site_section: {} (UnicodeEncodeError)' |
|
|
|
print('# filtered out: {} (UnicodeEncodeError)' |
|
|
|
.format(dict['thread']['site_section'])) |
|
|
|
print() |
|
|
|
print('# saved {} articles in file {}'.format(a, file_name)) |
|
|
|