refactoring
This commit is contained in:
parent
0c15d49d7e
commit
b6e48feb16
@ -111,7 +111,7 @@ class DecisionTree:
|
||||
print('# starting decision tree')
|
||||
print('# ...')
|
||||
|
||||
file = 'classification_labelled_corrected.csv'
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
||||
|
@ -1,8 +1,8 @@
|
||||
'''
|
||||
Json Handler
|
||||
File Handler
|
||||
============
|
||||
|
||||
JsonHandler reads articles from JSON files,
|
||||
FileHandler reads articles from JSON files,
|
||||
extracts relevant information and
|
||||
writes it to a csv file.
|
||||
'''
|
||||
@ -16,9 +16,9 @@ import json
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
class JsonHandler:
|
||||
class FileHandler:
|
||||
|
||||
# string for every month of the year
|
||||
# strings for every month of the year
|
||||
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
|
||||
'11', '12']
|
||||
|
||||
@ -28,7 +28,6 @@ class JsonHandler:
|
||||
n number of items to select randomly,
|
||||
return new DataFrame with only selected items.
|
||||
'''
|
||||
|
||||
# initialize random => reproducible sequence
|
||||
np.random.seed(5)
|
||||
# add new column 'Random'
|
||||
@ -39,29 +38,44 @@ class JsonHandler:
|
||||
return df.iloc[0:n]
|
||||
|
||||
def create_labeling_dataset():
|
||||
# number of articles to select from each month:
|
||||
# 10.000 / 12 = 833,33
|
||||
# output file
|
||||
o_file = 'data\\interactive_labeling_dataset.csv'
|
||||
# create file and write header
|
||||
with open(o_file, 'w', newline='') as csvfile:
|
||||
writer = csv.writer(csvfile,
|
||||
delimiter='|',
|
||||
quotechar='\'',
|
||||
quoting=csv.QUOTE_NONNUMERIC)
|
||||
writer.writerow(['Uuid', #0
|
||||
'Title', #1
|
||||
'Text', #2
|
||||
'Site', #3
|
||||
'SiteSection', #4
|
||||
'Url', #5
|
||||
'Timestamp']) #6
|
||||
# number of articles to select from each month (10000 / 12 = 833.33, rounded down)
|
||||
n_select = 833
|
||||
# except every third month:
|
||||
every_third_month = ['03', '06', '09', '12']
|
||||
for m in JsonHandler.month:
|
||||
df = pandas.read_csv('all_{}.csv'.format(m),
|
||||
delimiter='|',
|
||||
header=0,
|
||||
index_col=None,
|
||||
engine='python',
|
||||
quotechar='\'',
|
||||
quoting=0,
|
||||
encoding='utf-8')
|
||||
for m in FileHandler.months:
|
||||
df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
|
||||
delimiter='|',
|
||||
header=0,
|
||||
index_col=None,
|
||||
engine='python',
|
||||
quoting=csv.QUOTE_NONNUMERIC,
|
||||
quotechar='\'')
|
||||
# pick one more from every third month
|
||||
if m in every_third_month:
|
||||
if m in ['03', '06', '09', '12']:
|
||||
n_select = 834
|
||||
JsonHandler.select_randoms(df, n_select).to_csv('labeling_dataset.csv',
|
||||
header=True,
|
||||
mode='a',
|
||||
encoding='python',
|
||||
quoting=QUOTE_MINIMAL,
|
||||
quotechar='\'')
|
||||
random_articles = FileHandler.select_randoms(df, n_select)
|
||||
del random_articles['Random']
|
||||
random_articles.to_csv(o_file,
|
||||
header=False,
|
||||
index=False,
|
||||
sep='|',
|
||||
mode='a',
|
||||
encoding='utf-8',
|
||||
quoting=csv.QUOTE_NONNUMERIC,
|
||||
quotechar='\'')
|
||||
|
||||
def write_articles_to_csv_files():
|
||||
'''read JSON files, select articles and write them to csv.
|
||||
@ -69,14 +83,14 @@ class JsonHandler:
|
||||
# reliable sources (site_sections)
|
||||
site_sections = []
|
||||
# read list from 'sections.txt' file
|
||||
with open('sections.txt', 'r') as s_list:
|
||||
with open('data\\sections.txt', 'r') as s_list:
|
||||
site_sections = s_list.read().split('\n')
|
||||
|
||||
# article counter
|
||||
a = 0
|
||||
for m in JsonHandler.months:
|
||||
for m in FileHandler.months:
|
||||
# 1 output file per month
|
||||
output_file = 'all_{}.csv'.format(m)
|
||||
output_file = 'data\\articles\\all_{}.csv'.format(m)
|
||||
# path of input JSON files per month
|
||||
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
|
||||
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
|
||||
@ -146,7 +160,8 @@ class JsonHandler:
|
||||
print('#')
|
||||
print('# saved {} articles in total'.format(a))
|
||||
print('#')
|
||||
def join_all_csv_files():
|
||||
|
||||
if __name__ == '__main__':
|
||||
JsonHandler.write_articles_to_csv_files()
|
||||
#JsonHandler.create_labeling_dataset()
|
||||
# FileHandler.write_articles_to_csv_files()
|
||||
# FileHandler.create_labeling_dataset()
|
13
NER.py
13
NER.py
@ -5,15 +5,15 @@ Named Entity Recognition (NER)
|
||||
Stanford NER takes a text as input and returns a list of entities
|
||||
such as persons, organizations and countries.
|
||||
'''
|
||||
import csv
|
||||
import os
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from nltk.tag import StanfordNERTagger
|
||||
from nltk.tokenize import word_tokenize
|
||||
|
||||
from CsvHandler import CsvHandler
|
||||
|
||||
class NER:
|
||||
|
||||
# toDo: complete lists:
|
||||
@ -118,8 +118,13 @@ class NER:
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
filepath = 'classification_labelled_corrected.csv'
|
||||
df = CsvHandler.read_csv(filepath)
|
||||
filepath = 'data\\classification_labelled_corrected.csv'
|
||||
df = pd.read_csv(filepath,
|
||||
sep='|',
|
||||
engine='python',
|
||||
decimal='.',
|
||||
quotechar='\'',
|
||||
quoting=csv.QUOTE_NONE)
|
||||
|
||||
# only articles with label==1
|
||||
df_hits = df[df['Label'] == 1]
|
||||
|
@ -177,7 +177,7 @@ class NaiveBayes:
|
||||
print('# starting naive bayes')
|
||||
print('# ...')
|
||||
|
||||
file = 'classification_labelled_corrected.csv'
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
||||
|
@ -153,7 +153,7 @@ class NaiveBayes_Interactive:
|
||||
print('# starting naive bayes')
|
||||
print('# ...')
|
||||
|
||||
file = 'classification_labelled_corrected.csv'
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
||||
|
2
SVM.py
2
SVM.py
@ -91,7 +91,7 @@ class SVM:
|
||||
print('# starting svm')
|
||||
print('# ...')
|
||||
|
||||
file = 'classification_labelled_corrected.csv'
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
||||
|
4514
data/articles/all_01.csv
Normal file
4514
data/articles/all_01.csv
Normal file
File diff suppressed because one or more lines are too long
3853
data/articles/all_02.csv
Normal file
3853
data/articles/all_02.csv
Normal file
File diff suppressed because one or more lines are too long
4299
data/articles/all_03.csv
Normal file
4299
data/articles/all_03.csv
Normal file
File diff suppressed because one or more lines are too long
3312
data/articles/all_04.csv
Normal file
3312
data/articles/all_04.csv
Normal file
File diff suppressed because one or more lines are too long
4127
data/articles/all_05.csv
Normal file
4127
data/articles/all_05.csv
Normal file
File diff suppressed because one or more lines are too long
3388
data/articles/all_06.csv
Normal file
3388
data/articles/all_06.csv
Normal file
File diff suppressed because one or more lines are too long
2372
data/articles/all_07.csv
Normal file
2372
data/articles/all_07.csv
Normal file
File diff suppressed because one or more lines are too long
2981
data/articles/all_08.csv
Normal file
2981
data/articles/all_08.csv
Normal file
File diff suppressed because one or more lines are too long
3296
data/articles/all_09.csv
Normal file
3296
data/articles/all_09.csv
Normal file
File diff suppressed because one or more lines are too long
3491
data/articles/all_10.csv
Normal file
3491
data/articles/all_10.csv
Normal file
File diff suppressed because one or more lines are too long
3391
data/articles/all_11.csv
Normal file
3391
data/articles/all_11.csv
Normal file
File diff suppressed because one or more lines are too long
2777
data/articles/all_12.csv
Normal file
2777
data/articles/all_12.csv
Normal file
File diff suppressed because one or more lines are too long
Can't render this file because it is too large.
|
10007
data/interactive_labeling_dataset.csv
Normal file
10007
data/interactive_labeling_dataset.csv
Normal file
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user