refactoring

This commit is contained in:
Anne Lorenz 2018-10-22 11:53:03 +02:00
parent 0c15d49d7e
commit b6e48feb16
21 changed files with 51866 additions and 38 deletions

View File

@ -111,7 +111,7 @@ class DecisionTree:
print('# starting decision tree') print('# starting decision tree')
print('# ...') print('# ...')
file = 'classification_labelled_corrected.csv' file = 'data\\classification_labelled_corrected.csv'
# read csv file # read csv file
print('# reading dataset') print('# reading dataset')

View File

@ -1,8 +1,8 @@
''' '''
Json Handler File Handler
============ ============
JsonHandler reads articles from JSON files, FileHandler reads articles from JSON files,
extracts relevant information and extracts relevant information and
writes it to a csv file. writes it to a csv file.
''' '''
@ -16,9 +16,9 @@ import json
import numpy as np import numpy as np
import pandas as pd import pandas as pd
class JsonHandler: class FileHandler:
# string for every month of the year # strings for every month of the year
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
'11', '12'] '11', '12']
@ -28,7 +28,6 @@ class JsonHandler:
n number of items to select randomly, n number of items to select randomly,
return new DataFrame with only selected items. return new DataFrame with only selected items.
''' '''
# initialize random => reproducible sequence # initialize random => reproducible sequence
np.random.seed(5) np.random.seed(5)
# add new column 'Random' # add new column 'Random'
@ -39,29 +38,44 @@ class JsonHandler:
return df.iloc[0:n] return df.iloc[0:n]
def create_labeling_dataset(): def create_labeling_dataset():
# number of articles to select from each month: # output file
# 10.000 / 12 = 833,33 o_file = 'data\\interactive_labeling_dataset.csv'
# create file and write header
with open(o_file, 'w', newline='') as csvfile:
writer = csv.writer(csvfile,
delimiter='|',
quotechar='\'',
quoting=csv.QUOTE_NONNUMERIC)
writer.writerow(['Uuid', #0
'Title', #1
'Text', #2
'Site', #3
'SiteSection', #4
'Url', #5
'Timestamp']) #6
# number of articles to select from each month (10000 / 12 = 833.33)
n_select = 833 n_select = 833
# except every third month: for m in FileHandler.months:
every_third_month = ['03', '06', '09', '12'] df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
for m in JsonHandler.month: delimiter='|',
df = pandas.read_csv('all_{}.csv'.format(m), header=0,
delimiter='|', index_col=None,
header=0, engine='python',
index_col=None, quoting=csv.QUOTE_NONNUMERIC,
engine='python', quotechar='\'')
quotechar='\'',
quoting=0,
encoding='utf-8')
# pick one more from every third month # pick one more from every third month
if m in every_third_month: if m in ['03', '06', '09', '12']:
n_select = 834 n_select = 834
JsonHandler.select_randoms(df, n_select).to_csv('labeling_dataset.csv', random_articles = FileHandler.select_randoms(df, n_select)
header=True, del random_articles['Random']
mode='a', random_articles.to_csv(o_file,
encoding='python', header=False,
quoting=QUOTE_MINIMAL, index=False,
quotechar='\'') sep='|',
mode='a',
encoding='utf-8',
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
def write_articles_to_csv_files(): def write_articles_to_csv_files():
'''read JSON files, select articles and write them to csv. '''read JSON files, select articles and write them to csv.
@ -69,14 +83,14 @@ class JsonHandler:
# reliable sources (site_sections) # reliable sources (site_sections)
site_sections = [] site_sections = []
# read list from 'sections.txt' file # read list from 'sections.txt' file
with open('sections.txt', 'r') as s_list: with open('data\\sections.txt', 'r') as s_list:
site_sections = s_list.read().split('\n') site_sections = s_list.read().split('\n')
# article counter # article counter
a = 0 a = 0
for m in JsonHandler.months: for m in FileHandler.months:
# 1 output file per month # 1 output file per month
output_file = 'all_{}.csv'.format(m) output_file = 'data\\articles\\all_{}.csv'.format(m)
# path of input JSON files per month # path of input JSON files per month
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\ path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\ '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
@ -146,7 +160,8 @@ class JsonHandler:
print('#') print('#')
print('# saved {} articles in total'.format(a)) print('# saved {} articles in total'.format(a))
print('#') print('#')
def join_all_csv_files():
if __name__ == '__main__': if __name__ == '__main__':
JsonHandler.write_articles_to_csv_files() # FileHandler.write_articles_to_csv_files()
#JsonHandler.create_labeling_dataset() # FileHandler.create_labeling_dataset()

13
NER.py
View File

@ -5,15 +5,15 @@ Named Entity Recognition (NER)
Stanford NER takes a text as input and returns a list of entities Stanford NER takes a text as input and returns a list of entities
like persons, organizations and countries, e.g. like persons, organizations and countries, e.g.
''' '''
import csv
import os import os
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd
from nltk.tag import StanfordNERTagger from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from CsvHandler import CsvHandler
class NER: class NER:
# toDo: complete lists: # toDo: complete lists:
@ -118,8 +118,13 @@ class NER:
if __name__ == '__main__': if __name__ == '__main__':
filepath = 'classification_labelled_corrected.csv' filepath = 'data\\classification_labelled_corrected.csv'
df = CsvHandler.read_csv(filepath) df = pd.read_csv(filepath,
sep='|',
engine='python',
decimal='.',
quotechar='\'',
quoting=csv.QUOTE_NONE)
# only articles with label==1 # only articles with label==1
df_hits = df[df['Label'] == 1] df_hits = df[df['Label'] == 1]

View File

@ -177,7 +177,7 @@ class NaiveBayes:
print('# starting naive bayes') print('# starting naive bayes')
print('# ...') print('# ...')
file = 'classification_labelled_corrected.csv' file = 'data\\classification_labelled_corrected.csv'
# read csv file # read csv file
print('# reading dataset') print('# reading dataset')

View File

@ -153,7 +153,7 @@ class NaiveBayes_Interactive:
print('# starting naive bayes') print('# starting naive bayes')
print('# ...') print('# ...')
file = 'classification_labelled_corrected.csv' file = 'data\\classification_labelled_corrected.csv'
# read csv file # read csv file
print('# reading dataset') print('# reading dataset')

2
SVM.py
View File

@ -91,7 +91,7 @@ class SVM:
print('# starting svm') print('# starting svm')
print('# ...') print('# ...')
file = 'classification_labelled_corrected.csv' file = 'data\\classification_labelled_corrected.csv'
# read csv file # read csv file
print('# reading dataset') print('# reading dataset')

4514
data/articles/all_01.csv Normal file

File diff suppressed because one or more lines are too long

3853
data/articles/all_02.csv Normal file

File diff suppressed because one or more lines are too long

4299
data/articles/all_03.csv Normal file

File diff suppressed because one or more lines are too long

3312
data/articles/all_04.csv Normal file

File diff suppressed because one or more lines are too long

4127
data/articles/all_05.csv Normal file

File diff suppressed because one or more lines are too long

3388
data/articles/all_06.csv Normal file

File diff suppressed because one or more lines are too long

2372
data/articles/all_07.csv Normal file

File diff suppressed because one or more lines are too long

2981
data/articles/all_08.csv Normal file

File diff suppressed because one or more lines are too long

3296
data/articles/all_09.csv Normal file

File diff suppressed because one or more lines are too long

3491
data/articles/all_10.csv Normal file

File diff suppressed because one or more lines are too long

3391
data/articles/all_11.csv Normal file

File diff suppressed because one or more lines are too long

2777
data/articles/all_12.csv Normal file

File diff suppressed because one or more lines are too long

View File

Can't render this file because it is too large.

File diff suppressed because one or more lines are too long