refactoring

Anne Lorenz 2018-10-22 11:53:03 +02:00
parent 0c15d49d7e
commit b6e48feb16
21 changed files with 51866 additions and 38 deletions

@@ -111,7 +111,7 @@ class DecisionTree:
print('# starting decision tree')
print('# ...')
- file = 'classification_labelled_corrected.csv'
+ file = 'data\\classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
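The doubled backslashes are Python string escapes for single backslashes, so the new value points at data\classification_labelled_corrected.csv relative to the working directory. The read_csv options DecisionTree uses are not visible in this hunk; as a sketch, the relocated file can be read with the same options NER.py uses for it later in this commit:

import csv
import pandas as pd

# sketch only: mirrors the read options shown in the NER.py hunk below
file = 'data\\classification_labelled_corrected.csv'
df = pd.read_csv(file,
                 sep='|',
                 engine='python',
                 decimal='.',
                 quotechar='\'',
                 quoting=csv.QUOTE_NONE)
print(df.shape)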

@@ -1,8 +1,8 @@
'''
- Json Handler
+ File Handler
============
- JsonHandler reads articles from JSON files,
+ FileHandler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''
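Only the name changes here; the class still reads the raw JSON dumps and writes pipe-delimited CSV files. A minimal usage sketch, assuming the module file is named FileHandler.py (the file name itself is not shown in this view):

# hypothetical usage of the renamed class
from FileHandler import FileHandler

# 1) convert the monthly JSON dumps into data\articles\all_<month>.csv
FileHandler.write_articles_to_csv_files()
# 2) sample roughly 10 000 articles into data\interactive_labeling_dataset.csv
FileHandler.create_labeling_dataset()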
@@ -16,9 +16,9 @@ import json
import numpy as np
import pandas as pd
- class JsonHandler:
+ class FileHandler:
- # string for every month of the year
+ # strings for every month of the year
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
'11', '12']
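As a side note, the hard-coded month strings are equivalent to a short comprehension; a sketch:

# produces ['01', '02', ..., '12'], same as the literal list above
months = ['{:02d}'.format(m) for m in range(1, 13)]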
@@ -28,7 +28,6 @@ class JsonHandler:
n number of items to select randomly,
return new DataFrame with only selected items.
'''
# initialize random => reproducible sequence
np.random.seed(5)
# add new column 'Random'
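The hunk ends before the body of select_randoms is complete. Judging from the visible fragments (the fixed seed, the 'Random' column, the trailing return df.iloc[0:n], and the del random_articles['Random'] further down), the full method presumably looks roughly like the following sketch; it is an assumption, not the verbatim source:

def select_randoms(df, n):
    '''drops rows that are not needed:
    n number of items to select randomly,
    return new DataFrame with only selected items.
    '''
    # initialize random => reproducible sequence
    np.random.seed(5)
    # add new column 'Random' with one random number per row
    df['Random'] = np.random.random(len(df))
    # sort by that column and keep the first n rows
    df = df.sort_values('Random')
    return df.iloc[0:n]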
@@ -39,28 +38,43 @@
return df.iloc[0:n]
def create_labeling_dataset():
- # number of articles to select from each month:
- # 10.000 / 12 = 833,33
+ # output file
+ o_file = 'data\\interactive_labeling_dataset.csv'
+ # create file and write header
+ with open(o_file, 'w', newline='') as csvfile:
+ writer = csv.writer(csvfile,
+ delimiter='|',
+ quotechar='\'',
+ quoting=csv.QUOTE_NONNUMERIC)
+ writer.writerow(['Uuid', #0
+ 'Title', #1
+ 'Text', #2
+ 'Site', #3
+ 'SiteSection', #4
+ 'Url', #5
+ 'Timestamp']) #6
+ # number of articles to select from each month (10000/12=833,33)
n_select = 833
- # except every third month:
- every_third_month = ['03', '06', '09', '12']
- for m in JsonHandler.month:
- df = pandas.read_csv('all_{}.csv'.format(m),
+ for m in FileHandler.months:
+ df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
delimiter='|',
header=0,
index_col=None,
engine='python',
- quotechar='\'',
- quoting=0,
- encoding='utf-8')
+ quoting=csv.QUOTE_NONNUMERIC,
+ quotechar='\'')
# pick one more from every third article
- if m in every_third_month:
+ if m in ['03', '06', '09', '12']:
n_select = 834
- JsonHandler.select_randoms(df, n_select).to_csv('labeling_dataset.csv',
- header=True,
+ random_articles = FileHandler.select_randoms(df, n_select)
+ del random_articles['Random']
+ random_articles.to_csv(o_file,
+ header=False,
index=False,
sep='|',
mode='a',
- encoding='python',
- quoting=QUOTE_MINIMAL,
+ encoding='utf-8',
+ quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
def write_articles_to_csv_files():
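The intended sample size works out to 8 months of 833 articles plus the four months '03', '06', '09', '12' with 834 articles each, i.e. 8 * 833 + 4 * 834 = 6664 + 3336 = 10000. A sketch of reading the finished file back with the same pipe and quote conventions the writer uses above (the column names come from the header row written at the start of create_labeling_dataset):

import csv
import pandas as pd

# sketch only: read back the sampled dataset written above
df = pd.read_csv('data\\interactive_labeling_dataset.csv',
                 delimiter='|',
                 header=0,            # 'Uuid', 'Title', ..., 'Timestamp'
                 index_col=None,
                 engine='python',
                 quoting=csv.QUOTE_NONNUMERIC,
                 quotechar='\'')
print(len(df))                        # number of sampled articles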
@@ -69,14 +83,14 @@ class JsonHandler:
# reliable sources (site_sections)
site_sections = []
# read list from 'sections.txt' file
- with open('sections.txt', 'r') as s_list:
+ with open('data\\sections.txt', 'r') as s_list:
site_sections = s_list.read().split('\n')
# article counter
a = 0
- for m in JsonHandler.months:
+ for m in FileHandler.months:
# 1 output file per month
- output_file = 'all_{}.csv'.format(m)
+ output_file = 'data\\articles\\all_{}.csv'.format(m)
# path of input JSON files per month
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
@@ -146,7 +160,8 @@ class JsonHandler:
print('#')
print('# saved {} articles in total'.format(a))
print('#')
+ def join_all_csv_files():
if __name__ == '__main__':
- JsonHandler.write_articles_to_csv_files()
- #JsonHandler.create_labeling_dataset()
+ # FileHandler.write_articles_to_csv_files()
+ # FileHandler.create_labeling_dataset()

NER.py (13 changed lines)

@@ -5,15 +5,15 @@ Named Entity Recognition (NER)
Stanford NER takes a text as input and returns a list of entities
like persons, organizations and countries, e.g.
'''
+ import csv
import os
import matplotlib.pyplot as plt
import numpy as np
+ import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
- from CsvHandler import CsvHandler
class NER:
# toDo: complete lists:
@@ -118,8 +118,13 @@ class NER:
if __name__ == '__main__':
- filepath = 'classification_labelled_corrected.csv'
- df = CsvHandler.read_csv(filepath)
+ filepath = 'data\\classification_labelled_corrected.csv'
+ df = pd.read_csv(filepath,
+ sep='|',
+ engine='python',
+ decimal='.',
+ quotechar='\'',
+ quoting=csv.QUOTE_NONE)
# only articles with label==1
df_hits = df[df['Label'] == 1]
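From here the articles with Label == 1 would be passed to the Stanford tagger imported above. A minimal sketch of that step, assuming the hit articles carry a 'Title' column like the other CSVs in this commit and using placeholder paths for the NER model and jar (the real paths are not part of this diff):

# hypothetical sketch: tag the title of each hit article
tagger = StanfordNERTagger(
    'english.all.3class.distsim.crf.ser.gz',   # placeholder model path
    'stanford-ner.jar')                        # placeholder jar path

for title in df_hits['Title']:
    tokens = word_tokenize(title)
    # list of (token, tag) pairs, e.g. ('Siemens', 'ORGANIZATION')
    print(tagger.tag(tokens))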

@@ -177,7 +177,7 @@ class NaiveBayes:
print('# starting naive bayes')
print('# ...')
- file = 'classification_labelled_corrected.csv'
+ file = 'data\\classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')

@@ -153,7 +153,7 @@ class NaiveBayes_Interactive:
print('# starting naive bayes')
print('# ...')
- file = 'classification_labelled_corrected.csv'
+ file = 'data\\classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')

SVM.py (2 changed lines)

@@ -91,7 +91,7 @@ class SVM:
print('# starting svm')
print('# ...')
- file = 'classification_labelled_corrected.csv'
+ file = 'data\\classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')

New data files added (file diffs suppressed because the lines are too long):

data/articles/all_01.csv (4514 lines)
data/articles/all_02.csv (3853 lines)
data/articles/all_03.csv (4299 lines)
data/articles/all_04.csv (3312 lines)
data/articles/all_05.csv (4127 lines)
data/articles/all_06.csv (3388 lines)
data/articles/all_07.csv (2372 lines)
data/articles/all_08.csv (2981 lines)
data/articles/all_09.csv (3296 lines)
data/articles/all_10.csv (3491 lines)
data/articles/all_11.csv (3391 lines)
data/articles/all_12.csv (2777 lines)
