refactoring
This commit is contained in:
parent
0c15d49d7e
commit
b6e48feb16
|
@ -111,7 +111,7 @@ class DecisionTree:
|
||||||
print('# starting decision tree')
|
print('# starting decision tree')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
file = 'classification_labelled_corrected.csv'
|
file = 'data\\classification_labelled_corrected.csv'
|
||||||
|
|
||||||
# read csv file
|
# read csv file
|
||||||
print('# reading dataset')
|
print('# reading dataset')
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
'''
|
'''
|
||||||
Json Handler
|
File Handler
|
||||||
============
|
============
|
||||||
|
|
||||||
JsonHandler reads articles from JSON files,
|
FileHandler reads articles from JSON files,
|
||||||
extracts relevant information and
|
extracts relevant information and
|
||||||
writes it to a csv file.
|
writes it to a csv file.
|
||||||
'''
|
'''
|
||||||
|
@ -16,9 +16,9 @@ import json
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
class JsonHandler:
|
class FileHandler:
|
||||||
|
|
||||||
# string for every month of the year
|
# strings for every month of the year
|
||||||
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
|
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
|
||||||
'11', '12']
|
'11', '12']
|
||||||
|
|
||||||
|
@ -28,7 +28,6 @@ class JsonHandler:
|
||||||
n number of items to select randomly,
|
n number of items to select randomly,
|
||||||
return new DataFrame with only selected items.
|
return new DataFrame with only selected items.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
# initialize random => reproducible sequence
|
# initialize random => reproducible sequence
|
||||||
np.random.seed(5)
|
np.random.seed(5)
|
||||||
# add new column 'Random'
|
# add new column 'Random'
|
||||||
|
@ -39,29 +38,44 @@ class JsonHandler:
|
||||||
return df.iloc[0:n]
|
return df.iloc[0:n]
|
||||||
|
|
||||||
def create_labeling_dataset():
|
def create_labeling_dataset():
|
||||||
# number of articles to select from each month:
|
# output file
|
||||||
# 10.000 / 12 = 833,33
|
o_file = 'data\\interactive_labeling_dataset.csv'
|
||||||
|
# create file and write header
|
||||||
|
with open(o_file, 'w', newline='') as csvfile:
|
||||||
|
writer = csv.writer(csvfile,
|
||||||
|
delimiter='|',
|
||||||
|
quotechar='\'',
|
||||||
|
quoting=csv.QUOTE_NONNUMERIC)
|
||||||
|
writer.writerow(['Uuid', #0
|
||||||
|
'Title', #1
|
||||||
|
'Text', #2
|
||||||
|
'Site', #3
|
||||||
|
'SiteSection', #4
|
||||||
|
'Url', #5
|
||||||
|
'Timestamp']) #6
|
||||||
|
# number of articles to select from each month (10000/12=833,33)
|
||||||
n_select = 833
|
n_select = 833
|
||||||
# except every third month:
|
for m in FileHandler.months:
|
||||||
every_third_month = ['03', '06', '09', '12']
|
df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
|
||||||
for m in JsonHandler.month:
|
delimiter='|',
|
||||||
df = pandas.read_csv('all_{}.csv'.format(m),
|
header=0,
|
||||||
delimiter='|',
|
index_col=None,
|
||||||
header=0,
|
engine='python',
|
||||||
index_col=None,
|
quoting=csv.QUOTE_NONNUMERIC,
|
||||||
engine='python',
|
quotechar='\'')
|
||||||
quotechar='\'',
|
|
||||||
quoting=0,
|
|
||||||
encoding='utf-8')
|
|
||||||
# pick one more from every third article
|
# pick one more from every third article
|
||||||
if m in every_third_month:
|
if m in ['03', '06', '09', '12']:
|
||||||
n_select = 834
|
n_select = 834
|
||||||
JsonHandler.select_randoms(df, n_select).to_csv('labeling_dataset.csv',
|
random_articles = FileHandler.select_randoms(df, n_select)
|
||||||
header=True,
|
del random_articles['Random']
|
||||||
mode='a',
|
random_articles.to_csv(o_file,
|
||||||
encoding='python',
|
header=False,
|
||||||
quoting=QUOTE_MINIMAL,
|
index=False,
|
||||||
quotechar='\'')
|
sep='|',
|
||||||
|
mode='a',
|
||||||
|
encoding='utf-8',
|
||||||
|
quoting=csv.QUOTE_NONNUMERIC,
|
||||||
|
quotechar='\'')
|
||||||
|
|
||||||
def write_articles_to_csv_files():
|
def write_articles_to_csv_files():
|
||||||
'''read JSON files, select articles and write them to csv.
|
'''read JSON files, select articles and write them to csv.
|
||||||
|
@ -69,14 +83,14 @@ class JsonHandler:
|
||||||
# reliable sources (site_sections)
|
# reliable sources (site_sections)
|
||||||
site_sections = []
|
site_sections = []
|
||||||
# read list from 'sections.txt' file
|
# read list from 'sections.txt' file
|
||||||
with open('sections.txt', 'r') as s_list:
|
with open('data\\sections.txt', 'r') as s_list:
|
||||||
site_sections = s_list.read().split('\n')
|
site_sections = s_list.read().split('\n')
|
||||||
|
|
||||||
# article counter
|
# article counter
|
||||||
a = 0
|
a = 0
|
||||||
for m in JsonHandler.months:
|
for m in FileHandler.months:
|
||||||
# 1 output file per month
|
# 1 output file per month
|
||||||
output_file = 'all_{}.csv'.format(m)
|
output_file = 'data\\articles\\all_{}.csv'.format(m)
|
||||||
# path of input JSON files per month
|
# path of input JSON files per month
|
||||||
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
|
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
|
||||||
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
|
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
|
||||||
|
@ -146,7 +160,8 @@ class JsonHandler:
|
||||||
print('#')
|
print('#')
|
||||||
print('# saved {} articles in total'.format(a))
|
print('# saved {} articles in total'.format(a))
|
||||||
print('#')
|
print('#')
|
||||||
|
def join_all_csv_files():
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
JsonHandler.write_articles_to_csv_files()
|
# FileHandler.write_articles_to_csv_files()
|
||||||
#JsonHandler.create_labeling_dataset()
|
# FileHandler.create_labeling_dataset()
|
13
NER.py
13
NER.py
|
@ -5,15 +5,15 @@ Named Entity Recognition (NER)
|
||||||
Stanford NER takes a text as input and returns a list of entities
|
Stanford NER takes a text as input and returns a list of entities
|
||||||
like persons, organizations and countries, e.g.
|
like persons, organizations and countries, e.g.
|
||||||
'''
|
'''
|
||||||
|
import csv
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
from nltk.tag import StanfordNERTagger
|
from nltk.tag import StanfordNERTagger
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
from CsvHandler import CsvHandler
|
|
||||||
|
|
||||||
class NER:
|
class NER:
|
||||||
|
|
||||||
# toDo: complete lists:
|
# toDo: complete lists:
|
||||||
|
@ -118,8 +118,13 @@ class NER:
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
filepath = 'classification_labelled_corrected.csv'
|
filepath = 'data\\classification_labelled_corrected.csv'
|
||||||
df = CsvHandler.read_csv(filepath)
|
df = pd.read_csv(filepath,
|
||||||
|
sep='|',
|
||||||
|
engine='python',
|
||||||
|
decimal='.',
|
||||||
|
quotechar='\'',
|
||||||
|
quoting=csv.QUOTE_NONE)
|
||||||
|
|
||||||
# only articles with label==1
|
# only articles with label==1
|
||||||
df_hits = df[df['Label'] == 1]
|
df_hits = df[df['Label'] == 1]
|
||||||
|
|
|
@ -177,7 +177,7 @@ class NaiveBayes:
|
||||||
print('# starting naive bayes')
|
print('# starting naive bayes')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
file = 'classification_labelled_corrected.csv'
|
file = 'data\\classification_labelled_corrected.csv'
|
||||||
|
|
||||||
# read csv file
|
# read csv file
|
||||||
print('# reading dataset')
|
print('# reading dataset')
|
||||||
|
|
|
@ -153,7 +153,7 @@ class NaiveBayes_Interactive:
|
||||||
print('# starting naive bayes')
|
print('# starting naive bayes')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
file = 'classification_labelled_corrected.csv'
|
file = 'data\\classification_labelled_corrected.csv'
|
||||||
|
|
||||||
# read csv file
|
# read csv file
|
||||||
print('# reading dataset')
|
print('# reading dataset')
|
||||||
|
|
2
SVM.py
2
SVM.py
|
@ -91,7 +91,7 @@ class SVM:
|
||||||
print('# starting svm')
|
print('# starting svm')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
file = 'classification_labelled_corrected.csv'
|
file = 'data\\classification_labelled_corrected.csv'
|
||||||
|
|
||||||
# read csv file
|
# read csv file
|
||||||
print('# reading dataset')
|
print('# reading dataset')
|
||||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Can't render this file because it is too large.
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue