refactoring

2018-10-22 11:53:03 +02:00 · 2018-10-22 11:53:03 +02:00 · b6e48feb16
parent 0c15d49d7e
commit b6e48feb16
21 changed files with 51866 additions and 38 deletions
--- a/DecisionTree.py
+++ b/DecisionTree.py
@ -111,7 +111,7 @@ class DecisionTree:
        print('# starting decision tree')
        print('# ...')
-        file = 'classification_labelled_corrected.csv'
+        file = 'data\\classification_labelled_corrected.csv'
        # read csv file
        print('# reading dataset')
--- a/FileHandler.py
+++ b/FileHandler.py
@ -1,8 +1,8 @@
 '''
-Json Handler
+File Handler
 ============
-JsonHandler reads articles from JSON files,
+FileHandler reads articles from JSON files,
 extracts relevant information and
 writes it to a csv file.
 '''
@ -16,9 +16,9 @@ import json
 import numpy as np
 import pandas as pd
-class JsonHandler:
+class FileHandler:
-    # string for every month of the year
+    # strings for every month of the year
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
              '11', '12']
@ -28,7 +28,6 @@ class JsonHandler:
        n number of items to select randomly,
        return new DataFrame with only selected items.
        '''
        # initialize random => reproducible sequence
        np.random.seed(5)
        # add new column 'Random'
@ -39,29 +38,44 @@ class JsonHandler:
        return df.iloc[0:n]
    def create_labeling_dataset():
-        # number of articles to select from each month:
+        # output file
-        # 10.000 / 12 = 833,33
+        o_file = 'data\\interactive_labeling_dataset.csv'
        # create file and write header
        with open(o_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, 
                                delimiter='|',
                                quotechar='\'', 
                                quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(['Uuid',        #0
                             'Title',       #1
                             'Text',        #2
                             'Site',        #3
                             'SiteSection', #4
                             'Url',         #5
                             'Timestamp'])  #6
        # number of articles to select from each month (10000/12=833,33)
        n_select = 833
-        # except every third month:
+        for m in FileHandler.months:
-        every_third_month = ['03', '06', '09', '12']
+            df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
-        for m in JsonHandler.month:
+                             delimiter='|',
-            df = pandas.read_csv('all_{}.csv'.format(m),
+                             header=0,
-                                  delimiter='|',
+                             index_col=None,
-                                  header=0,
+                             engine='python',
-                                  index_col=None,
+                             quoting=csv.QUOTE_NONNUMERIC,
-                                  engine='python',
+                             quotechar='\'')
                                  quotechar='\'',
                                  quoting=0,
                                  encoding='utf-8')
            # pick one more from every third month
-            if m in every_third_month:
+            if m in ['03', '06', '09', '12']:
                n_select = 834
-            JsonHandler.select_randoms(df, n_select).to_csv('labeling_dataset.csv', 
+            random_articles = FileHandler.select_randoms(df, n_select)
-                                                            header=True, 
+            del random_articles['Random']
-                                                            mode='a', 
+            random_articles.to_csv(o_file,
-                                                            encoding='python', 
+                                   header=False,
-                                                            quoting=QUOTE_MINIMAL, 
+                                   index=False,
-                                                            quotechar='\'')
+                                   sep='|',
                                   mode='a',
                                   encoding='utf-8',
                                   quoting=csv.QUOTE_NONNUMERIC,
                                   quotechar='\'')
    def write_articles_to_csv_files():
        '''read JSON files, select articles and write them to csv.
@ -69,14 +83,14 @@ class JsonHandler:
        # reliable sources (site_sections)
        site_sections = []
        # read list from 'sections.txt' file
-        with open('sections.txt', 'r') as s_list:
+        with open('data\\sections.txt', 'r') as s_list:
            site_sections = s_list.read().split('\n')
        # article counter
        a = 0
-        for m in JsonHandler.months:
+        for m in FileHandler.months:
            # 1 output file per month
-            output_file = 'all_{}.csv'.format(m)
+            output_file = 'data\\articles\\all_{}.csv'.format(m)
            # path of input JSON files per month
            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
@ -146,7 +160,8 @@ class JsonHandler:
            print('#')
        print('# saved {} articles in total'.format(a))
        print('#')
    def join_all_csv_files():
 if __name__ == '__main__':
-    JsonHandler.write_articles_to_csv_files()
+    # FileHandler.write_articles_to_csv_files()
-    #JsonHandler.create_labeling_dataset()
+    # FileHandler.create_labeling_dataset()
--- a/NER.py
+++ b/NER.py
@ -5,15 +5,15 @@ Named Entity Recognition (NER)
 Stanford NER takes a text as input and returns a list of entities
 like persons, organizations and countries, e.g.
 '''
 import csv
 import os
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize
 from CsvHandler import CsvHandler
 class NER:
    # toDo: complete lists:
@ -118,8 +118,13 @@ class NER:
 if __name__ == '__main__':
-    filepath = 'classification_labelled_corrected.csv'
+    filepath = 'data\\classification_labelled_corrected.csv'
-    df = CsvHandler.read_csv(filepath)
+    df = pd.read_csv(filepath,
                     sep='|',
                     engine='python',
                     decimal='.',
                     quotechar='\'',
                     quoting=csv.QUOTE_NONE)
    # only articles with label==1
    df_hits = df[df['Label'] == 1]
--- a/NaiveBayes.py
+++ b/NaiveBayes.py
@ -177,7 +177,7 @@ class NaiveBayes:
        print('# starting naive bayes')
        print('# ...')
-        file = 'classification_labelled_corrected.csv'
+        file = 'data\\classification_labelled_corrected.csv'
        # read csv file
        print('# reading dataset')
--- a/NaiveBayes_Interactive.py
+++ b/NaiveBayes_Interactive.py
@ -153,7 +153,7 @@ class NaiveBayes_Interactive:
        print('# starting naive bayes')
        print('# ...')
-        file = 'classification_labelled_corrected.csv'
+        file = 'data\\classification_labelled_corrected.csv'
        # read csv file
        print('# reading dataset')
--- a/SVM.py
+++ b/SVM.py
@ -91,7 +91,7 @@ class SVM:
        print('# starting svm')
        print('# ...')
-        file = 'classification_labelled_corrected.csv'
+        file = 'data\\classification_labelled_corrected.csv'
        # read csv file
        print('# reading dataset')
--- a/data/articles/all_01.csv
+++ b/data/articles/all_01.csv
--- a/data/articles/all_02.csv
+++ b/data/articles/all_02.csv
--- a/data/articles/all_03.csv
+++ b/data/articles/all_03.csv
--- a/data/articles/all_04.csv
+++ b/data/articles/all_04.csv
--- a/data/articles/all_05.csv
+++ b/data/articles/all_05.csv
--- a/data/articles/all_06.csv
+++ b/data/articles/all_06.csv
--- a/data/articles/all_07.csv
+++ b/data/articles/all_07.csv
--- a/data/articles/all_08.csv
+++ b/data/articles/all_08.csv
--- a/data/articles/all_09.csv
+++ b/data/articles/all_09.csv
--- a/data/articles/all_10.csv
+++ b/data/articles/all_10.csv
--- a/data/articles/all_11.csv
+++ b/data/articles/all_11.csv
--- a/data/articles/all_12.csv
+++ b/data/articles/all_12.csv
--- a/data/classification_labelled_corrected.csv
+++ b/data/classification_labelled_corrected.csv
--- a/data/interactive_labeling_dataset.csv
+++ b/data/interactive_labeling_dataset.csv
--- a/data/sections.txt
+++ b/data/sections.txt