refactoring

Anne Lorenz 2018-10-22 11:53:03 +02:00
parent 0c15d49d7e
commit b6e48feb16
21 changed files with 51866 additions and 38 deletions

@@ -111,7 +111,7 @@ class DecisionTree:
print('# starting decision tree')
print('# ...')
- file = 'classification_labelled_corrected.csv'
+ file = 'data\\classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
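The doubled backslashes are Python string escapes for single backslashes, so the new value points at data\classification_labelled_corrected.csv relative to the working directory. The read_csv options DecisionTree uses are not visible in this hunk; as a sketch, the relocated file can be read with the same options NER.py uses for it later in this commit:

import csv
import pandas as pd

# sketch only: mirrors the read options shown in the NER.py hunk below
file = 'data\\classification_labelled_corrected.csv'
df = pd.read_csv(file,
                 sep='|',
                 engine='python',
                 decimal='.',
                 quotechar='\'',
                 quoting=csv.QUOTE_NONE)
print(df.shape)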

@@ -1,8 +1,8 @@
'''
- Json Handler
+ File Handler
============
- JsonHandler reads articles from JSON files,
+ FileHandler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''
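Only the name changes here; the class still reads the raw JSON dumps and writes pipe-delimited CSV files. A minimal usage sketch, assuming the module file is named FileHandler.py (the file name itself is not shown in this view):

# hypothetical usage of the renamed class
from FileHandler import FileHandler

# 1) convert the monthly JSON dumps into data\articles\all_<month>.csv
FileHandler.write_articles_to_csv_files()
# 2) sample roughly 10 000 articles into data\interactive_labeling_dataset.csv
FileHandler.create_labeling_dataset()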
@@ -16,9 +16,9 @@ import json
import numpy as np
import pandas as pd
- class JsonHandler:
+ class FileHandler:
- # string for every month of the year
+ # strings for every month of the year
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
'11', '12']
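As a side note, the hard-coded month strings are equivalent to a short comprehension; a sketch:

# produces ['01', '02', ..., '12'], same as the literal list above
months = ['{:02d}'.format(m) for m in range(1, 13)]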
@@ -28,7 +28,6 @@ class JsonHandler:
n number of items to select randomly,
return new DataFrame with only selected items.
'''
# initialize random => reproducible sequence
np.random.seed(5)
# add new column 'Random'
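The hunk ends before the body of select_randoms is complete. Judging from the visible fragments (the fixed seed, the 'Random' column, the trailing return df.iloc[0:n], and the del random_articles['Random'] further down), the full method presumably looks roughly like the following sketch; it is an assumption, not the verbatim source:

def select_randoms(df, n):
    '''drops rows that are not needed:
    n number of items to select randomly,
    return new DataFrame with only selected items.
    '''
    # initialize random => reproducible sequence
    np.random.seed(5)
    # add new column 'Random' with one random number per row
    df['Random'] = np.random.random(len(df))
    # sort by that column and keep the first n rows
    df = df.sort_values('Random')
    return df.iloc[0:n]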
@@ -39,28 +38,43 @@
return df.iloc[0:n]
def create_labeling_dataset():
- # number of articles to select from each month:
- # 10.000 / 12 = 833,33
+ # output file
+ o_file = 'data\\interactive_labeling_dataset.csv'
+ # create file and write header
+ with open(o_file, 'w', newline='') as csvfile:
+ writer = csv.writer(csvfile,
+ delimiter='|',
+ quotechar='\'',
+ quoting=csv.QUOTE_NONNUMERIC)
+ writer.writerow(['Uuid', #0
+ 'Title', #1
+ 'Text', #2
+ 'Site', #3
+ 'SiteSection', #4
+ 'Url', #5
+ 'Timestamp']) #6
+ # number of articles to select from each month (10000/12=833,33)
n_select = 833
- # except every third month:
- every_third_month = ['03', '06', '09', '12']
- for m in JsonHandler.month:
- df = pandas.read_csv('all_{}.csv'.format(m),
+ for m in FileHandler.months:
+ df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
delimiter='|',
header=0,
index_col=None,
engine='python',
- quotechar='\'',
- quoting=0,
- encoding='utf-8')
+ quoting=csv.QUOTE_NONNUMERIC,
+ quotechar='\'')
# pick one more from every third article
- if m in every_third_month:
+ if m in ['03', '06', '09', '12']:
n_select = 834
- JsonHandler.select_randoms(df, n_select).to_csv('labeling_dataset.csv',
- header=True,
+ random_articles = FileHandler.select_randoms(df, n_select)
+ del random_articles['Random']
+ random_articles.to_csv(o_file,
+ header=False,
index=False,
sep='|',
mode='a',
- encoding='python',
- quoting=QUOTE_MINIMAL,
+ encoding='utf-8',
+ quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
def write_articles_to_csv_files():
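The intended sample size works out to 8 months of 833 articles plus the four months '03', '06', '09', '12' with 834 articles each, i.e. 8 * 833 + 4 * 834 = 6664 + 3336 = 10000. A sketch of reading the finished file back with the same pipe and quote conventions the writer uses above (the column names come from the header row written at the start of create_labeling_dataset):

import csv
import pandas as pd

# sketch only: read back the sampled dataset written above
df = pd.read_csv('data\\interactive_labeling_dataset.csv',
                 delimiter='|',
                 header=0,            # 'Uuid', 'Title', ..., 'Timestamp'
                 index_col=None,
                 engine='python',
                 quoting=csv.QUOTE_NONNUMERIC,
                 quotechar='\'')
print(len(df))                        # number of sampled articles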
@@ -69,14 +83,14 @@ class JsonHandler:
# reliable sources (site_sections)
site_sections = []
# read list from 'sections.txt' file
- with open('sections.txt', 'r') as s_list:
+ with open('data\\sections.txt', 'r') as s_list:
site_sections = s_list.read().split('\n')
# article counter
a = 0
- for m in JsonHandler.months:
+ for m in FileHandler.months:
# 1 output file per month
- output_file = 'all_{}.csv'.format(m)
+ output_file = 'data\\articles\\all_{}.csv'.format(m)
# path of input JSON files per month
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
@@ -146,7 +160,8 @@ class JsonHandler:
print('#')
print('# saved {} articles in total'.format(a))
print('#')
+ def join_all_csv_files():
if __name__ == '__main__':
- JsonHandler.write_articles_to_csv_files()
- #JsonHandler.create_labeling_dataset()
+ # FileHandler.write_articles_to_csv_files()
+ # FileHandler.create_labeling_dataset()

NER.py (13 changed lines)

@@ -5,15 +5,15 @@ Named Entity Recognition (NER)
Stanford NER takes a text as input and returns a list of entities
like persons, organizations and countries, e.g.
'''
+ import csv
import os
import matplotlib.pyplot as plt
import numpy as np
+ import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
- from CsvHandler import CsvHandler
class NER:
# toDo: complete lists:
@@ -118,8 +118,13 @@ class NER:
if __name__ == '__main__':
- filepath = 'classification_labelled_corrected.csv'
- df = CsvHandler.read_csv(filepath)
+ filepath = 'data\\classification_labelled_corrected.csv'
+ df = pd.read_csv(filepath,
+ sep='|',
+ engine='python',
+ decimal='.',
+ quotechar='\'',
+ quoting=csv.QUOTE_NONE)
# only articles with label==1
df_hits = df[df['Label'] == 1]
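From here the articles with Label == 1 would be passed to the Stanford tagger imported above. A minimal sketch of that step, assuming the hit articles carry a 'Title' column like the other CSVs in this commit and using placeholder paths for the NER model and jar (the real paths are not part of this diff):

# hypothetical sketch: tag the title of each hit article
tagger = StanfordNERTagger(
    'english.all.3class.distsim.crf.ser.gz',   # placeholder model path
    'stanford-ner.jar')                        # placeholder jar path

for title in df_hits['Title']:
    tokens = word_tokenize(title)
    # list of (token, tag) pairs, e.g. ('Siemens', 'ORGANIZATION')
    print(tagger.tag(tokens))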

@@ -177,7 +177,7 @@ class NaiveBayes:
print('# starting naive bayes')
print('# ...')
- file = 'classification_labelled_corrected.csv'
+ file = 'data\\classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')

@@ -153,7 +153,7 @@ class NaiveBayes_Interactive:
print('# starting naive bayes')
print('# ...')
- file = 'classification_labelled_corrected.csv'
+ file = 'data\\classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')

SVM.py (2 changed lines)

@@ -91,7 +91,7 @@ class SVM:
print('# starting svm')
print('# ...')
- file = 'classification_labelled_corrected.csv'
+ file = 'data\\classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')

New data files added (file diffs suppressed because the lines are too long):

data/articles/all_01.csv (4514 lines)
data/articles/all_02.csv (3853 lines)
data/articles/all_03.csv (4299 lines)
data/articles/all_04.csv (3312 lines)
data/articles/all_05.csv (4127 lines)
data/articles/all_06.csv (3388 lines)
data/articles/all_07.csv (2372 lines)
data/articles/all_08.csv (2981 lines)
data/articles/all_09.csv (3296 lines)
data/articles/all_10.csv (3491 lines)
data/articles/all_11.csv (3391 lines)
data/articles/all_12.csv (2777 lines)
