From b648ab70d26a34c3ece4f1536cb17186573658d8 Mon Sep 17 00:00:00 2001
From: Doris Birkefeld <doris.birkefeld@DEDBIRKEFELD02M.local>
Date: Thu, 18 Oct 2018 09:22:51 +0200
Subject: [PATCH] Code Kommentare von Doris

---
 BagOfWords.py        | 1 +
 FilterKeywords.py    | 1 +
 NER.py               | 1 +
 NaiveBayes.py        | 7 ++++++-
 NaiveBayes_simple.py | 2 ++
 Requester.py         | 4 +++-
 6 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/BagOfWords.py b/BagOfWords.py
index 681db3c..3d63b9c 100644
--- a/BagOfWords.py
+++ b/BagOfWords.py
@@ -106,6 +106,7 @@ class BagOfWords:
         print('# ...')
         vocab = set()
         for text in series:
+            ## DORIS: DIE FOLGENDE ZEILE VERSTEH ICH NICHT
             vocab |= set(BagOfWords.extract_words(text))
         # transform to list
         vocab = list(vocab)
diff --git a/FilterKeywords.py b/FilterKeywords.py
index 20e550e..1989a9f 100644
--- a/FilterKeywords.py
+++ b/FilterKeywords.py
@@ -54,6 +54,7 @@ class FilterKeywords:
             for kword in keywords:
                 if re.match(kword, key):
                     # if match, increase value of matching key
+                    ## DORIS: Hier könntest du ein defaultdict verwenden, https://www.accelebrate.com/blog/using-defaultdict-python/ 
                     if str(kword) in dict_keywords:
                         dict_keywords[str(kword)] += dict_input[key]
                     else:
diff --git a/NER.py b/NER.py
index 320c7b9..c7562d5 100644
--- a/NER.py
+++ b/NER.py
@@ -29,6 +29,7 @@ class NER:
             'NYSE']
 
     def tag_words(text):
+        ## DORIS: Kannst du die StanfordNER-Dateien (Classifier und stanford-ner.jar) auch ins Repository in den Projektordner schieben, damit ich das laufen lassen kann?
         stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
         stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
         # create tagger object
diff --git a/NaiveBayes.py b/NaiveBayes.py
index 3829b26..f4834c6 100644
--- a/NaiveBayes.py
+++ b/NaiveBayes.py
@@ -75,6 +75,8 @@ class NaiveBayes:
             # # apply select percentile
             # selector = SelectPercentile(percentile=25)
             # selector.fit(training_data, y[train])
+            
+            ##DORIS: WIRD SELECT PERCENTILE IN DEINE ARBEIT MIT NB EINBEZOGEN?
 
             # training_data_r = selector.transform(training_data)
             # testing_data_r = selector.transform(testing_data)
@@ -95,6 +97,7 @@ class NaiveBayes:
             rec = recall_score(y[test], predictions_test)
             print('rec: ' + str(rec))
             recall_scores.append(rec)
+            ##DORIS: PRECISION MUSST DU (WIE RECALL) AUCH AUF DEN TESTDATEN MESSEN, ALSO MIT y[test] UND predictions_test, NICHT MIT DEN TRAININGSDATEN!!!
             prec = precision_score(y[train], predictions_train)
             print('prec: ' + str(prec))
             print('#')
@@ -186,7 +189,9 @@ class NaiveBayes:
         # read csv file
         print('# reading dataset')
         print('# ...')
-
+        
+        ## DORIS: ICH VERSTEHE NICHT, WARUM DU HIER EINE EXTRA FUNKTION SCHREIBST, PD.READ_CSV MÜSSTE DOCH AUCH SO GEHEN?
+        ## KOMMT VIELLEICHT NOCH, VIELLEICHT BIN ICH ZU VORSCHNELL
         dataset = CsvHandler.read_csv(file)
 
         make_naive_bayes(dataset)
diff --git a/NaiveBayes_simple.py b/NaiveBayes_simple.py
index 0e2532d..50473fd 100644
--- a/NaiveBayes_simple.py
+++ b/NaiveBayes_simple.py
@@ -31,6 +31,7 @@ class NaiveBayes_simple:
 
         cv = CountVectorizer()
 
+        ##DORIS: DU BRAUCHST IMMER EINEN STRATIFIED SPLIT, WEIL DEIN DATASET UNBALANCED IST
         # k-fold cross-validation as split method
         kf = KFold(n_splits=10, shuffle=True, random_state=5)
 
@@ -69,6 +70,7 @@ class NaiveBayes_simple:
             rec = recall_score(y[test], predictions_test)
             print('rec: ' + str(rec))
             recall_scores.append(rec)
+            ##DORIS: PRECISION MUSST DU (WIE RECALL) AUCH AUF DEN TESTDATEN MESSEN, ALSO MIT y[test] UND predictions_test, NICHT MIT DEN TRAININGSDATEN!!!
             prec = precision_score(y[train], predictions_train)
             print('prec: ' + str(prec))
             print('#')
diff --git a/Requester.py b/Requester.py
index cdd4448..782cc14 100644
--- a/Requester.py
+++ b/Requester.py
@@ -84,10 +84,12 @@ class Requester:
                     article.append(section)
                     # add article to list
                     list_articles.append(article)
+                    ## DORIS: WARUM SCHREIBST DU ES NICHT DIREKT IN EINE CSV, SONDERN KONVERTIERST NOCHMAL?
 
             # Get the next batch of 100 posts
             output = webhoseio.get_next()
-
+        
+        
         # create DataFrame
         df = pd.DataFrame(data=list_articles,
                           columns=['Timestamp', 'Title', 'Text', 'SiteSection'])