corrected calculation of precision

2018-10-19 10:28:26 +02:00 · 2018-10-19 10:28:26 +02:00 · cbfbdffdb7
commit cbfbdffdb7
parent 3406d3e975
4 changed files with 20 additions and 24 deletions
--- a/JSONHandler.py
+++ b/JSONHandler.py
@ -24,9 +24,7 @@ class JsonHandler:
        n number of items to select randomly,
        returns new DataFrame with only selected items
        '''
-        
-        ## df.sample(n=5, random_state=42) gibt dir 5 zufallswerte, ist das das, was du suchst?
-        
+
        # initialize random => reproducible sequence
        np.random.seed(5)
        # add new column 'Random'
@ -56,7 +54,9 @@ class JsonHandler:

    def write_articles_to_csv(file_name):
        # path of JSON files
-        path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'
+        path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
+               '\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4'\
+               '\\news_[0-9]*.json'
        files = glob.glob(path)

        # reliable sources (site_sections)
@ -104,24 +104,25 @@ class JsonHandler:
                        continue
                    # pick only relevant information of article
                    # and put it in a list
-                    article = [dict['thread']['uuid'],         # 0:'Uuid'
-                               dict['thread']['title'],        # 1:'Title'
-                               dict['text'],                   # 2:'Text'
-                               dict['thread']['site'],         # 3:'Site'
-                               dict['thread']['site_section'], # 4:'SiteSection'
-                               dict['url'],                    # 5:'Url'
-                               dict['published']]              # 6:'Timestamp'
+                    article = [dict['thread']['uuid'],        # 0:'Uuid'
+                               dict['thread']['title'],       # 1:'Title'
+                               dict['text'],                  # 2:'Text'
+                               dict['thread']['site'],        # 3:'Site'
+                               dict['thread']['site_section'],# 4:'SiteSection'
+                               dict['url'],                   # 5:'Url'
+                               dict['published']]             # 6:'Timestamp'

-                    # remove newlines and delimiter char
-                    article[1] = article[1].replace('|', '-') # in 'Title'
-                    article[2] = article[2].replace('\n', ' ').replace('\r', ' ').replace('|', '-') # in 'Text'
+                    # remove newlines and delimiter chars
+                    article[1] = article[1].replace('|', '-')
+                    article[2] = article[2].replace('\n', ' ')\
+                                 .replace('\r', ' ').replace('|', '-')

                    try:
                        writer.writerow(article)
                        a += 1
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
-                        print('# filtered out site_section: {} (UnicodeEncodeError)'
+                        print('# filtered out: {} (UnicodeEncodeError)'
                                    .format(dict['thread']['site_section']))
        print()
        print('# saved {} articles in file {}'.format(a, file_name))
--- a/NaiveBayes.py
+++ b/NaiveBayes.py
@ -91,9 +91,7 @@ class NaiveBayes:
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
-            ##DORIS: PRECISION MISST DU AUCH MIT DEN TEST SCORES!!!
-            ## Hier auch trainings- gegen testwerte tauschen
-            prec = precision_score(y[train], predictions_train)
+            prec = precision_score(y[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
--- a/NaiveBayes_Interactive.py
+++ b/NaiveBayes_Interactive.py
@ -67,10 +67,7 @@ class NaiveBayes_Interactive:
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
-            ##DORIS: PRECISION MISST DU AUCH MIT DEN TEST SCORES!!!
-            ## DU hast in der Zeile untendrunter y[train] und predicitons_train verwendet, du brauchst aber y[test] etc.,
-            ## da precision ja nur ein anderes maß als recall ist, es muss aber genauso mit den testwerten berechnet werden
-            prec = precision_score(y[train], predictions_train)
+            prec = precision_score(y[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
--- a/README.md
+++ b/README.md
@ -12,8 +12,8 @@ Best parameters set found on development set:
 {'SVC\__C': 0.1, 'SVC\__gamma': 0.01, 'SVC\__kernel': 'linear', 'perc\__percentile': 50}

 * **Naive Bayes Classifier**:  
-F1 score: 0.832 (average)  
-Parameters: SelectPercentile(25), own Bag of Words implementation, 10-fold cross validation  
+F1 score: 0.841 (average)  
+Parameters: SelectPercentile(100), own Bag of Words implementation, 10-fold cross validation  

 The complete documentation can be found in the LaTeX document in the *thesis* folder.