diff --git a/JSONHandler.py b/JSONHandler.py index 855e0fc..dc253a4 100644 --- a/JSONHandler.py +++ b/JSONHandler.py @@ -24,9 +24,7 @@ class JsonHandler: n number of items to select randomly, returns new DataFrame with only selected items ''' - - ## df.sample(n=5, random_state=42) gibt dir 5 zufallswerte, ist das das, was du suchst? - + # initialize random => reproducible sequence np.random.seed(5) # add new column 'Random' @@ -56,7 +54,9 @@ class JsonHandler: def write_articles_to_csv(file_name): # path of JSON files - path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json' + path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\ + '\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4'\ + '\\news_[0-9]*.json' files = glob.glob(path) # reliable sources (site_sections) @@ -104,24 +104,25 @@ class JsonHandler: continue # pick only relevant information of article # and put in in list - article = [dict['thread']['uuid'], # 0:'Uuid' - dict['thread']['title'], # 1:'Title' - dict['text'], # 2:'Text' - dict['thread']['site'], # 3:'Site' - dict['thread']['site_section'], # 4:'SiteSection' - dict['url'], # 5:'Url' - dict['published']] # 6:'Timestamp' + article = [dict['thread']['uuid'], # 0:'Uuid' + dict['thread']['title'], # 1:'Title' + dict['text'], # 2:'Text' + dict['thread']['site'], # 3:'Site' + dict['thread']['site_section'],# 4:'SiteSection' + dict['url'], # 5:'Url' + dict['published']] # 6:'Timestamp' - # remove newlines and delimiter char - article[1] = article[1].replace('|', '-') # in 'Title' - article[2] = article[2].replace('\n', ' ').replace('\r', ' ').replace('|', '-') # in 'Text' + # remove newlines and delimiter chars + article[1] = article[1].replace('|', '-') + article[2] = article[2].replace('\n', ' ')\ + .replace('\r', ' ').replace('|', '-') try: writer.writerow(article) a += 1 # handle undefined characters (videos and other spam) except UnicodeEncodeError: - print('# filtered out site_section: {} (UnicodeEncodeError)' + print('# filtered out: {} (UnicodeEncodeError)' .format(dict['thread']['site_section'])) print() print('# saved {} articles in file {}'.format(a, file_name)) diff --git a/NaiveBayes.py b/NaiveBayes.py index a142617..41bb5e4 100644 --- a/NaiveBayes.py +++ b/NaiveBayes.py @@ -91,9 +91,7 @@ class NaiveBayes: rec = recall_score(y[test], predictions_test) print('rec: ' + str(rec)) recall_scores.append(rec) - ##DORIS: PRECISION MISST DU AUCH MIT DEN TEST SCORES!!! - ## Hier auch trainings- gegen testwerte tauschen - prec = precision_score(y[train], predictions_train) + prec = precision_score(y[test], predictions_test) print('prec: ' + str(prec)) print('#') precision_scores.append(prec) diff --git a/NaiveBayes_Interactive.py b/NaiveBayes_Interactive.py index 6bad450..97216e6 100644 --- a/NaiveBayes_Interactive.py +++ b/NaiveBayes_Interactive.py @@ -67,10 +67,7 @@ class NaiveBayes_Interactive: rec = recall_score(y[test], predictions_test) print('rec: ' + str(rec)) recall_scores.append(rec) - ##DORIS: PRECISION MISST DU AUCH MIT DEN TEST SCORES!!! - ## DU hast in der Zeile untendrunter y[train] und predicitons_train verwendet, du brauchst aber y[test] etc., - ## da precision ja nur ein anderes maß als recall ist, es muss aber genauso mit den testwerten berechnet werden - prec = precision_score(y[train], predictions_train) + prec = precision_score(y[test], predictions_test) print('prec: ' + str(prec)) print('#') precision_scores.append(prec) diff --git a/README.md b/README.md index 7c89db7..c1a5ab6 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,8 @@ Best parameters set found on development set: {'SVC\__C': 0.1, 'SVC\__gamma': 0.01, 'SVC\__kernel': 'linear', 'perc\__percentile': 50} * **Naive Bayes Classifier**: -F1 score: 0.832 (average) -Parameters: SelectPercentile(25), own Bag of Words implementation, 10-fold cross validation +F1 score: 0.841 (average) +Parameters: SelectPercentile(100), own Bag of Words implementation, 10-fold cross validation The complete documentation can be found in the latex document in the *thesis* folder.