corrected calculation of precision

This commit is contained in:
Anne Lorenz 2018-10-19 10:28:26 +02:00
parent 3406d3e975
commit cbfbdffdb7
4 changed files with 20 additions and 24 deletions

View File

@ -24,9 +24,7 @@ class JsonHandler:
n number of items to select randomly,
returns new DataFrame with only selected items
'''
## df.sample(n=5, random_state=42) gives you 5 random rows; is that what you are looking for?
# initialize random => reproducible sequence
np.random.seed(5)
# add new column 'Random'
@ -56,7 +54,9 @@ class JsonHandler:
def write_articles_to_csv(file_name):
# path of JSON files
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
'\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4'\
'\\news_[0-9]*.json'
files = glob.glob(path)
# reliable sources (site_sections)
@ -104,24 +104,25 @@ class JsonHandler:
continue
# pick only relevant information of article
# and put it in a list
article = [dict['thread']['uuid'], # 0:'Uuid'
dict['thread']['title'], # 1:'Title'
dict['text'], # 2:'Text'
dict['thread']['site'], # 3:'Site'
dict['thread']['site_section'], # 4:'SiteSection'
dict['url'], # 5:'Url'
dict['published']] # 6:'Timestamp'
article = [dict['thread']['uuid'], # 0:'Uuid'
dict['thread']['title'], # 1:'Title'
dict['text'], # 2:'Text'
dict['thread']['site'], # 3:'Site'
dict['thread']['site_section'],# 4:'SiteSection'
dict['url'], # 5:'Url'
dict['published']] # 6:'Timestamp'
# remove newlines and delimiter char
article[1] = article[1].replace('|', '-') # in 'Title'
article[2] = article[2].replace('\n', ' ').replace('\r', ' ').replace('|', '-') # in 'Text'
# remove newlines and delimiter chars
article[1] = article[1].replace('|', '-')
article[2] = article[2].replace('\n', ' ')\
.replace('\r', ' ').replace('|', '-')
try:
writer.writerow(article)
a += 1
# handle undefined characters (videos and other spam)
except UnicodeEncodeError:
print('# filtered out site_section: {} (UnicodeEncodeError)'
print('# filtered out: {} (UnicodeEncodeError)'
.format(dict['thread']['site_section']))
print()
print('# saved {} articles in file {}'.format(a, file_name))

View File

@ -91,9 +91,7 @@ class NaiveBayes:
rec = recall_score(y[test], predictions_test)
print('rec: ' + str(rec))
recall_scores.append(rec)
## DORIS: precision must also be measured on the test scores!
## Swap the training values for the test values here as well
prec = precision_score(y[train], predictions_train)
prec = precision_score(y[test], predictions_test)
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)

View File

@ -67,10 +67,7 @@ class NaiveBayes_Interactive:
rec = recall_score(y[test], predictions_test)
print('rec: ' + str(rec))
recall_scores.append(rec)
## DORIS: precision must also be measured on the test scores!
## In the line below you used y[train] and predictions_train, but you need y[test] etc.,
## because precision is just a different metric than recall; it must be computed on the test values in the same way
prec = precision_score(y[train], predictions_train)
prec = precision_score(y[test], predictions_test)
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)

View File

@ -12,8 +12,8 @@ Best parameters set found on development set:
{'SVC\__C': 0.1, 'SVC\__gamma': 0.01, 'SVC\__kernel': 'linear', 'perc\__percentile': 50}
* **Naive Bayes Classifier**:
F1 score: 0.832 (average)
Parameters: SelectPercentile(25), own Bag of Words implementation, 10-fold cross validation
F1 score: 0.841 (average)
Parameters: SelectPercentile(100), own Bag of Words implementation, 10-fold cross validation
The complete documentation can be found in the latex document in the *thesis* folder.