corrected calculation of precision
This commit is contained in:
parent
3406d3e975
commit
cbfbdffdb7
|
@ -24,9 +24,7 @@ class JsonHandler:
|
|||
n number of items to select randomly,
|
||||
returns new DataFrame with only selected items
|
||||
'''
|
||||
|
||||
## df.sample(n=5, random_state=42) gibt dir 5 zufallswerte, ist das das, was du suchst?
|
||||
|
||||
|
||||
# initialize random => reproducible sequence
|
||||
np.random.seed(5)
|
||||
# add new column 'Random'
|
||||
|
@ -56,7 +54,9 @@ class JsonHandler:
|
|||
|
||||
def write_articles_to_csv(file_name):
|
||||
# path of JSON files
|
||||
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'
|
||||
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
|
||||
'\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4'\
|
||||
'\\news_[0-9]*.json'
|
||||
files = glob.glob(path)
|
||||
|
||||
# reliable sources (site_sections)
|
||||
|
@ -104,24 +104,25 @@ class JsonHandler:
|
|||
continue
|
||||
# pick only relevant information of article
|
||||
# and put it in a list
|
||||
article = [dict['thread']['uuid'], # 0:'Uuid'
|
||||
dict['thread']['title'], # 1:'Title'
|
||||
dict['text'], # 2:'Text'
|
||||
dict['thread']['site'], # 3:'Site'
|
||||
dict['thread']['site_section'], # 4:'SiteSection'
|
||||
dict['url'], # 5:'Url'
|
||||
dict['published']] # 6:'Timestamp'
|
||||
article = [dict['thread']['uuid'], # 0:'Uuid'
|
||||
dict['thread']['title'], # 1:'Title'
|
||||
dict['text'], # 2:'Text'
|
||||
dict['thread']['site'], # 3:'Site'
|
||||
dict['thread']['site_section'],# 4:'SiteSection'
|
||||
dict['url'], # 5:'Url'
|
||||
dict['published']] # 6:'Timestamp'
|
||||
|
||||
# remove newlines and delimiter char
|
||||
article[1] = article[1].replace('|', '-') # in 'Title'
|
||||
article[2] = article[2].replace('\n', ' ').replace('\r', ' ').replace('|', '-') # in 'Text'
|
||||
# remove newlines and delimiter chars
|
||||
article[1] = article[1].replace('|', '-')
|
||||
article[2] = article[2].replace('\n', ' ')\
|
||||
.replace('\r', ' ').replace('|', '-')
|
||||
|
||||
try:
|
||||
writer.writerow(article)
|
||||
a += 1
|
||||
# handle undefined characters (videos and other spam)
|
||||
except UnicodeEncodeError:
|
||||
print('# filtered out site_section: {} (UnicodeEncodeError)'
|
||||
print('# filtered out: {} (UnicodeEncodeError)'
|
||||
.format(dict['thread']['site_section']))
|
||||
print()
|
||||
print('# saved {} articles in file {}'.format(a, file_name))
|
||||
|
|
|
@ -91,9 +91,7 @@ class NaiveBayes:
|
|||
rec = recall_score(y[test], predictions_test)
|
||||
print('rec: ' + str(rec))
|
||||
recall_scores.append(rec)
|
||||
##DORIS: PRECISION MISST DU AUCH MIT DEN TEST SCORES!!!
|
||||
## Hier auch trainings- gegen testwerte tauschen
|
||||
prec = precision_score(y[train], predictions_train)
|
||||
prec = precision_score(y[test], predictions_test)
|
||||
print('prec: ' + str(prec))
|
||||
print('#')
|
||||
precision_scores.append(prec)
|
||||
|
|
|
@ -67,10 +67,7 @@ class NaiveBayes_Interactive:
|
|||
rec = recall_score(y[test], predictions_test)
|
||||
print('rec: ' + str(rec))
|
||||
recall_scores.append(rec)
|
||||
##DORIS: PRECISION MISST DU AUCH MIT DEN TEST SCORES!!!
|
||||
## DU hast in der Zeile untendrunter y[train] und predicitons_train verwendet, du brauchst aber y[test] etc.,
|
||||
## da precision ja nur ein anderes maß als recall ist, es muss aber genauso mit den testwerten berechnet werden
|
||||
prec = precision_score(y[train], predictions_train)
|
||||
prec = precision_score(y[test], predictions_test)
|
||||
print('prec: ' + str(prec))
|
||||
print('#')
|
||||
precision_scores.append(prec)
|
||||
|
|
|
@ -12,8 +12,8 @@ Best parameters set found on development set:
|
|||
{'SVC\__C': 0.1, 'SVC\__gamma': 0.01, 'SVC\__kernel': 'linear', 'perc\__percentile': 50}
|
||||
|
||||
* **Naive Bayes Classifier**:
|
||||
F1 score: 0.832 (average)
|
||||
Parameters: SelectPercentile(25), own Bag of Words implementation, 10-fold cross validation
|
||||
F1 score: 0.841 (average)
|
||||
Parameters: SelectPercentile(100), own Bag of Words implementation, 10-fold cross validation
|
||||
|
||||
The complete documentation can be found in the LaTeX document in the *thesis* folder.
|
||||
|
||||
|
|
Loading…
Reference in New Issue