corrected calculation of precision
This commit is contained in:
parent
3406d3e975
commit
cbfbdffdb7
|
@ -25,8 +25,6 @@ class JsonHandler:
|
||||||
returns new DataFrame with only selected items
|
returns new DataFrame with only selected items
|
||||||
'''
|
'''
|
||||||
|
|
||||||
## df.sample(n=5, random_state=42) gibt dir 5 zufallswerte, ist das das, was du suchst?
|
|
||||||
|
|
||||||
# initialize random => reproducible sequence
|
# initialize random => reproducible sequence
|
||||||
np.random.seed(5)
|
np.random.seed(5)
|
||||||
# add new column 'Random'
|
# add new column 'Random'
|
||||||
|
@ -56,7 +54,9 @@ class JsonHandler:
|
||||||
|
|
||||||
def write_articles_to_csv(file_name):
|
def write_articles_to_csv(file_name):
|
||||||
# path of JSON files
|
# path of JSON files
|
||||||
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'
|
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
|
||||||
|
'\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4'\
|
||||||
|
'\\news_[0-9]*.json'
|
||||||
files = glob.glob(path)
|
files = glob.glob(path)
|
||||||
|
|
||||||
# reliable sources (site_sections)
|
# reliable sources (site_sections)
|
||||||
|
@ -104,24 +104,25 @@ class JsonHandler:
|
||||||
continue
|
continue
|
||||||
# pick only relevant information of article
|
# pick only relevant information of article
|
||||||
# and put it in a list
|
# and put it in a list
|
||||||
article = [dict['thread']['uuid'], # 0:'Uuid'
|
article = [dict['thread']['uuid'], # 0:'Uuid'
|
||||||
dict['thread']['title'], # 1:'Title'
|
dict['thread']['title'], # 1:'Title'
|
||||||
dict['text'], # 2:'Text'
|
dict['text'], # 2:'Text'
|
||||||
dict['thread']['site'], # 3:'Site'
|
dict['thread']['site'], # 3:'Site'
|
||||||
dict['thread']['site_section'], # 4:'SiteSection'
|
dict['thread']['site_section'],# 4:'SiteSection'
|
||||||
dict['url'], # 5:'Url'
|
dict['url'], # 5:'Url'
|
||||||
dict['published']] # 6:'Timestamp'
|
dict['published']] # 6:'Timestamp'
|
||||||
|
|
||||||
# remove newlines and delimiter char
|
# remove newlines and delimiter chars
|
||||||
article[1] = article[1].replace('|', '-') # in 'Title'
|
article[1] = article[1].replace('|', '-')
|
||||||
article[2] = article[2].replace('\n', ' ').replace('\r', ' ').replace('|', '-') # in 'Text'
|
article[2] = article[2].replace('\n', ' ')\
|
||||||
|
.replace('\r', ' ').replace('|', '-')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
writer.writerow(article)
|
writer.writerow(article)
|
||||||
a += 1
|
a += 1
|
||||||
# handle undefined characters (videos and other spam)
|
# handle undefined characters (videos and other spam)
|
||||||
except UnicodeEncodeError:
|
except UnicodeEncodeError:
|
||||||
print('# filtered out site_section: {} (UnicodeEncodeError)'
|
print('# filtered out: {} (UnicodeEncodeError)'
|
||||||
.format(dict['thread']['site_section']))
|
.format(dict['thread']['site_section']))
|
||||||
print()
|
print()
|
||||||
print('# saved {} articles in file {}'.format(a, file_name))
|
print('# saved {} articles in file {}'.format(a, file_name))
|
||||||
|
|
|
@ -91,9 +91,7 @@ class NaiveBayes:
|
||||||
rec = recall_score(y[test], predictions_test)
|
rec = recall_score(y[test], predictions_test)
|
||||||
print('rec: ' + str(rec))
|
print('rec: ' + str(rec))
|
||||||
recall_scores.append(rec)
|
recall_scores.append(rec)
|
||||||
##DORIS: PRECISION MISST DU AUCH MIT DEN TEST SCORES!!!
|
prec = precision_score(y[test], predictions_test)
|
||||||
## Hier auch trainings- gegen testwerte tauschen
|
|
||||||
prec = precision_score(y[train], predictions_train)
|
|
||||||
print('prec: ' + str(prec))
|
print('prec: ' + str(prec))
|
||||||
print('#')
|
print('#')
|
||||||
precision_scores.append(prec)
|
precision_scores.append(prec)
|
||||||
|
|
|
@ -67,10 +67,7 @@ class NaiveBayes_Interactive:
|
||||||
rec = recall_score(y[test], predictions_test)
|
rec = recall_score(y[test], predictions_test)
|
||||||
print('rec: ' + str(rec))
|
print('rec: ' + str(rec))
|
||||||
recall_scores.append(rec)
|
recall_scores.append(rec)
|
||||||
##DORIS: PRECISION MISST DU AUCH MIT DEN TEST SCORES!!!
|
prec = precision_score(y[test], predictions_test)
|
||||||
## DU hast in der Zeile untendrunter y[train] und predicitons_train verwendet, du brauchst aber y[test] etc.,
|
|
||||||
## da precision ja nur ein anderes maß als recall ist, es muss aber genauso mit den testwerten berechnet werden
|
|
||||||
prec = precision_score(y[train], predictions_train)
|
|
||||||
print('prec: ' + str(prec))
|
print('prec: ' + str(prec))
|
||||||
print('#')
|
print('#')
|
||||||
precision_scores.append(prec)
|
precision_scores.append(prec)
|
||||||
|
|
|
@ -12,8 +12,8 @@ Best parameters set found on development set:
|
||||||
{'SVC\__C': 0.1, 'SVC\__gamma': 0.01, 'SVC\__kernel': 'linear', 'perc\__percentile': 50}
|
{'SVC\__C': 0.1, 'SVC\__gamma': 0.01, 'SVC\__kernel': 'linear', 'perc\__percentile': 50}
|
||||||
|
|
||||||
* **Naive Bayes Classifier**:
|
* **Naive Bayes Classifier**:
|
||||||
F1 score: 0.832 (average)
|
F1 score: 0.841 (average)
|
||||||
Parameters: SelectPercentile(25), own Bag of Words implementation, 10-fold cross validation
|
Parameters: SelectPercentile(100), own Bag of Words implementation, 10-fold cross validation
|
||||||
|
|
||||||
The complete documentation can be found in the LaTeX document in the *thesis* folder.
|
The complete documentation can be found in the LaTeX document in the *thesis* folder.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue