evaluation interactive labeling: update

This commit is contained in:
annealias 2019-03-07 04:53:54 +01:00
parent a2c7a7279e
commit b2aa2df0f7
7 changed files with 2186 additions and 270 deletions

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -56,7 +56,7 @@ class LabelingPlotter():
def plot_cumulative():
# load pickle object
with open('../obj/array_class_probs_round_11.pkl', 'rb') as input:
with open('../obj/array_class_probs_stratified_round_9.pkl', 'rb') as input:
list = pickle.load(input)
# sort list in descending order
@ -86,11 +86,12 @@ class LabelingPlotter():
ax.set_xlabel('Highest estimated probability')
ax.set_ylabel('Fraction of articles with this highest estimated probability')
#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
plt.axis([0.5, 1, 0, 0.015]) # round 9 stratified
#plt.axis([0.65, 1, 0, 0.003]) # round 10
plt.axis([0.7, 1, 0, 0.002]) # round 11
#plt.axis([0.7, 1, 0, 0.002]) # round 11
#ax.set_xbound(lower=0.5, upper=0.99)
plt.savefig('..\\visualization\\proba_round_11.png')
plt.savefig('..\\visualization\\proba_round_11.eps')
plt.savefig('..\\visualization\\proba_stratified_round_9.png')
plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
plt.show()

View File

@ -25,7 +25,11 @@ class MultinomialNaiveBayes:
# split data into text and label set
# join title and text
X = dataset['Title'] + '. ' + dataset['Text']
print(X[:12])
y = dataset['Label']
print(y[:12])
if sklearn_cv:
cv = CountVectorizer()
@ -57,6 +61,12 @@ class MultinomialNaiveBayes:
if sklearn_cv:
# use sklearn CountVectorizer
# fit the training data and then return the matrix
print('Title + Text von train')
print(X[train])
print('Label von train')
print(y[train])
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()
@ -172,4 +182,24 @@ class MultinomialNaiveBayes:
print(y_test[i])
print()
#print metrics
print('F1 score: ', format(f1_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))
if __name__ == '__main__':
    # Script entry point: load the round-11 interactive-labeling data set
    # and run the multinomial naive Bayes evaluation on the labeled rows.
    print('# reading dataset')
    print('# ...')
    # read current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1, 13),  # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # select only labeled articles (rows whose 'Label' is not -1)
    labeled = df.loc[df['Label'] != -1]
    # debugging aids (number of labeled articles / first rows):
    # print('Number of labeled articles:')
    # print(len(labeled))
    # print(labeled[:5])

    # Renumber the rows 0..n-1. The previous bare `.reindex()` call returned
    # the frame with its filtered, non-contiguous index labels unchanged, so
    # the row labels no longer matched row positions.
    # NOTE(review): make_mnb indexes X[train] / X[test]; presumably these are
    # positional fold indices, which need a contiguous index -- confirm.
    labeled = labeled.reset_index(drop=True)

    MultinomialNaiveBayes.make_mnb(labeled, sklearn_cv=True, percentile=100)

View File

@ -1,7 +1,7 @@
%!PS-Adobe-3.0 EPSF-3.0
%%Title: ..\visualization\proba_round_11.eps
%%Creator: matplotlib version 3.0.2, http://matplotlib.org/
%%CreationDate: Thu Feb 21 14:11:04 2019
%%CreationDate: Tue Mar 5 08:51:48 2019
%%Orientation: portrait
%%BoundingBox: 18 252 594 540
%%EndComments

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB