evaluation interactive labeling: update

2019-03-07 04:53:54 +01:00 · 2019-03-07 04:53:54 +01:00 · b2aa2df0f7
commit b2aa2df0f7
parent a2c7a7279e
7 changed files with 2186 additions and 270 deletions
--- a/obj/array_class_probs_stratified_round_9.pkl
+++ b/obj/array_class_probs_stratified_round_9.pkl
--- a/src/2019-02-24-al-resubstitution-error.ipynb
+++ b/src/2019-02-24-al-resubstitution-error.ipynb
--- a/src/LabelingPlotter.py
+++ b/src/LabelingPlotter.py
@ -56,7 +56,7 @@ class LabelingPlotter():
 	def plot_cumulative():
 		# load pickle object
-		with open('../obj/array_class_probs_round_11.pkl', 'rb') as input:
+		with open('../obj/array_class_probs_stratified_round_9.pkl', 'rb') as input:
 			list = pickle.load(input)
 		# sort list in descending order
@ -86,11 +86,12 @@ class LabelingPlotter():
 		ax.set_xlabel('Highest estimated probability')
 		ax.set_ylabel('Fraction of articles with this highest estimated probability')
 		#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
 		plt.axis([0.5, 1, 0, 0.015]) # round 9 stratified
 		#plt.axis([0.65, 1, 0, 0.003]) # round 10
-		plt.axis([0.7, 1, 0, 0.002]) # round 11
+		#plt.axis([0.7, 1, 0, 0.002]) # round 11
 		#ax.set_xbound(lower=0.5, upper=0.99)
-		plt.savefig('..\\visualization\\proba_round_11.png')
+		plt.savefig('..\\visualization\\proba_stratified_round_9.png')
-		plt.savefig('..\\visualization\\proba_round_11.eps')
+		plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
 		plt.show()
--- a/src/MultinomialNaiveBayes.py
+++ b/src/MultinomialNaiveBayes.py
@ -25,7 +25,11 @@ class MultinomialNaiveBayes:
 		# split data into text and label set
 		# join title and text
 		X = dataset['Title'] + '. ' + dataset['Text']
 		print(X[:12])
 		y = dataset['Label']
 		print(y[:12])
 		if sklearn_cv:
 			cv = CountVectorizer()
@ -57,6 +61,12 @@ class MultinomialNaiveBayes:
 			if sklearn_cv:
 				# use sklearn CountVectorizer
 				# fit the training data and then return the matrix
 				print('Title + Text von train')
 				print(X[train])
 				print('Label von train')
 				print(y[train])
 				training_data = cv.fit_transform(X[train], y[train]).toarray()
 				# transform testing data and return the matrix
 				testing_data = cv.transform(X[test]).toarray()
@ -172,4 +182,24 @@ class MultinomialNaiveBayes:
 				print(y_test[i])
 				print()
 		#print metrics
-		print('F1 score: ', format(f1_score(y_test, predictions)))
+		print('F1 score: ', format(f1_score(y_test, predictions)))
 # Script entry point: load the round-11 interactive-labeling CSV and run
 # MultinomialNaiveBayes.make_mnb on the rows whose 'Label' is not -1.
 if __name__ == '__main__':
 	# read csv file
 	print('# reading dataset')
 	print('# ...')
 	# read current data set from csv
 	# (pipe-separated, fields quoted with single quotes; only columns 1-12 are loaded)
 	df = pd.read_csv('../data/interactive_labeling_round_11.csv',
 			  sep='|',
 			  usecols=range(1,13), # drop first column 'unnamed'
 			  encoding='utf-8',
 			  quoting=csv.QUOTE_NONNUMERIC,
 			  quotechar='\'')
 	# select only labeled articles (rows with Label == -1 are excluded)
 	# disabled debug output: number of labeled rows and a preview of the first five
 	#print('Anzahl aller gelabelten:')
 	#print(len(df.loc[df['Label'] != -1]))
 	#print(df.loc[df['Label'] != -1][:5])
 	# NOTE(review): reindex() is called without arguments, which looks like it
 	# leaves the row labels unchanged; if a fresh 0..n-1 index after filtering
 	# is needed by make_mnb, reset_index(drop=True) may have been intended - confirm.
 	MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reindex(), sklearn_cv=True, percentile=100)
--- a/visualization/proba_round_11.eps
+++ b/visualization/proba_round_11.eps
@ -1,7 +1,7 @@
 %!PS-Adobe-3.0 EPSF-3.0
 %%Title: ..\visualization\proba_round_11.eps
 %%Creator: matplotlib version 3.0.2, http://matplotlib.org/
-%%CreationDate: Thu Feb 21 14:11:04 2019
+%%CreationDate: Tue Mar  5 08:51:48 2019
 %%Orientation: portrait
 %%BoundingBox: 18 252 594 540
 %%EndComments
--- a/visualization/proba_stratified_round_9.eps
+++ b/visualization/proba_stratified_round_9.eps
--- a/visualization/proba_stratified_round_9.png
+++ b/visualization/proba_stratified_round_9.png