evaluation interactive labeling: update

2019-03-07 04:53:54 +01:00 · 2019-03-07 04:53:54 +01:00 · b2aa2df0f7
commit b2aa2df0f7
parent a2c7a7279e
7 changed files with 2186 additions and 270 deletions
--- a/obj/array_class_probs_stratified_round_9.pkl
+++ b/obj/array_class_probs_stratified_round_9.pkl
--- a/src/2019-02-24-al-resubstitution-error.ipynb
+++ b/src/2019-02-24-al-resubstitution-error.ipynb
--- a/src/LabelingPlotter.py
+++ b/src/LabelingPlotter.py
@@ -56,7 +56,7 @@ class LabelingPlotter():

 	def plot_cumulative():
 		# load pickle object
-		with open('../obj/array_class_probs_round_11.pkl', 'rb') as input:
+		with open('../obj/array_class_probs_stratified_round_9.pkl', 'rb') as input:
 			list = pickle.load(input)

 		# sort list in descending order
@@ -86,11 +86,12 @@ class LabelingPlotter():
 		ax.set_xlabel('Highest estimated probability')
 		ax.set_ylabel('Fraction of articles with this highest estimated probability')
 		#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
+		plt.axis([0.5, 1, 0, 0.015]) # round 9 stratified
 		#plt.axis([0.65, 1, 0, 0.003]) # round 10
-		plt.axis([0.7, 1, 0, 0.002]) # round 11
+		#plt.axis([0.7, 1, 0, 0.002]) # round 11
 		#ax.set_xbound(lower=0.5, upper=0.99)
-		plt.savefig('..\\visualization\\proba_round_11.png')
-		plt.savefig('..\\visualization\\proba_round_11.eps')
+		plt.savefig('..\\visualization\\proba_stratified_round_9.png')
+		plt.savefig('..\\visualization\\proba_stratified_round_9.eps')

 		plt.show()

--- a/src/MultinomialNaiveBayes.py
+++ b/src/MultinomialNaiveBayes.py
@@ -25,7 +25,11 @@ class MultinomialNaiveBayes:
 		# split data into text and label set
 		# join title and text
 		X = dataset['Title'] + '. ' + dataset['Text']
+		
+		print(X[:12])
+		
 		y = dataset['Label']
+		print(y[:12])

 		if sklearn_cv:
 			cv = CountVectorizer()
@@ -57,6 +61,12 @@ class MultinomialNaiveBayes:
 			if sklearn_cv:
 				# use sklearn CountVectorizer
 				# fit the training data and then return the matrix
+				print('Title + Text von train')
+				print(X[train])
+				
+				print('Label von train')
+				print(y[train])
+				
 				training_data = cv.fit_transform(X[train], y[train]).toarray()
 				# transform testing data and return the matrix
 				testing_data = cv.transform(X[test]).toarray()
@@ -172,4 +182,24 @@ class MultinomialNaiveBayes:
 				print(y_test[i])
 				print()
 		#print metrics
-		print('F1 score: ', format(f1_score(y_test, predictions)))
+		print('F1 score: ', format(f1_score(y_test, predictions)))
+
+if __name__ == '__main__':
+
+	# script entry point: print progress markers before loading the data set
+	print('# reading dataset')
+	print('# ...')
+
+	# load the round-11 interactive labeling CSV ('|'-separated, fields quoted with single quotes)
+	df = pd.read_csv('../data/interactive_labeling_round_11.csv',
+			  sep='|',
+			  usecols=range(1,13), # keep columns 1-12, skipping column 0 (the unnamed first column)
+			  encoding='utf-8',
+			  quoting=csv.QUOTE_NONNUMERIC,
+			  quotechar='\'')
+
+	# keep only labeled articles (rows whose 'Label' is not -1) and pass them to make_mnb
+	#print('Number of all labeled articles:')
+	#print(len(df.loc[df['Label'] != -1]))
+	#print(df.loc[df['Label'] != -1][:5])
+	MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reindex(), sklearn_cv=True, percentile=100)  # NOTE(review): reindex() without arguments appears to leave the index unchanged — confirm whether reset_index(drop=True) was intended
--- a/visualization/proba_round_11.eps
+++ b/visualization/proba_round_11.eps
@@ -1,7 +1,7 @@
 %!PS-Adobe-3.0 EPSF-3.0
 %%Title: ..\visualization\proba_round_11.eps
 %%Creator: matplotlib version 3.0.2, http://matplotlib.org/
-%%CreationDate: Thu Feb 21 14:11:04 2019
+%%CreationDate: Tue Mar  5 08:51:48 2019
 %%Orientation: portrait
 %%BoundingBox: 18 252 594 540
 %%EndComments
--- a/visualization/proba_stratified_round_9.eps
+++ b/visualization/proba_stratified_round_9.eps
--- a/visualization/proba_stratified_round_9.png
+++ b/visualization/proba_stratified_round_9.png