update labeling / documentation

2019-05-06 11:18:38 +02:00 · 2019-05-06 11:18:38 +02:00 · 7c3353edab
parent 8ddf23d801
commit 7c3353edab
46 changed files with 18153 additions and 51041 deletions
--- a/data/articles/all_01.csv
+++ b/data/articles/all_01.csv
--- a/data/articles/all_02.csv
+++ b/data/articles/all_02.csv
--- a/data/articles/all_03.csv
+++ b/data/articles/all_03.csv
--- a/data/articles/all_04.csv
+++ b/data/articles/all_04.csv
--- a/data/articles/all_05.csv
+++ b/data/articles/all_05.csv
--- a/data/articles/all_06.csv
+++ b/data/articles/all_06.csv
--- a/data/articles/all_07.csv
+++ b/data/articles/all_07.csv
--- a/data/articles/all_08.csv
+++ b/data/articles/all_08.csv
--- a/data/articles/all_09.csv
+++ b/data/articles/all_09.csv
--- a/data/articles/all_10.csv
+++ b/data/articles/all_10.csv
--- a/data/articles/all_11.csv
+++ b/data/articles/all_11.csv
--- a/data/articles/all_12.csv
+++ b/data/articles/all_12.csv
--- a/data/interactive_labeling_round_18_20190503.csv
+++ b/data/interactive_labeling_round_18_20190503.csv
--- a/obj/array_class_probs_round_15_svm_190502.pkl
+++ b/obj/array_class_probs_round_15_svm_190502.pkl
--- a/obj/array_class_probs_round_16_svm_190502_2.pkl
+++ b/obj/array_class_probs_round_16_svm_190502_2.pkl
--- a/obj/array_class_probs_round_17_svm_190502_3.pkl
+++ b/obj/array_class_probs_round_17_svm_190502_3.pkl
--- a/data/sections.txt
+++ b/data/sections.txt
--- a/src/2019-04-02-al-interactive-labeling-best-strategy.ipynb
+++ b/src/2019-04-02-al-interactive-labeling-best-strategy.ipynb
@ -146,46 +146,25 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'm' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[1;32m<ipython-input-4-9a40b379906c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mm\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[1;31mNameError\u001b[0m: name 'm' is not defined"
-     ]
-    }
-   ],
-   "source": [
-    "m"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
-    "m=15"
+    "m=16"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "This round number: 15\n",
-      "Number of manually labeled articles: 1122\n",
-      "Number of manually unlabeled articles: 8878\n"
+      "This round number: 16\n",
+      "Number of manually labeled articles: 1132\n",
+      "Number of manually unlabeled articles: 8868\n"
     ]
    }
   ],
@ -205,6 +184,24 @@
    "print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1082"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@ -214,14 +211,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "52\n"
+      "50\n"
     ]
    }
   ],
@ -242,16 +239,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "8878"
+       "0"
      ]
     },
-     "execution_count": 9,
+     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
--- a/src/2019-05-03-al-interactive-labeling-best-strategy.ipynb
+++ b/src/2019-05-03-al-interactive-labeling-best-strategy.ipynb
--- a/src/CosineSimilarity.py
+++ b/src/CosineSimilarity.py
@ -1,80 +0,0 @@
-'''
-Cosine Similarity
-=================
-
-CosineSimilarity measures the similarity between to articles.
-It calculates c: the cosine of the angle between the articles
-vectors text_1 and text_2.
-c = (text_1 * text_2) / (|text_1| * |text_2|).
-c = 1, if articles are equal => identicalness is 100%
-0 > c > 1, else => identicalness is (c*100)%
-(The greater c, the more similar two articles are.)
-'''
-from BagOfWords import BagOfWords
-
-import csv
-import math
-
-import pandas as pd
-
-class CosineSimilarity:
-
-    def calc_similarity(text_1, text_2, rel_freq=True, stemming=True):
-        ''' calculates cosine similarity of two input articles
-        '''
-        print('# calculating cosine similarity...')
-        print()
-
-        # extract words from articles
-        extracted_words_1 = BagOfWords.extract_words(text_1, stemming)
-        extracted_words_2 = BagOfWords.extract_words(text_2, stemming)
-        print(extracted_words_1)
-        print(extracted_words_2)
-
-        # insert words into vocab
-        both_extracted = []
-        both_extracted.append(extracted_words_1)
-        both_extracted.append(extracted_words_2)
-        vocab = BagOfWords.make_vocab(both_extracted, stemming)
-
-        # create vectors
-        matrix = BagOfWords.make_matrix(both_extracted, vocab,\
-                                          rel_freq, stemming)
-
-        # start calculation
-        # calculate numerator of formula
-        sum_1 = 0
-
-        for i in range (0,len(matrix.iloc[0])):
-            sum_1 += matrix.iloc[0][i] * matrix.iloc[1][i]
-
-        # calculate denominator of formula
-        sum_2 = 0
-
-        for entry in matrix.iloc[0]:
-            sum_2 += entry ** 2
-
-        sum_3 = 0
-        for entry in matrix.iloc[1]:
-            sum_3 += entry ** 2
-
-        return sum_1 / (math.sqrt(sum_2) * math.sqrt(sum_3))
-
-if __name__ == '__main__':
-        # read data set
-        file = '..\\data\\cleaned_data_set_without_header.csv'
-        df = pd.read_csv(file,
-                         delimiter='|',
-                         header=None,
-                         index_col=None,
-                         engine='python',
-                         usecols=[1,2],
-                         nrows=100,
-                         quoting=csv.QUOTE_NONNUMERIC,
-                         quotechar='\'')
-
-        texts = df[1] + '. ' + df[2]
-
-        # compare first and second article in data set
-        print(CosineSimilarity.calc_similarity(texts.iloc[0], texts.iloc[1],\
-                                            rel_freq=True, stemming=True))
--- a/src/DecisionTree.py
+++ b/src/DecisionTree.py
@ -15,7 +15,7 @@ from BagOfWords import BagOfWords
 import csv
 import operator

-import graphviz
+#import graphviz
 import numpy as np
 import pandas as pd
 from sklearn import tree
@ -26,7 +26,7 @@ from sklearn.model_selection import StratifiedKFold

 class DecisionTree:

-    def make_tree(dataset, sklearn_cv=False, stemming=False, percentile=100):
+    def make_tree(dataset, sklearn_cv=True, stemming=False, percentile=100):
        print('# fitting model')
        print('# ...')

@ -131,18 +131,18 @@ class DecisionTree:
        print('# starting decision tree')
        print('# ...')

-        file = '..\\data\\classification_labelled_corrected.csv'
+        file = '..\\data\\interactive_labeling_round_17_20190502.csv'

        # read csv file
        print('# reading dataset')
        print('# ...')

        data = pd.read_csv(file,
-                           sep='|',
-                           engine='python',
-                           decimal='.',
-                           quotechar='\'',
-                           quoting=csv.QUOTE_NONE)
+			  sep='|',
+			  usecols=range(1,13), # drop first column 'unnamed'
+			  encoding='utf-8',
+			  quoting=csv.QUOTE_NONNUMERIC,
+			  quotechar='\'')

        make_tree(data)

--- a/src/LabelPropagation.py
+++ b/src/LabelPropagation.py
@ -1,86 +0,0 @@
-'''
-Label Propagation Algorithm for Interactive Labeling
-====================================================
-
-Uses scikit learn's implementation of label propagation:
-Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled
-data with label propagation.
-(Technical Report CMU-CALD-02-107, Carnegie Mellon University, 2002.)
-
-Prints out probabilities for classes needed for interactive labeling.
-'''
-
-from BagOfWords import BagOfWords
-
-import pandas as pd
-from sklearn.feature_extraction.text import CountVectorizer
-
-from sklearn.metrics import recall_score, precision_score
-
-from sklearn.semi_supervised import label_propagation
-
-class LabelPropagation:
-
-	def propagate_labels(labeled_data, unlabeled_data, sklearn_cv=False):
-
-		print('# MNB: starting label propagation')
-
-		# assign algorithm
-		classifier = label_propagation.LabelSpreading()
-
-		# split labeled data into text and label set
-		# join title and text
-		X = labeled_data['Title'] + '. ' + labeled_data['Text']
-		y = labeled_data['Label']
-
-		# split unlabeled data into text and label set
-		# join title and text
-		U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']
-		l = unlabeled_data['Label']
-		
-		if sklearn_cv:
-				cv = CountVectorizer()
-
-		# probabilities of each class (of each fold)
-		class_probs = []
-
-		# number of training samples observed in each class 
-		class_counts = []
-
-		if sklearn_cv:
-			# fit the training data and then return the matrix
-			training_data = cv.fit_transform(X, y).toarray()
-			# transform testing data and return the matrix
-			testing_data = cv.transform(U).toarray()
-		else:
-			# use my own BagOfWords python implementation
-			stemming = True
-			rel_freq = False
-			extracted_words = BagOfWords.extract_all_words(X)
-			vocab = BagOfWords.make_vocab(extracted_words)
-
-			# fit the training data and then return the matrix
-			print('# MNB: fit training data and calculate matrix...')
-			print()
-			training_data = BagOfWords.make_matrix(extracted_words,
-								 vocab, rel_freq, stemming)
-
-			# transform testing data and return the matrix
-			print('# MNB: transform testing data to matrix...')
-			print()
-			extracted_words = BagOfWords.extract_all_words(U)
-			testing_data = BagOfWords.make_matrix(extracted_words,
-								 vocab, rel_freq, stemming)
-
-		#fit classifier
-		classifier.fit(training_data, y)
-		  
-		# probability estimates for the test vector (testing_data)
-		class_probs = classifier.predict_proba(testing_data)
-
-		predictions = classifier.predict(testing_data)
-
-		print('# MNB: ending label propagation')
-
-		# return vector of class estimates
-		return class_probs, predictions
--- a/src/LabelingPlotter.py
+++ b/src/LabelingPlotter.py
@ -139,7 +139,7 @@ class LabelingPlotter():

 	def plot_cumulative():
 		# load pickle object
-		with open('../obj/array_3model_svm_class2.pkl', 'rb') as input:
+		with open('../obj/array_class_probs_round_15_svm_190502.pkl', 'rb') as input:
 			list = pickle.load(input)

 		# sort list in descending order
@ -165,12 +165,12 @@ class LabelingPlotter():

 		#ax.grid(True)
 		#ax.legend(loc='right')
-		ax.set_title('Predictions class 2 (SVM)')
+		#ax.set_title('Predictions class 2 (SVM)')
 		# for iterations
 		#ax.set_xlabel('Highest estimated probability')
 		#ax.set_ylabel('Fraction of articles with this highest estimated probability')
 		# for 3-models
-		ax.set_xlabel('Estimated probability for class 2')
+		ax.set_xlabel('Estimated probabilities after iteration 14')
 		ax.set_ylabel('Fraction of articles with this probability')
 		#plt.axis([0.97, 1, 0.95, 1.01])
 		#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
@ -180,8 +180,8 @@ class LabelingPlotter():
 		#ax.set_xbound(lower=0.5, upper=0.99)
 		#plt.savefig('..\\visualization\\proba_stratified_round_9.png')
 		#plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
-		plt.savefig('..\\visualization\\3model_svm_class2.png')
-		plt.savefig('..\\visualization\\3model_svm_class2.eps')
+		#plt.savefig('..\\visualization\\3model_svm_class2.png')
+		#plt.savefig('..\\visualization\\3model_svm_class2.eps')

 		plt.show()

@ -211,5 +211,5 @@ class LabelingPlotter():

 if __name__ == '__main__':
    #LabelingPlotter.plot_correlation()
-	#LabelingPlotter.plot_cumulative()
-	LabelingPlotter.plot_labeling_rounds_naive()
+	LabelingPlotter.plot_cumulative()
+	#LabelingPlotter.plot_labeling_rounds_naive()
--- a/src/SVM.py
+++ b/src/SVM.py
@ -19,7 +19,7 @@ import csv
 import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
-from sklearn.metrics import f1_score, make_scorer
+from sklearn.metrics import f1_score, make_scorer, recall_score
 from sklearn.model_selection import StratifiedKFold
 from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline
@ -56,12 +56,12 @@ class SVM:

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

-        grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75],
+        grid = GridSearchCV(pipeline, {'perc__percentile': [100],
                            'SVC__kernel': ['linear'],
                            'SVC__gamma': [0.00001, 0.0001],
                            'SVC__C': [0.1, 1]},
                            cv=skf,
-                            scoring=make_scorer(f1_score))
+                            scoring=make_scorer(recall_score))

        print('# fit classifier')
        print('# ...')
--- a/src/ThreeModelApproach.py
+++ b/src/ThreeModelApproach.py
@ -0,0 +1,70 @@
+'''
+Three Model Approach
+====================
+
+Comparing Three Model Approach to MNB: fits model 1 (Gaussian naive
+Bayes on word counts) and prints recall/precision on a hold-out split.
+'''
+from BagOfWords import BagOfWords
+
+import csv
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectPercentile
+from sklearn.metrics import recall_score, precision_score
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import GaussianNB
+
+class ThreeModelApproach:
+
+	@staticmethod
+	def calc_model_1(labeled_data):
+		''' fits GaussianNB on a stratified 75/25 split of labeled_data
+		(columns 'Title', 'Text', 'Label') and prints recall/precision
+		'''
+		print('# MNB: starting interactive multinomial naives bayes...')
+		print()
+
+		# join title and text, split off the labels
+		X = labeled_data['Title'] + '. ' + labeled_data['Text']
+		y = labeled_data['Label']
+
+		cv = CountVectorizer()
+		classifier = GaussianNB()
+
+		# NOTE(review): no random_state => split differs on every run
+		X_train, X_test, y_train, y_test = train_test_split(
+			X, y, stratify=y, test_size=0.25)
+
+		# vocabulary is learned on the training part only;
+		# GaussianNB needs dense input, hence toarray()
+		training_data = cv.fit_transform(X_train, y_train).toarray()
+		testing_data = cv.transform(X_test).toarray()
+
+		classifier.fit(training_data, y_train)
+		predictions_test = classifier.predict(testing_data)
+
+		# NOTE(review): default average='binary' => presumably expects
+		# binary labels here, confirm against caller
+		rec = recall_score(y_test, predictions_test)
+		print('rec: ' + str(rec))
+
+		prec = precision_score(y_test, predictions_test)
+		print('prec: ' + str(prec))
+		print('#')
+
+if __name__ == '__main__':
+
+	file = '..\\data\\interactive_labeling_round_17_20190502.csv'
+
+	data = pd.read_csv(file,
+			  sep='|',
+			  usecols=range(1,13), # drop first column 'unnamed'
+			  encoding='utf-8',
+			  quoting=csv.QUOTE_NONNUMERIC,
+			  quotechar='\'')
+
+	# only manually labeled articles (Label != -1) are used
+	labeled = data.loc[data['Label'] != -1].reset_index(drop=True)
+	ThreeModelApproach.calc_model_1(labeled)
--- a/notebooks/2019-01-29-al-interactive-labeling.ipynb
+++ b/notebooks/2019-01-29-al-interactive-labeling.ipynb
--- a/notebooks/2019-02-04-al-label-propagation.ipynb
+++ b/notebooks/2019-02-04-al-label-propagation.ipynb
--- a/notebooks/2019-02-06-al-labeling-analysis.ipynb
+++ b/notebooks/2019-02-06-al-labeling-analysis.ipynb
--- a/notebooks/2019-02-11-interactive-labeling-analysis.ipynb
+++ b/notebooks/2019-02-11-interactive-labeling-analysis.ipynb
--- a/notebooks/2019-02-19-al-interactive-labeling-part2.ipynb
+++ b/notebooks/2019-02-19-al-interactive-labeling-part2.ipynb
--- a/notebooks/2019-02-19-al-neueRunden0-9.ipynb
+++ b/notebooks/2019-02-19-al-neueRunden0-9.ipynb
--- a/notebooks/2019-02-24-al-resubstitution-error.ipynb
+++ b/notebooks/2019-02-24-al-resubstitution-error.ipynb
@ -618,27 +618,27 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 158,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nachberechnung fürs Latex:\n",
-    "zero_0 = 80\n",
-    "zero_1 = 2\n",
-    "zero_2 = 14\n",
+    "zero_0 = 0\n",
+    "zero_1 = 0\n",
+    "zero_2 = 0\n",
    "\n",
-    "one_0 = 0\n",
-    "one_1 = 0\n",
-    "one_2 = 1\n",
+    "one_0 = 58\n",
+    "one_1 = 22\n",
+    "one_2 = 20\n",
    "\n",
    "two_0 = 0\n",
    "two_1 = 0\n",
-    "two_2 = 3"
+    "two_2 = 0"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 129,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
@ -650,108 +650,15 @@
     ]
    },
    {
-     "data": {
-      "text/plain": [
-       "68"
-      ]
-     },
-     "execution_count": 129,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "data": {
-      "text/plain": [
-       "0"
-      ]
-     },
-     "execution_count": 129,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "data": {
-      "text/plain": [
-       "6"
-      ]
-     },
-     "execution_count": 129,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/\n"
+     "ename": "NameError",
+     "evalue": "name 'testing_data' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-2-2e477f7d128e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'confusion matrix:'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'###############'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mzero_0\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Estimated'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m&\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Label'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m \u001b[0mzero_0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[0mzero_1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Estimated'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m&\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Label'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m 
\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'testing_data' is not defined"
     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "8"
-      ]
-     },
-     "execution_count": 129,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "data": {
-      "text/plain": [
-       "1"
-      ]
-     },
-     "execution_count": 129,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "data": {
-      "text/plain": [
-       "11"
-      ]
-     },
-     "execution_count": 129,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "4"
-      ]
-     },
-     "execution_count": 129,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "data": {
-      "text/plain": [
-       "1"
-      ]
-     },
-     "execution_count": 129,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "data": {
-      "text/plain": [
-       "1"
-      ]
-     },
-     "execution_count": 129,
-     "metadata": {},
-     "output_type": "execute_result"
    }
   ],
   "source": [
@ -782,7 +689,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 159,
+   "execution_count": 11,
   "metadata": {
    "scrolled": false
   },
@ -795,51 +702,51 @@
      "\n",
      "class 0:\n",
      "\n",
-      "TP: 80\n",
-      "TN: 4\n",
-      "FP: 16\n",
-      "FN: 0\n",
+      "TP: 0\n",
+      "TN: 42\n",
+      "FP: 0\n",
+      "FN: 58\n",
      "\n",
      "class 1:\n",
      "\n",
-      "TP: 0\n",
-      "TN: 97\n",
-      "FP: 1\n",
-      "FN: 2\n",
+      "TP: 22\n",
+      "TN: 0\n",
+      "FP: 78\n",
+      "FN: 0\n",
      "\n",
      "class 2:\n",
      "\n",
-      "TP: 3\n",
-      "TN: 82\n",
+      "TP: 0\n",
+      "TN: 80\n",
      "FP: 0\n",
-      "FN: 15\n",
+      "FN: 20\n",
      "###############\n",
      "\n",
      "METRICS:\n",
      "\n",
      "class 0:\n",
      "\n",
-      "precision: 83.33\n",
-      "recall: 100.0\n",
-      "accuracy: 84.0\n",
+      "precision: 0\n",
+      "recall: 0.0\n",
+      "accuracy: 42.0\n",
      "\n",
      "class 1:\n",
      "\n",
-      "precision: 0.0\n",
-      "recall: 0.0\n",
-      "accuracy: 97.0\n",
+      "precision: 22.0\n",
+      "recall: 100.0\n",
+      "accuracy: 22.0\n",
      "\n",
      "class 2:\n",
      "\n",
-      "precision: 100.0\n",
-      "recall: 16.67\n",
-      "accuracy: 85.0\n",
+      "precision: 0\n",
+      "recall: 0.0\n",
+      "accuracy: 80.0\n",
      "\n",
      "Average Metrics:\n",
      "\n",
-      "precision: 61.111111111111114\n",
-      "recall: 38.888888888888886\n",
-      "accuracy: 88.66666666666667\n"
+      "precision: 7.333333333333333\n",
+      "recall: 33.333333333333336\n",
+      "accuracy: 48.0\n"
     ]
    }
   ],
@ -885,7 +792,7 @@
    "print()\n",
    "print('class 0:')\n",
    "print()\n",
-    "prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
+    "prec_0 = tp_0 / (tp_0 + fp_0) * 100 if (tp_0 + fp_0) else 0\n",
    "print('precision: {}'.format(round(prec_0, 2)))\n",
    "rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
    "print('recall: {}'.format(round(rec_0, 2)))\n",
@ -903,7 +810,7 @@
    "print()\n",
    "print('class 2:')\n",
    "print()\n",
-    "prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
+    "prec_2 = tp_2 / (tp_2 + fp_2) * 100 if (tp_2 + fp_2) else 0\n",
    "print('precision: {}'.format(round(prec_2, 2)))\n",
    "rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
    "print('recall: {}'.format(round(rec_2, 2)))\n",
--- a/notebooks/2019-03-12-al-finding-the-best-model-new.ipynb
+++ b/notebooks/2019-03-12-al-finding-the-best-model-new.ipynb
--- a/notebooks/2019-03-12-al-finding-the-best-model.ipynb
+++ b/notebooks/2019-03-12-al-finding-the-best-model.ipynb
--- a/notebooks/2019-03-12-al-model-evaluation.ipynb
+++ b/notebooks/2019-03-12-al-model-evaluation.ipynb
--- a/notebooks/2019-04-20-interactive-labeling-demo-0.ipynb
+++ b/notebooks/2019-04-20-interactive-labeling-demo-0.ipynb
--- a/notebooks/2019-05-04-al-three-model-approach.ipynb
+++ b/notebooks/2019-05-04-al-three-model-approach.ipynb
@ -0,0 +1,713 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Last round number: 17\n",
+      "Number of manually labeled articles: 1412\n",
+      "Number of manually unlabeled articles: 8588\n"
+     ]
+    }
+   ],
+   "source": [
+    "import csv\n",
+    "import operator\n",
+    "import pickle\n",
+    "import random\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
+    "from sklearn.model_selection import StratifiedKFold\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "from sklearn.naive_bayes import MultinomialNB\n",
+    "\n",
+    "# initialize random => reproducible sequence\n",
+    "random.seed(5)\n",
+    "random_state=5\n",
+    "\n",
+    "# set up wider display area\n",
+    "pd.set_option('display.max_colwidth', -1)\n",
+    "\n",
+    "# read current data set from csv\n",
+    "df = pd.read_csv('../../data/interactive_labeling_round_17_20190502.csv',\n",
+    "          sep='|',\n",
+    "          usecols=range(1,13), # drop first column 'unnamed'\n",
+    "          encoding='utf-8',\n",
+    "          quoting=csv.QUOTE_NONNUMERIC,\n",
+    "          quotechar='\\'')\n",
+    "\n",
+    "# find current iteration/round number\n",
+    "m = int(df['Round'].max())\n",
+    "print('Last round number: {}'.format(m))\n",
+    "print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
+    "print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Roll the data set back to the state before round m: every article that was\n",
+    "# labeled in round m or later is treated as unlabeled again.\n",
+    "m = 10\n",
+    "# the mask has to be computed before 'Round' itself is overwritten\n",
+    "rolled_back = df['Round'] >= m\n",
+    "df.loc[rolled_back, 'Label'] = -1\n",
+    "df.loc[rolled_back, 'Round'] = np.nan\n",
+    "\n",
+    "# one pool of manually labeled articles per class\n",
+    "labeled_pos_0, labeled_pos_1, labeled_pos_2 = [\n",
+    "    df.loc[df['Label'] == label].reset_index(drop=True) for label in (0, 1, 2)\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "737\n",
+      "35\n",
+      "128\n",
+      "655\n",
+      "31\n",
+      "114\n",
+      "573\n",
+      "27\n",
+      "100\n",
+      "491\n",
+      "23\n",
+      "86\n",
+      "409\n",
+      "19\n",
+      "72\n",
+      "327\n",
+      "15\n",
+      "58\n",
+      "245\n",
+      "11\n",
+      "44\n",
+      "163\n",
+      "7\n",
+      "30\n",
+      "81\n",
+      "3\n",
+      "16\n",
+      "0\n",
+      "0\n",
+      "0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Split the labeled pool into ten disjoint, class-stratified test folds.\n",
+    "# Every fold is drawn without replacement and removed from the pool right away;\n",
+    "# folds 0-8 hold 82/4/14 articles of class 0/1/2, fold 9 holds 81/3/16.\n",
+    "fold_sizes = [(82, 4, 14)] * 9 + [(81, 3, 16)]\n",
+    "pools = [labeled_pos_0, labeled_pos_1, labeled_pos_2]\n",
+    "folds = []\n",
+    "for sizes in fold_sizes:\n",
+    "    fold = []\n",
+    "    for k, n in enumerate(sizes):\n",
+    "        drawn = pools[k].sample(n=n, replace=False, random_state=random_state)\n",
+    "        # drop the drawn articles so that no later fold can contain them\n",
+    "        pools[k] = pools[k].loc[~pools[k]['Index'].isin(drawn['Index'].tolist())]\n",
+    "        fold.append(drawn)\n",
+    "    folds.append(fold)\n",
+    "    # number of articles left per class after this fold\n",
+    "    for pool in pools:\n",
+    "        print(len(pool))\n",
+    "labeled_pos_0, labeled_pos_1, labeled_pos_2 = pools\n",
+    "\n",
+    "# per-fold names as used by the test set selection\n",
+    "sampling_0_class0, sampling_0_class1, sampling_0_class2 = folds[0]\n",
+    "sampling_1_class0, sampling_1_class1, sampling_1_class2 = folds[1]\n",
+    "sampling_2_class0, sampling_2_class1, sampling_2_class2 = folds[2]\n",
+    "sampling_3_class0, sampling_3_class1, sampling_3_class2 = folds[3]\n",
+    "sampling_4_class0, sampling_4_class1, sampling_4_class2 = folds[4]\n",
+    "sampling_5_class0, sampling_5_class1, sampling_5_class2 = folds[5]\n",
+    "sampling_6_class0, sampling_6_class1, sampling_6_class2 = folds[6]\n",
+    "sampling_7_class0, sampling_7_class1, sampling_7_class2 = folds[7]\n",
+    "sampling_8_class0, sampling_8_class1, sampling_8_class2 = folds[8]\n",
+    "sampling_9_class0, sampling_9_class1, sampling_9_class2 = folds[9]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 238,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TESTING DATA\n",
+    "#testing_data = pd.concat([sampling_0_class0, sampling_0_class1, sampling_0_class2])\n",
+    "#testing_data = pd.concat([sampling_1_class0, sampling_1_class1, sampling_1_class2])\n",
+    "#testing_data = pd.concat([sampling_2_class0, sampling_2_class1, sampling_2_class2])\n",
+    "#testing_data = pd.concat([sampling_3_class0, sampling_3_class1, sampling_3_class2])\n",
+    "#testing_data = pd.concat([sampling_4_class0, sampling_4_class1, sampling_4_class2])\n",
+    "#testing_data = pd.concat([sampling_5_class0, sampling_5_class1, sampling_5_class2])\n",
+    "#testing_data = pd.concat([sampling_6_class0, sampling_6_class1, sampling_6_class2])\n",
+    "#testing_data = pd.concat([sampling_7_class0, sampling_7_class1, sampling_7_class2])\n",
+    "#testing_data = pd.concat([sampling_8_class0, sampling_8_class1, sampling_8_class2])\n",
+    "testing_data = pd.concat([sampling_9_class0, sampling_9_class1, sampling_9_class2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 239,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "100"
+      ]
+     },
+     "execution_count": 239,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "indices_testing_data = testing_data['Index'].tolist()\n",
+    "len(testing_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 240,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "900"
+      ]
+     },
+     "execution_count": 240,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# TRAINING DATA\n",
+    "training_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_testing_data))].reset_index(drop=True)\n",
+    "indices_training_data = training_data['Index'].tolist()\n",
+    "len(training_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 241,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model 2: binary classifier 'class 2 vs. rest' (binary label 1 = original class 2).\n",
+    "# The training set is down-sampled to 35 positives and 18 + 18 negatives.\n",
+    "labeled_pos_0, labeled_pos_1, labeled_pos_2 = [\n",
+    "    training_data.loc[training_data['Label'] == label].reset_index(drop=True)\n",
+    "    for label in (0, 1, 2)\n",
+    "]\n",
+    "sampling_class0 = labeled_pos_0.sample(n=18, random_state=random_state)\n",
+    "sampling_class1 = labeled_pos_1.sample(n=18, random_state=random_state)\n",
+    "sampling_class2 = labeled_pos_2.sample(n=35, random_state=random_state)\n",
+    "# (sample, original label, binary label)\n",
+    "for sample, old, new in ((sampling_class0, 0, 0), (sampling_class1, 1, 0), (sampling_class2, 2, 1)):\n",
+    "    sample.loc[sample['Label'] == old, 'Label'] = new\n",
+    "training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
+    "\n",
+    "# same mapping for the test fold, applied in this order: 0 -> 0, 1 -> 0, 2 -> 1\n",
+    "for old, new in ((0, 0), (1, 0), (2, 1)):\n",
+    "    testing_data.loc[testing_data['Label'] == old, 'Label'] = new\n",
+    "classifier = GaussianNB()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 181,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model 1: binary classifier 'class 1 vs. rest' (binary label 1 = original class 1).\n",
+    "# The training set is down-sampled to 35 positives and 18 + 18 negatives.\n",
+    "labeled_pos_0, labeled_pos_1, labeled_pos_2 = [\n",
+    "    training_data.loc[training_data['Label'] == label].reset_index(drop=True)\n",
+    "    for label in (0, 1, 2)\n",
+    "]\n",
+    "sampling_class0 = labeled_pos_0.sample(n=18, random_state=random_state)\n",
+    "sampling_class1 = labeled_pos_1.sample(n=35, random_state=random_state)\n",
+    "sampling_class2 = labeled_pos_2.sample(n=18, random_state=random_state)\n",
+    "# (sample, original label, binary label)\n",
+    "for sample, old, new in ((sampling_class0, 0, 0), (sampling_class1, 1, 1), (sampling_class2, 2, 0)):\n",
+    "    sample.loc[sample['Label'] == old, 'Label'] = new\n",
+    "training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
+    "\n",
+    "# same mapping for the test fold, applied in this order: 0 -> 0, 1 -> 1, 2 -> 0\n",
+    "for old, new in ((0, 0), (1, 1), (2, 0)):\n",
+    "    testing_data.loc[testing_data['Label'] == old, 'Label'] = new\n",
+    "classifier = GaussianNB()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model 0: binary classifier 'class 0 vs. rest' (binary label 1 = original class 0).\n",
+    "# The training set is down-sampled to 35 positives and 18 + 18 negatives.\n",
+    "labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n",
+    "labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n",
+    "labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n",
+    "sampling_class0 = labeled_pos_0.sample(n=35, random_state=random_state)\n",
+    "sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 1\n",
+    "sampling_class1 = labeled_pos_1.sample(n=18, random_state=random_state)\n",
+    "sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 0\n",
+    "sampling_class2 = labeled_pos_2.sample(n=18, random_state=random_state)\n",
+    "sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 0\n",
+    "training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
+    "\n",
+    "# Relabel the test fold in a single step. Chained in-place assignments\n",
+    "# (0 -> 1, then 1 -> 0, then 2 -> 0) flip the freshly written 1s back to 0,\n",
+    "# which leaves the test fold without any positive example.\n",
+    "testing_data['Label'] = (testing_data['Label'] == 0).astype(testing_data['Label'].dtype)\n",
+    "classifier = GaussianNB()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "737\n",
+      "36\n",
+      "126\n"
+     ]
+    }
+   ],
+   "source": [
+    "# MNB: three-class model trained on a balanced sample of 24 articles per class.\n",
+    "labeled_pos_0, labeled_pos_1, labeled_pos_2 = [\n",
+    "    training_data.loc[training_data['Label'] == label].reset_index(drop=True)\n",
+    "    for label in (0, 1, 2)\n",
+    "]\n",
+    "# number of training articles available per class\n",
+    "for pool in (labeled_pos_0, labeled_pos_1, labeled_pos_2):\n",
+    "    print(len(pool))\n",
+    "sampling_class0, sampling_class1, sampling_class2 = [\n",
+    "    pool.sample(n=24, random_state=random_state)\n",
+    "    for pool in (labeled_pos_0, labeled_pos_1, labeled_pos_2)\n",
+    "]\n",
+    "training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
+    "indices_training_data = training_data['Index'].tolist()\n",
+    "classifier = MultinomialNB()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 242,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One document per article: title and body text joined into a single string.\n",
+    "X = training_data['Title'] + '. ' + training_data['Text']\n",
+    "y = training_data['Label']\n",
+    "U = testing_data['Title'] + '. ' + testing_data['Text']\n",
+    "v = testing_data['Label']\n",
+    "\n",
+    "# Bag-of-words counts; the vocabulary is fitted on the training documents only.\n",
+    "# NOTE: from here on training_data / testing_data hold dense count matrices,\n",
+    "# no longer the DataFrames they were built from.\n",
+    "cv = CountVectorizer()\n",
+    "training_data = cv.fit_transform(X, y).toarray()\n",
+    "testing_data = cv.transform(U).toarray()\n",
+    "\n",
+    "# Train the selected classifier and predict the test fold.\n",
+    "classifier.fit(training_data, y)\n",
+    "predictions_test = classifier.predict(testing_data)\n",
+    "\n",
+    "# Annotate the estimated labels in df; articles outside the test fold stay NaN.\n",
+    "df['Estimated'] = df['Index'].map(dict(zip(indices_testing_data, predictions_test)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 243,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "###############\n",
+      "69\n",
+      "1\n",
+      "###############\n",
+      "12\n",
+      "2\n",
+      "###############\n",
+      "metrics:\n",
+      "\n",
+      "69\n",
+      "2\n",
+      "1\n",
+      "12\n",
+      "###############\n",
+      "2\n",
+      "69\n",
+      "12\n",
+      "1\n",
+      "###############\n",
+      "98.57142857142858\n",
+      "85.18518518518519\n",
+      "84.52380952380952\n",
+      "###############\n",
+      "14.285714285714285\n",
+      "66.66666666666666\n",
+      "84.52380952380952\n",
+      "###############\n",
+      "56.42857142857143\n",
+      "75.92592592592592\n",
+      "84.52380952380952\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Model 0-2: confusion counts and metrics of the binary one-vs-rest model.\n",
+    "# The predictions are compared with the remapped test labels v. df['Label'] must\n",
+    "# not be used here: it still holds the original classes 0/1/2, so every test\n",
+    "# article of original class 2 would be dropped and the binary classes mismatched.\n",
+    "def percent(part, whole):\n",
+    "    # an empty denominator (class never predicted / never present) counts as 0 %\n",
+    "    return part / whole * 100 if whole else 0.0\n",
+    "\n",
+    "truth = np.asarray(v)\n",
+    "\n",
+    "# <estimated>_<true>: number of test articles per (estimated, true) label pair\n",
+    "print('###############')\n",
+    "zero_0 = int(np.sum((predictions_test == 0) & (truth == 0)))\n",
+    "print(zero_0)\n",
+    "zero_1 = int(np.sum((predictions_test == 0) & (truth == 1)))\n",
+    "print(zero_1)\n",
+    "print('###############')\n",
+    "one_0 = int(np.sum((predictions_test == 1) & (truth == 0)))\n",
+    "print(one_0)\n",
+    "one_1 = int(np.sum((predictions_test == 1) & (truth == 1)))\n",
+    "print(one_1)\n",
+    "print('###############')\n",
+    "\n",
+    "print('metrics:')\n",
+    "print()\n",
+    "\n",
+    "total = zero_0 + zero_1 + one_0 + one_1\n",
+    "\n",
+    "# label 0 taken as the positive class\n",
+    "tp_0, tn_0, fp_0, fn_0 = zero_0, one_1, zero_1, one_0\n",
+    "for count in (tp_0, tn_0, fp_0, fn_0):\n",
+    "    print(count)\n",
+    "print('###############')\n",
+    "\n",
+    "# label 1 taken as the positive class: the roles are swapped\n",
+    "tp_1, tn_1, fp_1, fn_1 = one_1, zero_0, one_0, zero_1\n",
+    "for count in (tp_1, tn_1, fp_1, fn_1):\n",
+    "    print(count)\n",
+    "print('###############')\n",
+    "\n",
+    "prec_0 = percent(tp_0, tp_0 + fp_0)\n",
+    "print(prec_0)\n",
+    "rec_0 = percent(tp_0, tp_0 + fn_0)\n",
+    "print(rec_0)\n",
+    "acc_0 = percent(tp_0 + tn_0, total)\n",
+    "print(acc_0)\n",
+    "print('###############')\n",
+    "\n",
+    "prec_1 = percent(tp_1, tp_1 + fp_1)\n",
+    "print(prec_1)\n",
+    "rec_1 = percent(tp_1, tp_1 + fn_1)\n",
+    "print(rec_1)\n",
+    "acc_1 = percent(tp_1 + tn_1, total)\n",
+    "print(acc_1)\n",
+    "print('###############')\n",
+    "\n",
+    "# unweighted mean over both classes\n",
+    "print((prec_1 + prec_0) / 2)\n",
+    "print((rec_1 + rec_0) / 2)\n",
+    "print((acc_1 + acc_0) / 2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "confusion matrix:\n",
+      "###############\n",
+      "62\n",
+      "0\n",
+      "0\n",
+      "/\n",
+      "12\n",
+      "3\n",
+      "11\n",
+      "/\n",
+      "8\n",
+      "0\n",
+      "5\n",
+      "###############\n",
+      "\n",
+      "class 0:\n",
+      "\n",
+      "TP: 62\n",
+      "TN: 19\n",
+      "FP: 0\n",
+      "FN: 20\n",
+      "\n",
+      "class 1:\n",
+      "\n",
+      "TP: 3\n",
+      "TN: 75\n",
+      "FP: 23\n",
+      "FN: 0\n",
+      "\n",
+      "class 2:\n",
+      "\n",
+      "TP: 5\n",
+      "TN: 77\n",
+      "FP: 8\n",
+      "FN: 11\n",
+      "###############\n",
+      "\n",
+      "METRICS:\n",
+      "\n",
+      "class 0:\n",
+      "\n",
+      "precision: 100.0\n",
+      "recall: 75.61\n",
+      "accuracy: 80.2\n",
+      "\n",
+      "class 1:\n",
+      "\n",
+      "precision: 11.54\n",
+      "recall: 100.0\n",
+      "accuracy: 77.23\n",
+      "\n",
+      "class 2:\n",
+      "\n",
+      "precision: 38.46\n",
+      "recall: 31.25\n",
+      "accuracy: 81.19\n",
+      "\n",
+      "Average Metrics:\n",
+      "\n",
+      "precision: 50.0\n",
+      "recall: 68.95325203252033\n",
+      "accuracy: 79.53795379537955\n"
+     ]
+    }
+   ],
+   "source": [
+    "# MNB: 3-class confusion matrix and per-class metrics for the annotated test rows.\n",
+    "def percent(part, whole):\n",
+    "    # a class that is never predicted or never present has an empty denominator;\n",
+    "    # report 0 % instead of raising ZeroDivisionError\n",
+    "    return part / whole * 100 if whole else 0.0\n",
+    "\n",
+    "classes = (0, 1, 2)\n",
+    "# confusion[e][t]: number of rows estimated as class e whose manual label is t\n",
+    "confusion = [[len(df.loc[(df['Estimated'] == e) & (df['Label'] == t)]) for t in classes]\n",
+    "             for e in classes]\n",
+    "\n",
+    "print('confusion matrix:')\n",
+    "print('###############')\n",
+    "for e in classes:\n",
+    "    if e > 0:\n",
+    "        print('/')\n",
+    "    for t in classes:\n",
+    "        print(confusion[e][t])\n",
+    "print('###############')\n",
+    "print()\n",
+    "total = sum(sum(row) for row in confusion)\n",
+    "\n",
+    "# one-vs-rest counts per class\n",
+    "counts = {}\n",
+    "for c in classes:\n",
+    "    tp = confusion[c][c]\n",
+    "    fp = sum(confusion[c]) - tp                 # estimated as c, labeled otherwise\n",
+    "    fn = sum(row[c] for row in confusion) - tp  # labeled c, estimated otherwise\n",
+    "    tn = total - tp - fp - fn\n",
+    "    counts[c] = (tp, tn, fp, fn)\n",
+    "    if c > 0:\n",
+    "        print()\n",
+    "    print('class {}:'.format(c))\n",
+    "    print()\n",
+    "    print('TP: {}'.format(tp))\n",
+    "    print('TN: {}'.format(tn))\n",
+    "    print('FP: {}'.format(fp))\n",
+    "    print('FN: {}'.format(fn))\n",
+    "print('###############')\n",
+    "print()\n",
+    "print('METRICS:')\n",
+    "print()\n",
+    "\n",
+    "precision, recall, accuracy = {}, {}, {}\n",
+    "for c in classes:\n",
+    "    tp, tn, fp, fn = counts[c]\n",
+    "    precision[c] = percent(tp, tp + fp)\n",
+    "    recall[c] = percent(tp, tp + fn)\n",
+    "    accuracy[c] = percent(tp + tn, total)\n",
+    "    print('class {}:'.format(c))\n",
+    "    print()\n",
+    "    print('precision: {}'.format(round(precision[c], 2)))\n",
+    "    print('recall: {}'.format(round(recall[c], 2)))\n",
+    "    print('accuracy: {}'.format(round(accuracy[c], 2)))\n",
+    "    print()\n",
+    "\n",
+    "# unweighted (macro) average; summed as class 1 + class 2 + class 0 because\n",
+    "# float addition is order-sensitive in the last digits\n",
+    "print('Average Metrics:')\n",
+    "print()\n",
+    "print('precision: {}'.format((precision[1] + precision[2] + precision[0]) / 3))\n",
+    "print('recall: {}'.format((recall[1] + recall[2] + recall[0]) / 3))\n",
+    "print('accuracy: {}'.format((accuracy[1] + accuracy[2] + accuracy[0]) / 3))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/visualization/Labeling_plot_190404.png
+++ b/visualization/Labeling_plot_190404.png
--- a/visualization/proba_after_round_1_stratified.png
+++ b/visualization/proba_after_round_1_stratified.png
--- a/visualization/probabilities_after_round_10.png
+++ b/visualization/probabilities_after_round_10.png
--- a/visualization/probabilities_after_round_11_svm.png
+++ b/visualization/probabilities_after_round_11_svm.png
--- a/visualization/probabilities_after_round_12_mnb.png
+++ b/visualization/probabilities_after_round_12_mnb.png
--- a/visualization/probabilities_after_round_13_mnb.png
+++ b/visualization/probabilities_after_round_13_mnb.png
--- a/visualization/probabilities_after_round_13_svm.png
+++ b/visualization/probabilities_after_round_13_svm.png
--- a/visualization/probabilities_after_round_14_svm.png
+++ b/visualization/probabilities_after_round_14_svm.png
--- a/visualization/probabilities_round_15_svm_190502.png
+++ b/visualization/probabilities_round_15_svm_190502.png