update labeling and documentation

annealias 2019-04-17 13:20:46 +02:00
parent 94f501ab6d
commit 8ddf23d801
68 changed files with 39651 additions and 30404 deletions

File diff suppressed because one or more lines are too long (3 files)

Binary file not shown (13 files)

View File

@ -96,25 +96,25 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"m=11"
"m=16"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"This round number: 11\n",
"Number of manually labeled articles: 1082\n",
"Number of manually unlabeled articles: 8918\n"
"This round number: 16\n",
"Number of manually labeled articles: 1132\n",
"Number of manually unlabeled articles: 8868\n"
]
}
],
@ -842,8 +842,425 @@
" df.loc[index, 'Estimated'] = classes[i]\n",
" # annotate probability\n",
" df.loc[index, 'Probability'] = row[i]\n",
" n += 1\n",
"\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"m = 16"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"7"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"83.33333333333334"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"62.5"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"60.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"33.33333333333333"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"100.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"80.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"0.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"80.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"38.88888888888889"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"54.166666666666664"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"73.33333333333333"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('###############')\n",
"zero_0 = len(df.loc[(df['Round'] == m) & (df['Estimated'] == 0) & (df['Label'] == 0)])\n",
"zero_0\n",
@ -910,7 +1327,7 @@
"\n",
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
"prec_1\n",
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
"rec_1\n",
"acc_1 = (tp_1 + tn_1) / total * 100\n",
"acc_1\n",

View File

@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@ -42,6 +42,7 @@
"from sklearn.pipeline import Pipeline\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.semi_supervised import label_propagation\n",
"from sklearn.svm import LinearSVC\n",
"\n",
"from BagOfWords import BagOfWords\n",
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
@ -50,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@ -66,7 +67,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -105,16 +106,16 @@
},
{
"cell_type": "code",
"execution_count": 70,
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9"
"8"
]
},
"execution_count": 70,
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
@ -126,7 +127,7 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
@ -138,16 +139,16 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 119,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of labeled samples by class (0/1/2): 80/2/18\n",
"minimum of new labeled samples: 2\n",
"length of current data set for resubstitution error: 6\n"
"number of labeled samples by class (0/1/2): 79/8/13\n",
"minimum of new labeled samples: 8\n",
"length of current data set for resubstitution error: 24\n"
]
}
],
@ -162,7 +163,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
@ -174,7 +175,7 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
@ -187,62 +188,67 @@
"#training_data_5 = pd.concat([selec_0, selec_1, selec_2])\n",
"#training_data_6 = pd.concat([selec_0, selec_1, selec_2])\n",
"#training_data_7 = pd.concat([selec_0, selec_1, selec_2])\n",
"#training_data_8 = pd.concat([selec_0, selec_1, selec_2])\n",
"training_data_9 = pd.concat([selec_0, selec_1, selec_2])"
"training_data_8 = pd.concat([selec_0, selec_1, selec_2])\n",
"#training_data_9 = pd.concat([selec_0, selec_1, selec_2])"
]
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
"# indices of training samples\n",
"# idx_0 = training_data_0['Index'].tolist()\n",
"# idx_1 = training_data_1['Index'].tolist()\n",
"# idx_2 = training_data_2['Index'].tolist()\n",
"# idx_3 = training_data_3['Index'].tolist()\n",
"# idx_4 = training_data_4['Index'].tolist()\n",
"# idx_5 = training_data_5['Index'].tolist()\n",
"# idx_6 = training_data_6['Index'].tolist()\n",
"# idx_7 = training_data_7['Index'].tolist()\n",
"# idx_8 = training_data_8['Index'].tolist()\n",
"idx_9 = training_data_9['Index'].tolist()"
"#idx_0 = training_data_0['Index'].tolist()\n",
"#idx_1 = training_data_1['Index'].tolist()\n",
"#idx_2 = training_data_2['Index'].tolist()\n",
"#idx_3 = training_data_3['Index'].tolist()\n",
"#idx_4 = training_data_4['Index'].tolist()\n",
"#idx_5 = training_data_5['Index'].tolist()\n",
"#idx_6 = training_data_6['Index'].tolist()\n",
"#idx_7 = training_data_7['Index'].tolist()\n",
"idx_8 = training_data_8['Index'].tolist()\n",
"#idx_9 = training_data_9['Index'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 103,
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
"#train_all = training_data_0\n",
"train_0_8 = training_data_0.append([training_data_1, training_data_2, training_data_3, training_data_4, training_data_5, training_data_6, training_data_7, training_data_8])"
"#train_0_1 = training_data_0.append([training_data_1])\n",
"#train_0_2 = train_0_1.append([training_data_2])\n",
"#train_0_3 = train_0_2.append([training_data_3])\n",
"#train_0_4 = train_0_3.append([training_data_4])\n",
"#train_0_5 = train_0_4.append([training_data_5])\n",
"#train_0_6 = train_0_5.append([training_data_6])\n",
"#train_0_7 = train_0_6.append([training_data_7])\n",
"train_0_8 = train_0_7.append([training_data_8])"
]
},
{
"cell_type": "code",
"execution_count": 91,
"execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
"#idx_all = idx_0\n",
"idx_all = train_all['Index'].tolist()\n",
"#idx_9"
"train_all = train_0_8\n",
"idx_all = train_all['Index'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"execution_count": 125,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"117"
"111"
]
},
"execution_count": 92,
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
@ -257,26 +263,35 @@
"metadata": {},
"outputs": [],
"source": [
"train_0_9 = train_0_2.append(training_data_3)\n",
"len(train_0_3)"
"#train_0_9 = train_0_2.append(training_data_3)\n",
"#len(train_0_3)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"#m = 4"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"stratified number in round 9: 6\n",
"stratified number in total: 138\n"
"stratified number in round 7: 18\n",
"stratified number in total: 87\n"
]
}
],
"source": [
"print('stratified number in round {}: {}'.format(m, len(idx_9)))\n",
"print('stratified number in round {}: {}'.format(m, len(idx_7)))\n",
"print('stratified number in total: {}'.format(len(idx_all)))"
]
},
@ -288,22 +303,22 @@
"source": [
"# STEP 1:\n",
"# resubstitution error round\n",
"training_data = train_0_8\n",
"testing_data = training_data_9"
"#training_data = train_0_8\n",
"#testing_data = training_data_9"
]
},
{
"cell_type": "code",
"execution_count": 115,
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9"
"4"
]
},
"execution_count": 115,
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
@ -314,16 +329,26 @@
},
{
"cell_type": "code",
"execution_count": 160,
"execution_count": 126,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1082"
"111"
]
},
"execution_count": 160,
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
@ -331,10 +356,13 @@
"source": [
"# STEP 2: \n",
"# resubstitution error all labeled articles in round\n",
"training_data = train_all\n",
"testing_data = df.loc[(df['Round'] <= 11)]# & (~df['Index'].isin(idx_all))]\n",
"training_data = train_0_8\n",
"testing_data = df.loc[(df['Round'] == (m+1))]\n",
"\n",
"# & (~df['Index'].isin(idx_all))]\n",
"#df[~df['Index'].isin(idx_all)]\n",
"#df.loc[(df['Label'] == -1) | (df['Round'] >= 10)]\n",
"len(training_data)\n",
"len(testing_data)"
]
},
@ -345,24 +373,44 @@
"outputs": [],
"source": [
"# STEP 3:\n",
"training_data = train_all\n",
"testing_data = train_all"
"#training_data = train_all\n",
"#testing_data = train_all"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# STEP 4:\n",
"training_data = train_all\n",
"testing_data = train_all"
"#training_data = df.loc[df['Label'] != -1].reset_index(drop=True)\n",
"#testing_data = df.loc[df['Label'] == -1].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 161,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8918"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#len(testing_data)"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [
{
@ -385,7 +433,7 @@
},
{
"cell_type": "code",
"execution_count": 162,
"execution_count": 128,
"metadata": {},
"outputs": [
{
@ -425,7 +473,7 @@
},
{
"cell_type": "code",
"execution_count": 131,
"execution_count": 57,
"metadata": {},
"outputs": [
{
@ -438,10 +486,10 @@
{
"data": {
"text/plain": [
"7140"
"65"
]
},
"execution_count": 131,
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
},
@ -455,10 +503,10 @@
{
"data": {
"text/plain": [
"2007"
"26"
]
},
"execution_count": 131,
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
},
@ -472,10 +520,10 @@
{
"data": {
"text/plain": [
"736"
"9"
]
},
"execution_count": 131,
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
@ -570,27 +618,27 @@
},
{
"cell_type": "code",
"execution_count": 181,
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"# Nachberechnung fürs Latex:\n",
"zero_0 = 1\n",
"zero_1 = 1\n",
"zero_2 = 0\n",
"zero_0 = 80\n",
"zero_1 = 2\n",
"zero_2 = 14\n",
"\n",
"one_0 = 4\n",
"one_1 = 3\n",
"one_2 = 4\n",
"one_0 = 0\n",
"one_1 = 0\n",
"one_2 = 1\n",
"\n",
"two_0 = 0\n",
"two_1 = 1\n",
"two_2 = 1"
"two_1 = 0\n",
"two_2 = 3"
]
},
{
"cell_type": "code",
"execution_count": 163,
"execution_count": 129,
"metadata": {},
"outputs": [
{
@ -604,10 +652,10 @@
{
"data": {
"text/plain": [
"701"
"68"
]
},
"execution_count": 163,
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
@ -617,17 +665,17 @@
"0"
]
},
"execution_count": 163,
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"41"
"6"
]
},
"execution_count": 163,
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
@ -641,47 +689,10 @@
{
"data": {
"text/plain": [
"99"
"8"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"49"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"74"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"/\n"
]
},
{
"data": {
"text/plain": [
"47"
]
},
"execution_count": 163,
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
@ -691,17 +702,54 @@
"1"
]
},
"execution_count": 163,
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"70"
"11"
]
},
"execution_count": 163,
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"/\n"
]
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
@ -734,7 +782,7 @@
},
{
"cell_type": "code",
"execution_count": 182,
"execution_count": 159,
"metadata": {
"scrolled": false
},
@ -747,51 +795,51 @@
"\n",
"class 0:\n",
"\n",
"TP: 1\n",
"TN: 9\n",
"FP: 1\n",
"FN: 4\n",
"TP: 80\n",
"TN: 4\n",
"FP: 16\n",
"FN: 0\n",
"\n",
"class 1:\n",
"\n",
"TP: 3\n",
"TN: 2\n",
"FP: 8\n",
"TP: 0\n",
"TN: 97\n",
"FP: 1\n",
"FN: 2\n",
"\n",
"class 2:\n",
"\n",
"TP: 1\n",
"TN: 9\n",
"FP: 1\n",
"FN: 4\n",
"TP: 3\n",
"TN: 82\n",
"FP: 0\n",
"FN: 15\n",
"###############\n",
"\n",
"METRICS:\n",
"\n",
"class 0:\n",
"\n",
"precision: 50.0\n",
"recall: 20.0\n",
"accuracy: 66.67\n",
"precision: 83.33\n",
"recall: 100.0\n",
"accuracy: 84.0\n",
"\n",
"class 1:\n",
"\n",
"precision: 27.27\n",
"recall: 60.0\n",
"accuracy: 33.33\n",
"precision: 0.0\n",
"recall: 0.0\n",
"accuracy: 97.0\n",
"\n",
"class 2:\n",
"\n",
"precision: 50.0\n",
"recall: 20.0\n",
"accuracy: 66.67\n",
"precision: 100.0\n",
"recall: 16.67\n",
"accuracy: 85.0\n",
"\n",
"Average Metrics:\n",
"\n",
"precision: 42.42424242424242\n",
"recall: 33.333333333333336\n",
"accuracy: 55.55555555555554\n"
"precision: 61.111111111111114\n",
"recall: 38.888888888888886\n",
"accuracy: 88.66666666666667\n"
]
}
],
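Note: the cells above accumulate the per-round training frames with pairwise DataFrame.append calls. A sketch of the same accumulation with a single pd.concat, assuming per-round frames named like training_data_0 through training_data_8 (the toy frames here are placeholders):

import pandas as pd

# hypothetical stand-ins for training_data_0 ... training_data_8
round_frames = [pd.DataFrame({'Index': [10 * r, 10 * r + 1], 'Label': [0, 2]})
                for r in range(9)]

# one concat instead of a chain of append calls
train_0_8 = pd.concat(round_frames, ignore_index=True)
idx_all = train_0_8['Index'].tolist()
print(len(train_0_8))

pd.concat also avoids the repeated copying that chained append incurs.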

View File

@ -0,0 +1,374 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import operator\n",
"import pickle\n",
"import random\n",
"\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"from IPython.core.interactiveshell import InteractiveShell\n",
"from IPython.display import display\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.svm import SVC\n",
"from sklearn.svm import LinearSVC\n",
"\n",
"from BagOfWords import BagOfWords\n",
"from MNBInteractive import MNBInteractive\n",
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
"from NaiveBayes import NaiveBayes"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Last round number: 15\n",
"Number of manually labeled articles: 1122\n",
"Number of manually unlabeled articles: 8878\n"
]
}
],
"source": [
"# initialize random => reproducible sequence\n",
"random.seed(5)\n",
"random_state=5\n",
"\n",
"# set up wider display area\n",
"pd.set_option('display.max_colwidth', -1)\n",
"\n",
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_round_15_temp.csv',\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')\n",
"\n",
"# find current iteration/round number\n",
"m = int(df['Round'].max())\n",
"print('Last round number: {}'.format(m))\n",
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"52\n"
]
}
],
"source": [
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)\n",
"\n",
"max_sample = min(len(labeled_pos_0), len(labeled_pos_1), len(labeled_pos_2))\n",
"print(max_sample)\n",
"\n",
"sampling_class0 = labeled_pos_0.sample(n=max_sample, random_state=random_state)\n",
"sampling_class1 = labeled_pos_1.sample(n=max_sample, random_state=random_state)\n",
"sampling_class2 = labeled_pos_2.sample(n=max_sample, random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"# nur für subset EINDEUTIG\n",
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"testing_data = df.loc[(df['Label'] != -1) & (df['Index'].isin(subset_indices))].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"testing_data = df.loc[(df['Label'] != -1)].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"len(testing_data)\n",
"indices_predicted = df.loc[(df['Label'] != -1), 'Index'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"# split training data into text and label set\n",
"# join title and text\n",
"X = training_data['Title'] + '. ' + training_data['Text']\n",
"y = training_data['Label']\n",
"\n",
"# split testing data into text and label set\n",
"U = testing_data['Title'] + '. ' + testing_data['Text']\n",
"v = testing_data['Label']"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"#classifier = MultinomialNB(alpha=1.0e-10,\n",
"# fit_prior=False,\n",
"# class_prior=None)\n",
"#classifier = SVC()\n",
"classifier = LinearSVC()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"cv = CountVectorizer()\n",
"\n",
"# fit the training data and then return the matrix\n",
"training_data = cv.fit_transform(X, y).toarray()\n",
"# transform testing data and return the matrix\n",
"testing_data = cv.transform(U).toarray()"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"#fit classifier\n",
"classifier.fit(training_data, y)\n",
"\n",
"#predict class\n",
"predictions_test = classifier.predict(testing_data)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"# annotate estimated labels\n",
"df['Estimated'] = np.nan\n",
"\n",
"for i, value in enumerate(indices_predicted):\n",
" df.loc[(df['Index'] == value), 'Estimated'] = predictions_test[i]"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n",
"642\n",
"0\n",
"19\n",
"###############\n",
"55\n",
"50\n",
"36\n",
"###############\n",
"150\n",
"0\n",
"130\n",
"###############\n",
"metrics:\n",
"\n",
"642\n",
"216\n",
"19\n",
"205\n",
"###############\n",
"50\n",
"941\n",
"91\n",
"0\n",
"###############\n",
"130\n",
"747\n",
"150\n",
"55\n",
"###############\n",
"97.12556732223904\n",
"75.79693034238488\n",
"79.29759704251387\n",
"###############\n",
"35.46099290780142\n",
"100.0\n",
"91.58964879852127\n",
"###############\n",
"46.42857142857143\n",
"70.27027027027027\n",
"81.05360443622921\n",
"###############\n",
"59.67171055287063\n",
"82.02240020421839\n",
"83.98028342575479\n"
]
}
],
"source": [
"print('###############')\n",
"zero_0 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 0)])\n",
"print(zero_0)\n",
"zero_1 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 1)])\n",
"print(zero_1)\n",
"zero_2 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 2)])\n",
"print(zero_2)\n",
"print('###############')\n",
"one_0 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 0)])\n",
"print(one_0)\n",
"one_1 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 1)])\n",
"print(one_1)\n",
"one_2 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 2)])\n",
"print(one_2)\n",
"print('###############')\n",
"\n",
"two_0 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 0)])\n",
"print(two_0)\n",
"two_1 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 1)])\n",
"print(two_1)\n",
"two_2 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 2)])\n",
"print(two_2)\n",
"print('###############')\n",
"print('metrics:')\n",
"print()\n",
"\n",
"total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n",
"\n",
"tp_0 = zero_0\n",
"print(tp_0)\n",
"tn_0 = one_1 + one_2 + two_1 + two_2\n",
"print(tn_0)\n",
"fp_0 = zero_1 + zero_2\n",
"print(fp_0)\n",
"fn_0 = one_0 + two_0\n",
"print(fn_0)\n",
"print('###############')\n",
"\n",
"tp_1 = one_1\n",
"print(tp_1)\n",
"tn_1 = zero_0 + zero_2 + two_0 + two_2\n",
"print(tn_1)\n",
"fp_1 = one_0 + one_2\n",
"print(fp_1)\n",
"fn_1 = zero_1 + two_1\n",
"print(fn_1)\n",
"print('###############')\n",
"\n",
"tp_2 = two_2\n",
"print(tp_2)\n",
"tn_2 = zero_0 + zero_1 + one_0 + one_1\n",
"print(tn_2)\n",
"fp_2 = two_0 + two_1\n",
"print(fp_2)\n",
"fn_2 = zero_2 + one_2\n",
"print(fn_2)\n",
"print('###############')\n",
"\n",
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
"print(prec_0)\n",
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
"print(rec_0)\n",
"acc_0 = (tp_0 + tn_0) / total * 100\n",
"print(acc_0)\n",
"print('###############')\n",
"\n",
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
"print(prec_1)\n",
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
"print(rec_1)\n",
"acc_1 = (tp_1 + tn_1) / total * 100\n",
"print(acc_1)\n",
"print('###############')\n",
"\n",
"prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
"print(prec_2)\n",
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
"print(rec_2)\n",
"acc_2 = (tp_2 + tn_2) / total * 100\n",
"print(acc_2)\n",
"print('###############')\n",
"\n",
"print((prec_1 + prec_2 + prec_0) / 3)\n",
"print((rec_1 + rec_2 + rec_0) / 3)\n",
"print((acc_1 + acc_2 + acc_0) / 3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
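Note: this notebook vectorizes with CountVectorizer and fits LinearSVC as two separate steps. A sketch of the same flow wrapped in an sklearn Pipeline, on toy strings rather than the article corpus:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

X = ['stock falls after merger news', 'quarterly profit rises', 'chief executive resigns']
y = [2, 0, 1]  # toy labels

model = Pipeline([('cv', CountVectorizer()), ('svm', LinearSVC())])
model.fit(X, y)
print(model.predict(['merger announced today']))

A side benefit: the pipeline feeds the sparse document-term matrix straight into LinearSVC, so the .toarray() conversions used above are not needed.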

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long (2 files)

View File

@ -11,52 +11,135 @@ class LabelingPlotter():
# round numbers
round = [0,1,2,3,4,5,6,7,8,9]
# number of wrong estimated labels per round
wrong = [0/100, 19/100, 17/100, 16/100, 20/100, 12/100, 10/100, 20/100, 14/100, 12/100]
# # number of wrong estimated labels per round
# wrong = [0/100, 19/100, 17/100, 16/100, 20/100, 12/100, 10/100, 20/100, 14/100, 12/100]
# number of manual classified articles per class and round
man_0 = [84/100, 165/200, 247/300, 329/400, 410/500, 498/600, 586/700, 662/800, 741/900, 821/1000]
man_1 = [3/100, 7/200, 12/300, 16/400, 20/500, 22/600, 23/700, 29/800, 37/900, 39/1000]
man_2 = [13/100, 28/200, 41/300, 55/400, 70/500, 80/600, 91/700, 109/800, 122/900, 140/1000]
# # number of manual classified articles per class and round
# man_0 = [84/100, 165/200, 247/300, 329/400, 410/500, 498/600, 586/700, 662/800, 741/900, 821/1000]
# man_1 = [3/100, 7/200, 12/300, 16/400, 20/500, 22/600, 23/700, 29/800, 37/900, 39/1000]
# man_2 = [13/100, 28/200, 41/300, 55/400, 70/500, 80/600, 91/700, 109/800, 122/900, 140/1000]
# number of estimated labels per class and round
est_0 = [9873/9900, 9757/9800, 9603/9700, 9470/9600, 9735/9500, 9238/9400, 9107/9300, 8007/9200, 8064/9100, 7641/9000]
est_1 = [14/9900, 15/9800, 11/9700, 11/9600, 16/9500, 17/9400, 18/9300, 19/9200, 18/9100, 20/9000]
est_2 = [12/9900, 26/9800, 77/9700, 94/9600, 380/9500, 123/9400, 147/9300, 676/9200, 595/9100, 837/9000]
# # number of estimated labels per class and round
# est_0 = [9873/9900, 9757/9800, 9603/9700, 9470/9600, 9735/9500, 9238/9400, 9107/9300, 8007/9200, 8064/9100, 7641/9000]
# est_1 = [14/9900, 15/9800, 11/9700, 11/9600, 16/9500, 17/9400, 18/9300, 19/9200, 18/9100, 20/9000]
# est_2 = [12/9900, 26/9800, 77/9700, 94/9600, 380/9500, 123/9400, 147/9300, 676/9200, 595/9100, 837/9000]
fig, ax = plt.subplots(3, 1)
# naive study
rec_av_n = [np.nan, 33.3, 35.9, 38.1, 37.4, 33.3, 39.4, 40.7, 40.1, 38.9]
rec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 12.5, 0]
prec_av_n = [np.nan, 26.3, 44.56, 61.22, 49.7, 29.3, 63.3, 59.7, 77.1, 61.1]
prec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 100, 0]
acc_av_n = [np.nan,86,88.7,89.3,88,92,93.3,86.7,87.3,88.7]
acc_1_n = [np.nan,96,95, 96, 96 ,98,99, 94,93 ,97.0]
# stratified
rec_av_s = [np.nan, 44.53, 47.85, 56.45, 56.36, 58.71, 57.20, 62.13, 55.41, 46.85]
rec_1_s = [np.nan, 75.00, 50, 100, 75.00, 100, 100, 100, 75.00, 50.00]
prec_av_s = [np.nan, 36.8, 46.63, 41.42, 45.73, 33.69, 33.01, 52.68 , 44.68, 37.85]
prec_1_s = [np.nan, 6.67, 8.33, 9.52, 11.54, 8, 3.57, 16.67, 28.57, 5.00]
fig, ax = plt.subplots(4, 1)
ax[0].plot(round, wrong)
ax[2].set_xlabel('Iteration number')
ax[0].set_ylabel('Error rate')
ax[0].plot(round, rec_av_n, round, rec_av_s)
ax[0].set_ylabel('Recall (Average)')
ax[0].legend(('Naive Sampling', 'Stratified Sampling'))
ax[1].plot(round, prec_av_n, round, prec_av_s)
ax[1].set_ylabel('Precision (Average)')
ax[1].legend(('Naive Sampling', 'Stratified Sampling'))
ax[1].plot(round, man_0, round, man_1, round, man_2)
ax[1].set_ylabel('Fraction of manual labels')
ax[2].plot(round, est_0, round, est_1, round, est_2)
ax[2].set_ylabel('Fraction of estimated labels')
ax[2].plot(round, rec_1_n, round, rec_1_s)
ax[2].set_ylabel('Recall (Class 1)')
ax[2].legend(('Naive Sampling', 'Stratified Sampling'))
ax[3].plot(round, prec_1_n, round, prec_1_s)
ax[3].set_ylabel('Precision (Class 1)')
ax[3].legend(('Naive Sampling', 'Stratified Sampling'))
ax[3].set_xlabel('Iteration number')
# limit x axis
ax[0].set_xbound(lower=1, upper=9)
ax[1].set_xbound(lower=1, upper=9)
ax[2].set_xbound(lower=1, upper=9)
ax[3].set_xbound(lower=1, upper=9)
ax[0].set_ybound(lower=0)
ax[1].set_ybound(lower=0)
#ax[2].set_ybound(lower=0)
ax[2].set_ybound(lower=0)
ax[3].set_ybound(lower=0)
# insert legend
ax[1].legend(('class 0', 'class 1', 'class 2'))
ax[2].legend(('class 0', 'class 1', 'class 2'))
# ax[0].plot(round, rec_av_n)
# ax[2].set_xlabel('Iteration number')
# ax[0].set_ylabel('Metrics without stratified sampling')
fig.tight_layout()
# ax[1].plot(round, man_0, round, man_1, round, man_2)
# ax[1].set_ylabel('Fraction of manual labels')
plt.savefig('..\\visualization\\Labeling_Grafik_070219.png')
# ax[2].plot(round, est_0, round, est_1, round, est_2)
# ax[2].set_ylabel('Fraction of estimated labels')
# # limit x axis
# ax[0].set_xbound(lower=1, upper=9)
# ax[1].set_xbound(lower=1, upper=9)
# ax[2].set_xbound(lower=1, upper=9)
# ax[0].set_ybound(lower=0)
# ax[1].set_ybound(lower=0)
# #ax[2].set_ybound(lower=0)
# # insert legend
# ax[1].legend(('class 0', 'class 1', 'class 2'))
# ax[2].legend(('class 0', 'class 1', 'class 2'))
plt.savefig('..\\visualization\\Labeling_plot_190404.png')
plt.savefig('..\\visualization\\Labeling_plot_190404.eps')
plt.show()
def plot_labeling_rounds_naive():
# round numbers
round = [0,1,2,3,4,5,6,7,8,9]
# naive study
rec_av_n = [np.nan, 33.3, 35.9, 38.1, 37.4, 33.3, 39.4, 40.7, 40.1, 38.9]
rec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 12.5, 0]
prec_av_n = [np.nan, 26.3, 44.56, 61.22, 49.7, 29.3, 63.3, 59.7, 77.1, 61.1]
prec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 100, 0]
acc_av_n = [np.nan, 86,88.7,89.3,88,92,93.3,86.7,87.3,88.7]
acc_1_n = [np.nan, 96,95, 96, 96 ,98,99, 94,93 ,97.0]
fig, ax = plt.subplots(2, 1)
ax[0].plot(round, rec_av_n, round, prec_av_n, round, acc_av_n)
ax[0].set_ylabel('Average metrics')
ax[0].legend(('Recall', 'Precision', 'Accuracy'))
ax[1].plot(round, rec_1_n, round, prec_1_n, round, acc_1_n)
ax[1].set_ylabel('Class 1 metrics')
ax[1].legend(('Recall', 'Precision', 'Accuracy'))
ax[1].set_xlabel('Iteration number')
# limit x axis
ax[0].set_xbound(lower=1, upper=9)
ax[1].set_xbound(lower=1, upper=9)
# y axis
ax[1].set_ybound(lower=-5)
ax[0].set_ybound(lower=-5)
plt.savefig('..\\visualization\\Labeling_plot_190411.png')
plt.savefig('..\\visualization\\Labeling_plot_190411.eps')
plt.show()
def plot_cumulative():
# load pickle object
with open('../obj/array_class_probs_stratified_round_9.pkl', 'rb') as input:
with open('../obj/array_3model_svm_class2.pkl', 'rb') as input:
list = pickle.load(input)
# sort list in descending order
@ -80,18 +163,25 @@ class LabelingPlotter():
#ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])
ax.grid(True)
#ax.grid(True)
#ax.legend(loc='right')
#ax.set_title('Cumulative distribution of highest estimated probability')
ax.set_xlabel('Highest estimated probability')
ax.set_ylabel('Fraction of articles with this highest estimated probability')
ax.set_title('Predictions class 2 (SVM)')
# for iterations
#ax.set_xlabel('Highest estimated probability')
#ax.set_ylabel('Fraction of articles with this highest estimated probability')
# for 3-models
ax.set_xlabel('Estimated probability for class 2')
ax.set_ylabel('Fraction of articles with this probability')
#plt.axis([0.97, 1, 0.95, 1.01])
#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
plt.axis([0.5, 1, 0, 0.015]) # round 9 stratified
#plt.axis([0.5, 1, 0, 0.015]) # round 9 stratified
#plt.axis([0.65, 1, 0, 0.003]) # round 10
#plt.axis([0.7, 1, 0, 0.002]) # round 11
#ax.set_xbound(lower=0.5, upper=0.99)
plt.savefig('..\\visualization\\proba_stratified_round_9.png')
plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
#plt.savefig('..\\visualization\\proba_stratified_round_9.png')
#plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
plt.savefig('..\\visualization\\3model_svm_class2.png')
plt.savefig('..\\visualization\\3model_svm_class2.eps')
plt.show()
@ -121,4 +211,5 @@ class LabelingPlotter():
if __name__ == '__main__':
#LabelingPlotter.plot_correlation()
LabelingPlotter.plot_cumulative()
#LabelingPlotter.plot_cumulative()
LabelingPlotter.plot_labeling_rounds_naive()
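Note: plot_cumulative sorts the pickled probability estimates and plots how the articles are distributed over them. One way to compute such a curve, an empirical cumulative distribution, directly with numpy; the random values here are a stand-in for the pickled list, not the plotter's exact implementation:

import matplotlib.pyplot as plt
import numpy as np

probs = np.random.default_rng(5).uniform(0.5, 1.0, 1000)  # stand-in for the pickle
xs = np.sort(probs)
ys = np.arange(1, len(xs) + 1) / len(xs)  # fraction of articles up to each value

plt.plot(xs, ys)
plt.xlabel('Estimated probability for class 2')
plt.ylabel('Fraction of articles up to this probability')
plt.show()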

View File

@ -20,7 +20,7 @@ class MNBInteractive:
However, in practice, fractional counts such as tf-idf may also work.
'''
def estimate_mnb(labeled_data, unlabeled_data, sklearn_cv=False):
def estimate_mnb(labeled_data, unlabeled_data, sklearn_cv=True):
'''fits naive bayes model
'''

View File

@ -17,7 +17,7 @@ from sklearn.naive_bayes import MultinomialNB
class MultinomialNaiveBayes:
def make_mnb(dataset, sklearn_cv=True, percentile=100):
def make_mnb(dataset, sklearn_cv=True, percentile=100, bigram=False):
'''fits naive bayes model with StratifiedKFold
'''
print('# starting multinomial naive bayes')
@ -29,7 +29,13 @@ class MultinomialNaiveBayes:
y = dataset['Label']
if sklearn_cv:
cv = CountVectorizer()
if bigram:
cv = CountVectorizer(ngram_range=(2,2))
else:
# ignore company names
company_names_list = BagOfWords.load_company_names()
stopwords = list(BagOfWords.set_stop_words())
stopwords.extend(company_names_list)
cv = CountVectorizer(stop_words = stopwords)
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
@ -43,11 +49,6 @@ class MultinomialNaiveBayes:
precision_scores = []
f1_scores = []
# probabilities of each class (of each fold)
#class_prob = []
# counts number of training samples observed in each class
#class_counts = []
# for each fold
n = 0
for train, test in skf.split(X,y):
@ -90,13 +91,6 @@ class MultinomialNaiveBayes:
#predict class
predictions_train = classifier.predict(training_data_r)
predictions_test = classifier.predict(testing_data_r)
# print('train:')
# print(y[train])
# print('test:')
# print(y[test])
# print()
# print('pred')
# print(predictions_test)
#print and store metrics
rec = recall_score(y[test], predictions_test, average='weighted')
@ -113,22 +107,19 @@ class MultinomialNaiveBayes:
#class_counts.append(classifier.class_count_)
##########################
# probability estimates for the test vector (testing_data)
class_probs = classifier.predict_proba(testing_data)
# number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_
# classes in order used
classes = classifier.classes_
print('average: recall, precision, f1 score')
print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
print('Recall (Min): ' + str(min(recall_scores)))
print('Recall (Max): ' + str(max(recall_scores)))
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
print()
print('Precision (Min): ' + str(min(precision_scores)))
print('Precision (Max): ' + str(max(precision_scores)))
print('Precision (Average): ' + str(sum(precision_scores)/len(precision_scores)))
# return classes and vector of class estimates
return recall_scores, precision_scores, f1_scores, class_probs
return recall_scores, precision_scores
######## only needed for resubstitution error ########
def analyze_errors(training, testing):
@ -195,4 +186,4 @@ if __name__ == '__main__':
quotechar='\'')
# select only labeled articles
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=True, percentile=100)
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=False)
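Note: with the new bigram flag, make_mnb counts word pairs instead of single tokens. A short illustration of CountVectorizer(ngram_range=(2, 2)) on toy text, not the article corpus:

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(2, 2))  # bigrams only, as in the bigram=True branch
X = cv.fit_transform(['profit warning issued', 'profit warning withdrawn'])
print(sorted(cv.vocabulary_))  # ['profit warning', 'warning issued', 'warning withdrawn']
print(X.toarray())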

View File

@ -17,13 +17,14 @@ from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
class MultinomialNaiveBayes_Word2Vec:
def make_mnb(dataset, sklearn_cv=True, percentile=100):
def make_mnb(dataset):
'''fits naive bayes model with StratifiedKFold
'''
vector_size=150
def read_corpus(data, tokens_only=False):
list_of_lists = []
@ -35,7 +36,13 @@ class MultinomialNaiveBayes_Word2Vec:
list_of_lists.append(gensim.models.doc2vec.TaggedDocument(BagOfWords.extract_words(text), [i]))
return list_of_lists
print('# starting multinomial naive bayes')
def normalize_vector(two_dim_array, min, max):
norm_array = two_dim_array
for (x,y), value in np.ndenumerate(two_dim_array):
norm_array[x][y] = ((value - min) / (max - min))
return norm_array
print('# starting multinomial naive bayes with Word2Vec')
print('# ...')
# split data into text and label set
@ -46,20 +53,19 @@ class MultinomialNaiveBayes_Word2Vec:
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
classifier = MultinomialNB(alpha=1.0e-10,
fit_prior=False,
class_prior=None)
#classifier = MultinomialNB(alpha=1.0e-10,
# fit_prior=False,
# class_prior=None)
# classifier = SVC(probability=True,
# gamma='auto')
classifier = LinearSVC()
# metrics
recall_scores = []
precision_scores = []
f1_scores = []
# probabilities of each class (of each fold)
#class_prob = []
# counts number of training samples observed in each class
#class_counts = []
# for each fold
n = 0
for train, test in skf.split(X,y):
@ -68,28 +74,51 @@ class MultinomialNaiveBayes_Word2Vec:
print('# split no. ' + str(n))
# train model with gensim
training_data = read_corpus(X[train], tokens_only=False)
testing_data = read_corpus(X[test], tokens_only=True)
all_data = read_corpus(X, tokens_only=False)
tagged_train_data = read_corpus(X[train], tokens_only=False)
tagged_test_data = read_corpus(X[test], tokens_only=False)
# instantiate a Doc2Vec object
doc2vec_model = Doc2Vec(training_data, vector_size=100, window=2, min_count=2, epochs = 40)
model = Doc2Vec(vector_size=100,
min_count=20,
epochs=40,
negative=0,
workers=1,
seed=5,
hs=1)
# Question: no negative values allowed in here for Naive Bayes?
print(doc2vec_model.docvecs[0])
print(doc2vec_model.docvecs[1])
print(doc2vec_model.docvecs[2])
training_data = [doc2vec_model.docvecs[i] for i in range(len(training_data))]
# Question: do the testing data also need a tag?
testing_data = [doc2vec_model.infer_vector(vector) for vector in testing_data]
model.build_vocab(tagged_train_data)
model.train(tagged_train_data,
total_examples=model.corpus_count,
epochs=model.epochs)
model.docvecs.count
X_train=[model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
X_test=[model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]
# convert matrix
X_train=np.vstack(X_train)
X_test=np.vstack(X_test)
# min max for normalization
minimum = min(X_train.min(), X_test.min())
maximum = max(X_train.max(), X_test.max())
X_test_norm = normalize_vector(X_test, minimum, maximum)
X_train_norm = normalize_vector(X_train, minimum, maximum)
# shape vectors
X_test_norm.shape
y[test].shape
X_train_norm.shape
y[train].shape
#fit classifier
classifier.fit(training_data, y[train])
classifier.fit(X_train_norm, y[train])
#predict class
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
predictions_train = classifier.predict(X_train_norm)
predictions_test = classifier.predict(X_test_norm)
#print and store metrics
rec = recall_score(y[test], predictions_test, average='weighted')
@ -104,21 +133,25 @@ class MultinomialNaiveBayes_Word2Vec:
##########################
# probability estimates for the test vector (testing_data)
class_probs = classifier.predict_proba(testing_data)
#class_probs = classifier.predict_proba(X_test_norm)
# number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_
#class_count = classifier.class_count_
# classes in order used
classes = classifier.classes_
print('average: recall, precision, f1 score')
print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
#classes = classifier.classes_
print('Recall (Min): ' + str(min(recall_scores)))
print('Recall (Max): ' + str(max(recall_scores)))
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
print()
print('Precision (Min): ' + str(min(precision_scores)))
print('Precision (Max): ' + str(max(precision_scores)))
print('Precision (Average): ' + str(sum(precision_scores)/len(precision_scores)))
# return classes and vector of class estimates
return recall_scores, precision_scores, f1_scores, class_probs
return recall_scores, precision_scores, f1_scores#, class_probs
if __name__ == '__main__':
@ -135,4 +168,4 @@ if __name__ == '__main__':
quotechar='\'')
# select only labeled articles
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=False, percentile=100)
MultinomialNaiveBayes_Word2Vec.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))
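Note: normalize_vector above rescales each entry with an explicit ndenumerate loop. The same min-max rescaling can be written as one vectorized numpy expression; a sketch with a toy array:

import numpy as np

def normalize_vector(two_dim_array, minimum, maximum):
    # elementwise (value - min) / (max - min), as in the loop version
    return (two_dim_array - minimum) / (maximum - minimum)

X = np.array([[-1.0, 0.0], [1.0, 3.0]])
print(normalize_vector(X, X.min(), X.max()))  # all values land in [0, 1]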

View File

@ -0,0 +1,198 @@
'''
Multinomial Naive Bayes Classifier
==================================
'''
from BagOfWords import BagOfWords
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
class MultinomialNaiveBayes:
def make_mnb(dataset, sklearn_cv=True, percentile=100):
'''fits naive bayes model with StratifiedKFold
'''
print('# starting multinomial naive bayes')
print('# ...')
# split data into text and label set
# join title and text
X = dataset['Title'] + '. ' + dataset['Text']
y = dataset['Label']
if sklearn_cv:
cv = CountVectorizer(ngram_range = (1,2))
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
classifier = MultinomialNB(alpha=1.0e-10,
fit_prior=False,
class_prior=None)
# metrics
recall_scores = []
precision_scores = []
f1_scores = []
# probabilities of each class (of each fold)
#class_prob = []
# counts number of training samples observed in each class
#class_counts = []
# for each fold
n = 0
for train, test in skf.split(X,y):
n += 1
print('# split no. ' + str(n))
if sklearn_cv:
# use sklearn CountVectorizer
# fit the training data and then return the matrix
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()
else:
# use my own BagOfWords python implementation
stemming = True
rel_freq = True
extracted_words = BagOfWords.extract_all_words(X[train])
vocab = BagOfWords.make_vocab(extracted_words)
# fit the training data and then return the matrix
training_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
# transform testing data and return the matrix
extracted_words = BagOfWords.extract_all_words(X[test])
testing_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
# apply select percentile
selector = SelectPercentile(percentile=percentile)
selector.fit(training_data, y[train])
# new reduced data sets
training_data_r = selector.transform(training_data)
testing_data_r = selector.transform(testing_data)
#fit classifier
classifier.fit(training_data_r, y[train])
#predict class
predictions_train = classifier.predict(training_data_r)
predictions_test = classifier.predict(testing_data_r)
# print('train:')
# print(y[train])
# print('test:')
# print(y[test])
# print()
# print('pred')
# print(predictions_test)
#print and store metrics
rec = recall_score(y[test], predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y[test], predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
#class_prob.append(classifier.class_prior_)
#class_counts.append(classifier.class_count_)
##########################
# probability estimates for the test vector (testing_data)
class_probs = classifier.predict_proba(testing_data)
# number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_
# classes in order used
classes = classifier.classes_
print('average: recall, precision, f1 score')
print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
# return classes and vector of class estimates
return recall_scores, precision_scores, f1_scores, class_probs
######## only needed for resubstitution error ########
def analyze_errors(training, testing):
'''calculates resubstitution error
shows indices of falsely classified articles
uses Multinomial Naive Bayes with train test split
'''
X_train = training['Title'] + ' ' + training['Text']
y_train = training['Label']
X_test = testing['Title'] + ' ' + testing['Text']
y_test = testing['Label']
count_vector = CountVectorizer()
# fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train).toarray()
# transform testing data and return the matrix
testing_data = count_vector.transform(X_test).toarray()
# Naive Bayes
classifier = MultinomialNB(alpha=1.0e-10,
fit_prior=False,
class_prior=None)
# fit classifier
classifier.fit(training_data, y_train)
# Predict class
predictions = classifier.predict(testing_data)
print(type(y_test))
print(len(y_test))
print(type(predictions))
print(len(predictions))
print('Errors at index:')
print()
n = 0
for i in range(len(y_test)):
if y_test[i] != predictions[i]:
n += 1
print('error no.{}'.format(n))
print('prediction at index {} is: {}, but actual is: {}'
.format(i, predictions[i], y_test[i]))
print(X_test[i])
print(y_test[i])
print()
#print metrics
print('F1 score: ', format(f1_score(y_test, predictions, average='weighted')))
if __name__ == '__main__':
# read csv file
print('# reading dataset')
print('# ...')
# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
sep='|',
usecols=range(1,13), # drop first column 'unnamed'
encoding='utf-8',
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
# select only labeled articles
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=True, percentile=100)

View File

@ -48,7 +48,6 @@ class NaiveBayes:
# metrics
recall_scores = []
precision_scores = []
#f1_scores = []
# probabilities of each class (of each fold)
class_prob = []
@ -113,32 +112,15 @@ class NaiveBayes:
##########################
#print metrics of test set
# print('-------------------------')
# print('prediction of testing set:')
# print('Precision score: min = {}, max = {}, average = {}'
# .format(min(precision_scores),
# max(precision_scores),
# sum(precision_scores)/float(len(precision_scores))))
# print('Recall score: min = {}, max = {}, average = {}'
# .format(min(recall_scores),
# max(recall_scores),
# sum(recall_scores)/float(len(recall_scores))))
# print('F1 score: min = {}, max = {}, average = {}'
# .format(min(f1_scores),
# max(f1_scores),
# sum(f1_scores)/float(len(f1_scores))))
# print()
# # print probability of each class
# print('probability of each class:')
# print()
# print(class_prob)
# print()
# print('number of samples of each class:')
# print()
# print(class_counts)
# print()
print('Recall (Min): ' + str(min(recall_scores)))
print('Recall (Max): ' + str(max(recall_scores)))
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
print()
print('Precision (Min): ' + str(min(precision_scores)))
print('Precision (Max): ' + str(max(precision_scores)))
print('Precision (Average): ' + str(sum(precision_scores)/len(precision_scores)))
return class_prob, class_counts, recall_scores, precision_scores#, f1_scores
return class_prob, class_counts, recall_scores, precision_scores
##### only for overfit testing ###########
#print('overfit testing: prediction of training set')

src/SVMInteractive.py (new file, 83 lines)
View File

@ -0,0 +1,83 @@
'''
SVM Classifier for Interactive Labeling
=======================================
returns probabilities for classes needed for interactive labeling.
'''
from BagOfWords import BagOfWords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
class SVMInteractive:
def estimate_svm(labeled_data, unlabeled_data, sklearn_cv=True):
print('# SVM: starting interactive SVM...')
print()
# split labeled data into text and label set
# join title and text
X = labeled_data['Title'] + '. ' + labeled_data['Text']
y = labeled_data['Label']
# split unlabeled data into text and label set
# join title and text
U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']
l = unlabeled_data['Label']
if sklearn_cv:
cv = CountVectorizer()
# probability=True enables predict_proba; the probability
# estimates are fitted via an extra internal cross-validation
classifier = SVC(probability=True,
gamma='auto')
# probabilities of each class (of each fold)
class_probs = []
if sklearn_cv:
# use sklearn CountVectorizer
# fit the training data and then return the matrix
training_data = cv.fit_transform(X, y).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(U).toarray()
else:
# use my own BagOfWords python implementation
stemming = True
rel_freq = False
extracted_words = BagOfWords.extract_all_words(X)
vocab = BagOfWords.make_vocab(extracted_words)
# fit the training data and then return the matrix
print('# SVM: fit training data and calculate matrix...')
print()
training_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
# transform testing data and return the matrix
print('# SVM: transform testing data to matrix...')
print()
extracted_words = BagOfWords.extract_all_words(U)
testing_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
#fit classifier
classifier.fit(training_data, y)
# probability estimates for the test vector (testing_data)
class_probs = classifier.predict_proba(testing_data)
# classes in order used
classes = classifier.classes_
print('# ending SVM')
# return classes and vector of class estimates
return classes, class_probs
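Note: SVC(probability=True) makes predict_proba available by fitting Platt scaling through an extra internal cross-validation, which is why this variant is noticeably slower than plain SVC. A minimal usage sketch on toy vectors, not the article matrices:

from sklearn.svm import SVC

X = [[0, 0], [0, 1], [1, 0], [1, 1], [4, 4], [4, 5], [5, 4], [5, 5]]
y = [0, 0, 0, 0, 1, 1, 1, 1]

clf = SVC(probability=True, gamma='auto')
clf.fit(X, y)
print(clf.classes_)                     # column order of the probability estimates
print(clf.predict_proba([[2.5, 2.5]]))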

src/SVMInteractive_wp.py (new file, 81 lines)
View File

@ -0,0 +1,81 @@
'''
SVM Classifier for Interactive Labeling
=======================================
returns probabilities for classes needed for interactive labeling.
'''
from BagOfWords import BagOfWords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
class SVMInteractive_wp:
def estimate_svm(labeled_data, unlabeled_data, sklearn_cv=True):
print('# SVM: starting interactive SVM...')
print()
# split labeled data into text and label set
# join title and text
X = labeled_data['Title'] + '. ' + labeled_data['Text']
y = labeled_data['Label']
# split unlabeled data into text and label set
# join title and text
U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']
l = unlabeled_data['Label']
if sklearn_cv:
cv = CountVectorizer()
# LinearSVC provides no predict_proba, so this variant
# works with hard class predictions
classifier = LinearSVC()
# probabilities of each class (of each fold)
class_probs = []
if sklearn_cv:
# use sklearn CountVectorizer
# fit the training data and then return the matrix
training_data = cv.fit_transform(X, y).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(U).toarray()
else:
# use my own BagOfWords python implementation
stemming = True
rel_freq = False
extracted_words = BagOfWords.extract_all_words(X)
vocab = BagOfWords.make_vocab(extracted_words)
# fit the training data and then return the matrix
print('# SVM: fit training data and calculate matrix...')
print()
training_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
# transform testing data and return the matrix
print('# SVM: transform testing data to matrix...')
print()
extracted_words = BagOfWords.extract_all_words(U)
testing_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
#fit classifier
classifier.fit(training_data, y)
predictions_test = classifier.predict(testing_data)
# classes in order used
classes = classifier.classes_
print('# ending SVM')
# return classes and vector of class estimates
return classes, predictions_test
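Note: unlike the SVC variant above, LinearSVC exposes no predict_proba, so this file returns hard predictions. If calibrated probabilities were wanted from LinearSVC as well, one standard option (an assumption about intent, not part of this commit) wraps it in CalibratedClassifierCV:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

X = [[0, 0], [0, 1], [1, 0], [1, 1], [4, 4], [4, 5], [5, 4], [5, 5]]
y = [0, 0, 0, 0, 1, 1, 1, 1]

clf = CalibratedClassifierCV(LinearSVC(), cv=2)  # calibrates the decision scores
clf.fit(X, y)
print(clf.predict_proba([[2.5, 2.5]]))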

View File

@ -19,103 +19,143 @@ import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.svm import NuSVC
class SVM:
class SVM_multiclass:
def make_svm(dataset, sklearn_cv=True):
def make_svm(dataset, sklearn_cv=True, percentile=100):
print('# fitting model')
print('# starting multiclass svm')
print('# ...')
# split data into text and label set
# articles' text (title + text)
# join title and text
X = dataset['Title'] + '. ' + dataset['Text']
# articles' labels
y = dataset['Label']
matrix = pd.DataFrame()
# fit the training data and then return the matrix
if sklearn_cv:
# use sklearn CountVectorizer
matrix = CountVectorizer().fit_transform(X).toarray()
else:
# use own BOW implementation
matrix = BagOfWords.fit_transform(X)
# ignore company names
company_names_list = BagOfWords.load_company_names()
stopwords = list(BagOfWords.set_stop_words())
stopwords.extend(company_names_list)
cv = CountVectorizer(stop_words = stopwords)
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
# use only most important features
selector = SelectPercentile()
classifier = LinearSVC()
# for predict proba:
#classifier = SVC(probability=True,
# gamma='auto')
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
# metrics
recall_scores = []
precision_scores = []
accuracy_scores = []
f1_scores = []
grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75, 100],
'SVC__kernel': ['linear'],
'SVC__gamma': [0.00001, 0.0001],
'SVC__C': [0.1, 1]},
cv=skf,
scoring=make_scorer(f1_score, average='micro'))
# for each fold
n = 0
for train, test in skf.split(X,y):
print('# fit classifier')
print('# ...')
n += 1
print('# split no. ' + str(n))
grid.fit(matrix,y)
if sklearn_cv:
# use sklearn CountVectorizer
# fit the training data and then return the matrix
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()
else:
# use my own BagOfWords python implementation
stemming = True
rel_freq = True
extracted_words = BagOfWords.extract_all_words(X[train])
vocab = BagOfWords.make_vocab(extracted_words)
# DataFrame of results
df_results = grid.cv_results_
# fit the training data and then return the matrix
training_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
# transform testing data and return the matrix
extracted_words = BagOfWords.extract_all_words(X[test])
testing_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
# print results
######################
print('RESULTS:')
print('')
print('mean_test_score:')
print(df_results['mean_test_score'])
print('')
print('mean of means:')
print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
print('')
print('best score:')
print(grid.best_score_)
# apply select percentile
selector = SelectPercentile(percentile=percentile)
selector.fit(training_data, y[train])
# new reduced data sets
training_data_r = selector.transform(training_data)
testing_data_r = selector.transform(testing_data)
#fit classifier
classifier.fit(training_data_r, y[train])
#predict class
predictions_train = classifier.predict(training_data_r)
predictions_test = classifier.predict(testing_data_r)
#print and store metrics
rec = recall_score(y[test], predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y[test], predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
acc = recall_score(y[test], predictions_test, average='weighted')
accuracy_scores.append(acc)
print('acc: ' + str(acc))
print('#')
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
#class_prob.append(classifier.class_prior_)
#class_counts.append(classifier.class_count_)
#print(classifier.predict_proba(testing_data_r))
##########################
# classes in order used
classes = classifier.classes_
print('Recall (Min): ' + str(min(recall_scores)))
print('Recall (Max): ' + str(max(recall_scores)))
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
print()
print('best parameters set found on development set:')
print(grid.best_params_)
print('Precision (Min): ' + str(min(precision_scores)))
print('Precision (Max): ' + str(max(precision_scores)))
print('Precision (Average) :' + str(sum(precision_scores)/len(precision_scores)))
print()
print('Accuracy (Min): ' + str(min(accuracy_scores)))
print('Accuracy (Max): ' + str(max(accuracy_scores)))
print('Accuracy (Average) :' + str(sum(accuracy_scores)/len(accuracy_scores)))
if __name__ == '__main__':
# return classes and vector of class estimates
return recall_scores, precision_scores
print('# starting svm')
print('# ...')
if __name__ == '__main__':
#file = '..\\data\\classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('# ...')
# read csv file
print('# reading dataset')
print('# ...')
# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
sep='|',
usecols=range(1,13), # drop first column 'unnamed'
encoding='utf-8',
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
# data = pd.read_csv(file,
# sep='|',
# engine='python',
# decimal='.',
# quotechar='\'',
# quoting=csv.QUOTE_NONE)
# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
sep='|',
usecols=range(1,13), # drop first column 'unnamed'
encoding='utf-8',
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
data = df.loc[df['Label'] != -1].reset_index(drop=True)
use_count_vectorizer = True
make_svm(data, use_count_vectorizer)
print('# ending svm')
# select only labeled articles
SVM_multiclass.make_svm(df.loc[df['Label'] != -1].reset_index(drop=True),
sklearn_cv=True)
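
# The per-fold vectorize / select / fit sequence in make_svm() can also be
# expressed as one sklearn Pipeline, which guarantees that CountVectorizer
# and SelectPercentile only ever see the training folds. A minimal sketch
# under that assumption (not the author's code; X is the joined text series
# and y the labels, as prepared above):
from sklearn.model_selection import cross_validate

def cross_validate_pipeline(X, y, percentile=100):
    pipe = Pipeline([('cv', CountVectorizer()),
                     ('perc', SelectPercentile(percentile=percentile)),
                     ('svc', LinearSVC())])
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)
    # per-fold weighted recall/precision, as in the manual loop
    return cross_validate(pipe, X, y, cv=skf,
                          scoring=['recall_weighted', 'precision_weighted'])

# A side benefit: the pipeline keeps the matrices sparse. LinearSVC accepts
# sparse input, so the .toarray() densification above is not strictly needed.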

src/SVM_multiclass_grid.py Normal file
View File

@ -0,0 +1,123 @@
'''
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
(Here the same idea is applied to more than two categories.)
'''
from BagOfWords import BagOfWords

import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

class SVM_multiclass_grid:

    def make_svm(dataset, sklearn_cv=True):
        print('# fitting model')
        print('# ...')

        # split data into text and label set
        # articles' text (title + text)
        X = dataset['Title'] + '. ' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        matrix = pd.DataFrame()
        # fit the training data and then return the matrix
        if sklearn_cv:
            # use sklearn CountVectorizer
            # ignore company names
            company_names_list = BagOfWords.load_company_names()
            # concatenate the two lists (list.extend() returns None)
            stopwords = list(BagOfWords.set_stop_words()) + company_names_list
            matrix = CountVectorizer(stop_words=stopwords).fit_transform(X).toarray()
        else:
            # use own BOW implementation
            matrix = BagOfWords.fit_transform(X)

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # use only most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

        grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75, 100],
                                       'SVC__kernel': ['linear'],
                                       'SVC__gamma': [0.000001, 0.00001],
                                       'SVC__C': [0.01, 0.1]},
                            cv=skf,
                            scoring=make_scorer(recall_score, average='micro'))

        print('# fit classifier')
        print('# ...')

        grid.fit(matrix, y)

        # dict of per-combination results
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()

if __name__ == '__main__':
    print('# starting svm')
    print('# ...')

    #file = '..\\data\\classification_labelled_corrected.csv'
    # read csv file
    print('# reading dataset')
    print('# ...')
    # data = pd.read_csv(file,
    #                    sep='|',
    #                    engine='python',
    #                    decimal='.',
    #                    quotechar='\'',
    #                    quoting=csv.QUOTE_NONE)

    # read current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1,13), # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    data = df.loc[df['Label'] != -1].reset_index(drop=True)

    use_count_vectorizer = True
    # call via the class: make_svm is defined inside SVM_multiclass_grid
    SVM_multiclass_grid.make_svm(data, use_count_vectorizer)
    print('# ending svm')
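
# grid.cv_results_ is a plain dict, so the manual mean-of-means printing
# above can be replaced by one tabular view. Illustrative sketch only
# (the helper name is made up):
def print_grid_results(grid):
    results = pd.DataFrame(grid.cv_results_)
    cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
    # one row per parameter combination, best (rank 1) first
    print(results[cols].sort_values('rank_test_score').to_string(index=False))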

View File

@ -0,0 +1,152 @@
'''
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
(Here the same idea is applied to more than two categories.)
'''
from BagOfWords import BagOfWords

import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.svm import NuSVC

class SVM_multiclass:

    def make_svm(dataset, sklearn_cv=True, percentile=100):
        print('# starting multiclass svm')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            # ignore company names
            company_names_list = BagOfWords.load_company_names()
            # concatenate the two lists (list.extend() returns None)
            stopwords = list(BagOfWords.set_stop_words()) + company_names_list
            cv = CountVectorizer(stop_words=stopwords)

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        #classifier = LinearSVC()
        # for predict_proba:
        classifier = SVC(probability=True,
                         gamma='auto')

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # for each fold
        n = 0
        for train, test in skf.split(X, y):
            n += 1
            print('# split no. ' + str(n))

            if sklearn_cv:
                # use sklearn CountVectorizer
                # fit the training data and then return the matrix
                training_data = cv.fit_transform(X[train], y[train]).toarray()
                # transform testing data and return the matrix
                testing_data = cv.transform(X[test]).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X[train])
                vocab = BagOfWords.make_vocab(extracted_words)
                # fit the training data and then return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq, stemming)
                # transform testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq, stemming)

            # apply select percentile
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])
            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # print and store metrics
            rec = recall_score(y[test], predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test, average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))
            #class_prob.append(classifier.class_prior_)
            #class_counts.append(classifier.class_count_)
            print(classifier.predict_proba(testing_data_r))

        ##########################
        # classes in order used
        classes = classifier.classes_

        print('Recall (Min): ' + str(min(recall_scores)))
        print('Recall (Max): ' + str(max(recall_scores)))
        print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
        print()
        print('Precision (Min): ' + str(min(precision_scores)))
        print('Precision (Max): ' + str(max(precision_scores)))
        print('Precision (Average): ' + str(sum(precision_scores)/len(precision_scores)))

        # return vectors of per-fold scores
        return recall_scores, precision_scores

if __name__ == '__main__':
    # read csv file
    print('# reading dataset')
    print('# ...')

    # read current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1,13), # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # select only labeled articles
    SVM_multiclass.make_svm(df.loc[df['Label'] != -1].reset_index(drop=True),
                            sklearn_cv=True)
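
# The docstring's 'side of the gap' has a direct code counterpart:
# decision_function() returns the signed margins, and for one-vs-rest
# classifiers the predicted class is the one with the largest margin.
# Sketch only; clf (a fitted classifier) and X_new (a feature matrix)
# are illustrative names.
def predict_by_margin(clf, X_new):
    # shape (n_samples, n_classes) in the multiclass case
    margins = clf.decision_function(X_new)
    # equivalent to clf.predict(X_new) for one-vs-rest linear SVMs
    return clf.classes_[margins.argmax(axis=1)]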

View File

@ -22,314 +22,315 @@ from wordcloud import WordCloud
class VisualizerNews:

    datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')

    def plot_wordcloud_dataset():
        '''plots word cloud image of most common words in dataset.
        '''
        print('# preparing word cloud of 200 most common words...')
        print()
        # load new data set
        file = '..\\data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(file,
                                 delimiter='|',
                                 header=None,
                                 index_col=None,
                                 engine='python',
                                 usecols=[1,2],
                                 #nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')

        corpus = df_dataset[1] + '. ' + df_dataset[2]
        stemming = True
        rel_freq = True

        # find most common words in dataset
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        matrix = BagOfWords.make_matrix(extracted_words, vocab,
                                        rel_freq, stemming)
        dict = BagOfWords.make_dict_common_words(matrix, 200,
                                                 rel_freq, stemming)
        # save dict object
        with open('../obj/' + 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
            pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)

        wordcloud = WordCloud(background_color='white',
                              width=2400,
                              height=1200,
                              scale=2,
                              # true if bigram:
                              collocations=False)\
                              .generate_from_frequencies(dict)

        # display generated image
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.savefig('visualization\\WordCloud_{}.eps'
                    .format(VisualizerNews.datestring))
        plt.savefig('visualization\\WordCloud_{}.png'
                    .format(VisualizerNews.datestring))
        plt.show()

    def plot_histogram_companies():
        '''plots diagram of company names distribution

        count_names: list of company counts (int)
        x-axis: number of mentions of the company
        y-axis: frequency
        '''
        print('# preparing histogram of company mentions...')
        print()
        # # read data set
        # file = '..\\data\\cleaned_data_set_without_header.csv'
        # df = pd.read_csv(file,
        #                  delimiter='|',
        #                  header=None,
        #                  index_col=None,
        #                  engine='python',
        #                  usecols=[1,2],
        #                  #nrows=10,
        #                  quoting=csv.QUOTE_NONNUMERIC,
        #                  quotechar='\'')

        # # # only articles with label==1
        # # df_hits = df[df['Label'] == 1]
        # # texts = df_hits['Title'] + '. ' + df_hits['Text']
        # texts = df[1] + '. ' + df[2]

        # # list: count articles with company names
        # count_names = NER.count_companies(texts)

        # # sort list in descending order
        # count_names.sort(reverse=True)
        # # convert list to array
        # names = np.asarray(count_names)

        # load pickle object
        with open('../obj/dict_organizations.pkl', 'rb') as input:
            dict = pickle.load(input)
        # make list of dict's values
        count_companies = list(dict.values())
        # sort list in descending order
        count_companies.sort(reverse=True)
        # convert list to array
        names = np.asarray(count_companies)

        plt.xlabel('Count of articles that mention a specific company')
        # number of companies with this number of mentions
        plt.ylabel('Number of companies with this number of articles')
        num_bins = 300
        n, bins, patches = plt.hist(names, num_bins,
                                    color='darkred', alpha=1)
        plt.axis([1, 14, 0, 14000])

        # format axis labels for thousands (e.g. '10,000')
        plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
            .FuncFormatter(lambda x, p: format(int(x), ',')))

        # save to file
        plt.savefig('..\\visualization\\NER_{}.eps'
                    .format(VisualizerNews.datestring))
        plt.savefig('..\\visualization\\NER_{}.png'
                    .format(VisualizerNews.datestring))
        plt.show()

    def plot_histogram_text_lengths():
        '''plot histogram of article length

        x-axis: number of characters in article (without headline)
        y-axis: frequency
        '''
        print('# preparing histogram of text lengths...')
        print()
        # read data set
        filepath = '..\\data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(filepath,
                                 delimiter='|',
                                 header=None,
                                 index_col=None,
                                 engine='python',
                                 usecols=[2],
                                 #nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')
        # consider only Text, not Headline
        texts = df_dataset[2]

        # count characters in articles
        print('# counting characters in articles...')
        print()
        count_chars = []
        for text in texts:
            count_chars.append(len(text))
        # average number of characters
        av = int(sum(count_chars) / len(count_chars))
        print('# average length of news articles is {} characters'.format(av))
        print()
        # sort list in descending order
        count_chars.sort(reverse=True)
        # convert list to array
        names = np.asarray(count_chars)

        # plt.title('Length of News Articles')
        plt.xlabel('Number of characters in the article')
        plt.ylabel('Frequency')
        # number of vertical bins
        num_bins = 200
        n, bins, patches = plt.hist(names, num_bins,
                                    facecolor='darkslategrey', alpha=0.5)
        # [xmin, xmax, ymin, ymax] of axis
        plt.axis([300, 10000, 0, 500])

        # format axis labels for thousands (e.g. '10,000')
        plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
            .FuncFormatter(lambda x, p: format(int(x), ',')))

        # save plot
        plt.savefig('..\\visualization\\TextLength_{}.eps'\
                    .format(VisualizerNews.datestring))
        plt.savefig('..\\visualization\\TextLength_{}.png'\
                    .format(VisualizerNews.datestring))
        plt.show()

    def plot_pie_chart_of_sites():
        print('# preparing pie chart of news article sites...')
        print()
        # load data set
        filepath = '..\\data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(filepath,
                                 delimiter='|',
                                 header=None,
                                 #usecols=[3], # column 'Site'
                                 index_col=None,
                                 engine='python',
                                 #nrows=10,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')
        # find all different sites, group by 'Site'
        df_counts = df_dataset.groupby(3).count()
        # count occurrences of each site, count different 'Url's
        df_counts = df_counts.sort_values([5], ascending=False)

        fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))

        data = list(df_counts[5])
        # legend labels
        labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
                  'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']

        wedges, texts, autotexts = ax.pie(data, autopct='%1.0f%%', pctdistance=2.0,
                                          startangle=90, textprops=dict(color="w"))

        ax.legend(wedges, labels,
                  #title="News Article Sources",
                  loc="center left",
                  bbox_to_anchor=(1, 0, 0.5, 1),
                  prop={'size': 10},
                  fontsize=10)

        plt.setp(autotexts, size=8, weight="bold")
        # save before show: with some backends the figure is blank
        # once the window has been closed
        plt.savefig('..\\visualization\\Sites_{}.pdf'.format(VisualizerNews.datestring))
        plt.savefig('..\\visualization\\Sites_{}.pgf'.format(VisualizerNews.datestring))
        plt.show()

    def plot_hist_most_common_words(n_commons=10):
        print('# preparing histogram of most common words...')
        print()
        # load data set
        df = pd.read_csv('../data/interactive_labeling_round_16_temp.csv',
                         sep='|',
                         usecols=range(1,13), # drop first column 'unnamed'
                         encoding='utf-8',
                         quoting=csv.QUOTE_NONNUMERIC,
                         quotechar='\'')
        # corpus = df_dataset[1] + '. ' + df_dataset[2]
        # select only labeled articles
        df = df.loc[df['Label'] != -1].reset_index(drop=True)
        corpus = df['Title'] + '. ' + df['Text']

        stemming = False
        rel_freq = True

        # find most common words in dataset
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
                                        stemming)
        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
                                                 stemming)
        # save dict object
        #with open('obj/' + 'dict_10_most_common_words_merger' + '.pkl', 'wb') as f:
        #    pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)

        # load pickle object
        #with open('../obj/' + 'dict_200_most_common_words' + '.pkl', 'rb') as i:
        #    dict = pickle.load(i)

        # sort dict by value
        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
                             reverse=True))
        # n highest values as dict (word => count)
        n_dict = {}
        for i in range(n_commons):
            # next highest score
            next_highest = o_dict.popitem(last=False)
            n_dict[next_highest[0]] = next_highest[1]

        #plt.xlabel('Most common words in textual corpus')
        plt.ylabel('Relative frequency')

        labels = list(n_dict.keys())
        numbers = list(n_dict.values())
        nbars = n_commons
        plt.bar(np.arange(nbars),
                height=numbers,
                tick_label=labels,
                facecolor='royalblue')
        plt.savefig('..\\visualization\\10_most_common_words_mergers_{}.eps'
                    .format(VisualizerNews.datestring))
        plt.savefig('..\\visualization\\10_most_common_words_mergers_{}.png'
                    .format(VisualizerNews.datestring))
        plt.show()

    def plot_hist_num_comp_per_art():
        '''open pkl file of dict, plot histogram of number of different
        company names per article.
        '''
        # list of number of different companies per article (int)
        list = []
        with open('../obj/num_mentions_companies.pkl', 'rb') as input:
            list = pickle.load(input)

        # sort list in descending order
        list.sort(reverse=True)
        # convert list to array
        names = np.asarray(list)

        plt.xlabel('Number of different company names in the news article')
        plt.ylabel('Number of articles with this number of company names')
        num_bins = 100
        n, bins, patches = plt.hist(names, num_bins,
                                    facecolor='darkgreen', alpha=0.5)
        plt.axis([0, 30, 0, 1500])

        # format axis labels for thousands (e.g. '10,000')
        plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
            .FuncFormatter(lambda x, p: format(int(x), ',')))

        # save to file
        plt.savefig('..\\visualization\\NER_2_{}.eps'
                    .format(VisualizerNews.datestring))
        plt.savefig('..\\visualization\\NER_2_{}.png'
                    .format(VisualizerNews.datestring))
        plt.show()

if __name__ == '__main__':
    # VisualizerNews.plot_wordcloud_dataset()
    VisualizerNews.plot_histogram_companies()
    # VisualizerNews.plot_hist_num_comp_per_art()
    # VisualizerNews.plot_histogram_text_lengths()
    # VisualizerNews.plot_pie_chart_of_sites()
    # VisualizerNews.plot_hist_most_common_words(10)
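
# The FuncFormatter lambda for thousands separators is repeated in three of
# the plot functions above; it could be factored into one helper. Sketch
# only (the helper name is made up):
def format_thousands(axis):
    # e.g. 10000 -> '10,000'
    axis.set_major_formatter(matplotlib.ticker.FuncFormatter(
        lambda x, p: format(int(x), ',')))

# usage inside a plot function:
# format_thousands(plt.gca().yaxis)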

View File

@ -1,128 +0,0 @@
from BagOfWords import BagOfWords

import csv
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                 sep='|',
                 usecols=range(1,13), # drop first column 'unnamed'
                 encoding='utf-8',
                 quoting=csv.QUOTE_NONNUMERIC,
                 quotechar='\'')

dataset = df.loc[df['Label'] != -1][:100].reset_index(drop=True)

train = dataset[:15]
test = dataset[15:20].reset_index(drop=True)

classifier = MultinomialNB(alpha=1.0e-10,
                           fit_prior=False,
                           class_prior=None)

def make_tagged_document(row):
    # TaggedDocument: what goes where exactly?
    # tags (a list of tokens). Tags may be one or more unicode string tokens,
    # but typical practice (which will also be the most memory-efficient) is
    # for the tags list to include a unique integer id as the only tag.
    # so no label as tag at all?
    return TaggedDocument(words=BagOfWords.extract_words(row['Text']),
                          tags=[row['Label']])

tagged_train_data = train.apply(lambda row: make_tagged_document(row), axis=1)
print(tagged_train_data[0])
tagged_test_data = test.apply(lambda row: make_tagged_document(row), axis=1)
print(tagged_test_data[0])

model = Doc2Vec(vector_size=100,
                min_count=20,
                epochs=40,
                negative=0)
model.build_vocab(tagged_train_data)
model.train(tagged_train_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)
model.docvecs.count

y_train = np.array([doc.tags[0] for doc in tagged_train_data])
y_test = np.array([doc.tags[0] for doc in tagged_test_data])

X_train = [model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
X_test = [model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]

# X_train=np.vstack(X_train)
# X_test=np.vstack(X_test)
# X_test.shape
# y_test.shape
# X_train.shape
# y_train.shape

print(X_test)
print(y_test)
print(X_train)
print(y_train)

# reshape data
X_train = np.array(X_train)
X_test = np.array(X_test)
#X_train = X_train.reshape((X_train.shape[0],1,X_train.shape[1]))
#X_test = X_test.reshape((X_test.shape[0],1,X_test.shape[1]))
X_train.shape
X_test.shape

# metric lists (were referenced below without being initialized)
recall_scores = []
precision_scores = []
f1_scores = []

# fit classifier
classifier.fit(X_train, y_train)
# predict class
predictions_train = classifier.predict(X_train)
predictions_test = classifier.predict(X_test)

# print and store metrics
rec = recall_score(y_test, predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y_test, predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))

##########################
# probability estimates for the test vectors (X_test, not 'testing_data')
class_probs = classifier.predict_proba(X_test)
# number of samples encountered for each class during fitting;
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_
# classes in order used
classes = classifier.classes_

# print metrics and class probability estimates
print(recall_scores, precision_scores, f1_scores, class_probs)
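
# On the question in make_tagged_document(): the usual gensim pattern is to
# tag each document with a unique integer id and keep the class labels in a
# separate vector, instead of using the label itself as the tag. A sketch of
# that variant (an assumption, not what this script did):
def make_tagged_documents(frame):
    # one unique id per document; labels stay outside the TaggedDocuments
    docs = [TaggedDocument(words=BagOfWords.extract_words(row['Text']),
                           tags=[i])
            for i, row in frame.iterrows()]
    labels = frame['Label'].values
    return docs, labels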

View File

@ -1,131 +0,0 @@
from BagOfWords import BagOfWords

import csv
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                 sep='|',
                 usecols=range(1,13), # drop first column 'unnamed'
                 encoding='utf-8',
                 quoting=csv.QUOTE_NONNUMERIC,
                 quotechar='\'')

dataset = df.loc[df['Label'] != -1].reset_index(drop=True)

X = dataset['Title'] + '. ' + dataset['Text']
y = dataset['Label']

classifier = MultinomialNB(alpha=1.0e-10,
                           fit_prior=False,
                           class_prior=None)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)

def read_corpus(data, tokens_only=False):
    list_of_lists = []
    for i, text in enumerate(data):
        if tokens_only:
            list_of_lists.append(BagOfWords.extract_words(text))
        else:
            # for training data, add tags
            list_of_lists.append(gensim.models.doc2vec.TaggedDocument(
                BagOfWords.extract_words(text), [i]))
    return list_of_lists

tagged_train_data = read_corpus(X_train, tokens_only=False)
print('tagged_train_data[0]:')
print(tagged_train_data[0])

tagged_test_data = read_corpus(X_test, tokens_only=False)
print('tagged_test_data[0]:')
print(tagged_test_data[0])

model = Doc2Vec(vector_size=100,
                min_count=20,
                epochs=40,
                negative=0)
model.build_vocab(tagged_train_data)
model.train(tagged_train_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)
model.docvecs.count

#y_train=np.array([doc.tags[0] for doc in tagged_train_data])
#y_test=np.array([doc.tags[0] for doc in tagged_test_data])

X_train = [model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
X_test = [model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]

X_train = np.vstack(X_train)
X_test = np.vstack(X_test)

X_test.shape
y_test.shape
X_train.shape
y_train.shape

print('X_test:')
print(X_test)
print('y_test:')
print(y_test)
print('X_train:')
print(X_train)
print('y_train:')
print(y_train)

# here the fit fails: ValueError: Input X must be non-negative
# (doc2vec vectors contain negative values, which MultinomialNB rejects)

# metric lists (were referenced below without being initialized)
recall_scores = []
precision_scores = []
f1_scores = []

# fit classifier
classifier.fit(X_train, y_train)
# predict class
predictions_train = classifier.predict(X_train)
predictions_test = classifier.predict(X_test)

# print and store metrics
rec = recall_score(y_test, predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y_test, predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))

##########################
# probability estimates for the test vectors (X_test, not 'testing_data')
class_probs = classifier.predict_proba(X_test)
# number of samples encountered for each class during fitting;
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_
# classes in order used
classes = classifier.classes_

# print metrics and class probability estimates
print(recall_scores, precision_scores, f1_scores, class_probs)
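
# The ValueError noted above is inherent: MultinomialNB expects non-negative
# (count-like) features, while doc2vec embeddings are real-valued. Two
# possible ways out, both assumptions rather than the author's fix:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler

def fit_gaussian_nb(X_train, y_train):
    # GaussianNB models real-valued (including negative) features directly
    return GaussianNB().fit(X_train, y_train)

# alternative: rescale embeddings into [0, 1] so MultinomialNB accepts them
# scaler = MinMaxScaler()
# classifier.fit(scaler.fit_transform(X_train), y_train)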
