interactive labeling: documentation round 0-9
parent 213bb148de
commit a2c7a7279e
@@ -0,0 +1,689 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Notebook for calculation of the resubstitution error...\n",
    "\n",
    "Note:\n",
    "\n",
    "class 0: unrelated news\n",
    "\n",
    "class 1: mergers\n",
    "\n",
    "class 2: other deals, non-mergers, etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import operator\n",
    "import pickle\n",
    "import random\n",
    "\n",
    "from ipywidgets import interact, interactive, fixed, interact_manual\n",
    "import ipywidgets as widgets\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "from IPython.display import display\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_selection import SelectPercentile\n",
    "from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.semi_supervised import label_propagation\n",
    "\n",
    "from BagOfWords import BagOfWords\n",
    "from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
    "from MNBInteractive import MNBInteractive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# initialize random => reproducible sequence\n",
    "random_state = 5\n",
    "\n",
    "# set up wider display area\n",
    "pd.set_option('display.max_colwidth', -1)\n",
    "\n",
    "# show full text for print statement\n",
    "InteractiveShell.ast_node_interactivity = \"all\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Last iteration number: 11\n",
      "Number of manually labeled articles: 1082\n",
      "Number of manually unlabeled articles: 8918\n"
     ]
    }
   ],
   "source": [
    "# read current data set from csv\n",
    "df = pd.read_csv('../data/interactive_labeling_round_11.csv',\n",
    "                 sep='|',\n",
    "                 usecols=range(1,13), # drop first column 'unnamed'\n",
    "                 encoding='utf-8',\n",
    "                 quoting=csv.QUOTE_NONNUMERIC,\n",
    "                 quotechar='\\'')\n",
    "\n",
    "# find current iteration/round number\n",
    "m = int(df['Round'].max())\n",
    "print('Last iteration number: {}'.format(m))\n",
    "print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
    "print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate the resubstitution error for iteration 0-9 with stratified sampling.\n",
    "Start with iteration number 0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m = 3\n",
    "m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "# select all samples that were labeled with 0/1/2\n",
    "set_0 = df.loc[(df['Round'] == m) & (df['Label'] == 0)]\n",
    "set_1 = df.loc[(df['Round'] == m) & (df['Label'] == 1)]\n",
    "set_2 = df.loc[(df['Round'] == m) & (df['Label'] == 2)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of labeled samples by class (0/1/2): 82/4/14\n",
      "minimum of new labeled samples: 4\n",
      "length of current data set for resubstitution error: 12\n"
     ]
    }
   ],
   "source": [
    "# find minimum\n",
    "print('number of labeled samples by class (0/1/2): {}/{}/{}'.format(len(set_0), len(set_1), len(set_2)))\n",
    "strat_len = min(len(set_0), len(set_1), len(set_2))\n",
    "print('minimum of new labeled samples: {}'.format(strat_len))\n",
    "# length of current data set for resubstitution error\n",
    "print('length of current data set for resubstitution error: {}'.format(strat_len * 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "# random sampling for selection\n",
    "selec_0 = set_0.sample(n=strat_len, random_state=random_state)\n",
    "selec_1 = set_1.sample(n=strat_len, random_state=random_state)\n",
    "selec_2 = set_2.sample(n=strat_len, random_state=random_state)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "# newly added training data of the current round\n",
    "# training_data_0 = pd.concat([selec_0, selec_1, selec_2])\n",
    "# training_data_1 = pd.concat([selec_0, selec_1, selec_2])\n",
    "# training_data_2 = pd.concat([selec_0, selec_1, selec_2])\n",
    "# training_data_3 = pd.concat([selec_0, selec_1, selec_2])\n",
    "training_data_4 = pd.concat([selec_0, selec_1, selec_2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[5789.0,\n",
       " 4237.0,\n",
       " 2202.0,\n",
       " 4913.0,\n",
       " 821.0,\n",
       " 5973.0,\n",
       " 6198.0,\n",
       " 8490.0,\n",
       " 4815.0,\n",
       " 2386.0,\n",
       " 5177.0,\n",
       " 2482.0]"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# indices of training samples\n",
    "# idx_0 = training_data_0['Index'].tolist()\n",
    "# idx_1 = training_data_1['Index'].tolist()\n",
    "# idx_2 = training_data_2['Index'].tolist()\n",
    "# idx_3 = training_data_3['Index'].tolist()\n",
    "idx_4 = training_data_4['Index'].tolist()\n",
    "\n",
    "train_all = train_all.append(training_data_4)\n",
    "idx_all = train_all['Index'].tolist()\n",
    "idx_4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "36"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_0_2 = train_0_1.append(training_data_2)\n",
    "len(train_0_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "stratified number in round 3: 12\n",
      "stratified number in total: 48\n"
     ]
    }
   ],
   "source": [
    "print('stratified number in round {}: {}'.format(m, len(idx_3)))\n",
    "print('stratified number in total: {}'.format(len(idx_all)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "# STEP 1:\n",
    "# resubstitution error round\n",
    "training_data = training_data_3\n",
    "testing_data = training_data_3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "400"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# STEP 2: \n",
    "# resubstitution error all labeled articles in round\n",
    "training_data = training_data_3\n",
    "testing_data = df.loc[(df['Round'] <= m)]\n",
    "len(testing_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "# STEP 3:\n",
    "training_data = train_all\n",
    "testing_data = df.loc[(df['Round'] <= m)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "# STEP 4:\n",
    "training_data = train_0_2\n",
    "testing_data = training_data_3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# MNB: starting interactive multinomial naives bayes...\n",
      "\n",
      "# MNB: ending multinomial naive bayes\n"
     ]
    }
   ],
   "source": [
    "# call script\n",
    "classes, class_count, class_probs = MNBInteractive.estimate_mnb(training_data, testing_data, True)\n",
    "\n",
    "# series of indices of recently estimated articles \n",
    "indices_estimated = testing_data['Index'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "confusion matrix:\n",
      "###############\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "###############\n",
      "\n",
      "class 0:\n",
      "\n",
      "TP: 1\n",
      "TN: 7\n",
      "FP: 1\n",
      "FN: 3\n",
      "\n",
      "class 1:\n",
      "\n",
      "TP: 4\n",
      "TN: 3\n",
      "FP: 5\n",
      "FN: 0\n",
      "\n",
      "class 2:\n",
      "\n",
      "TP: 0\n",
      "TN: 7\n",
      "FP: 1\n",
      "FN: 4\n",
      "###############\n",
      "\n",
      "METRICS:\n",
      "\n",
      "class 0:\n",
      "\n",
      "precision: 50.0\n",
      "recall: 25.0\n",
      "accuracy: 66.667\n",
      "\n",
      "class 1:\n",
      "\n",
      "precision: 44.444\n",
      "recall: 100.0\n",
      "accuracy: 58.333\n",
      "\n",
      "class 2:\n",
      "\n",
      "precision: 0.0\n",
      "recall: 0.0\n",
      "accuracy: 58.333\n",
      "\n",
      "Average Metrics:\n",
      "\n",
      "precision: 31\n",
      "recall: 42\n",
      "accuracy: 61\n"
     ]
    }
   ],
   "source": [
    "n = 0 \n",
    "for row in class_probs:\n",
    "    for i in range(0, len(classes)):\n",
    "        index = indices_estimated[n]\n",
    "        # save estimated label\n",
    "        if np.amax(row) == row[i]:\n",
    "            testing_data.loc[index, 'Estimated'] = classes[i]\n",
    "            # annotate probability\n",
    "            testing_data.loc[index, 'Probability'] = row[i]\n",
    "    n += 1\n",
    "\n",
    "print('confusion matrix:')\n",
    "print('###############')\n",
    "zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n",
    "zero_0\n",
    "zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])\n",
    "zero_1\n",
    "zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])\n",
    "zero_2\n",
    "print('/')\n",
    "one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])\n",
    "one_0\n",
    "one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])\n",
    "one_1\n",
    "one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])\n",
    "one_2\n",
    "print('/')\n",
    "\n",
    "two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])\n",
    "two_0\n",
    "two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n",
    "two_1\n",
    "two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n",
    "two_2\n",
    "print('###############')\n",
    "print()\n",
    "total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n",
    "print('class 0:')\n",
    "print()\n",
    "tp_0 = zero_0\n",
    "print('TP: {}'.format(tp_0))\n",
    "tn_0 = one_1 + one_2 + two_1 + two_2\n",
    "print('TN: {}'.format(tn_0))\n",
    "fp_0 = zero_1 + zero_2\n",
    "print('FP: {}'.format(fp_0))\n",
    "fn_0 = one_0 + two_0\n",
    "print('FN: {}'.format(fn_0))\n",
    "print()\n",
    "print('class 1:')\n",
    "print()\n",
    "tp_1 = one_1\n",
    "print('TP: {}'.format(tp_1))\n",
    "tn_1 = zero_0 + zero_2 + two_0 + two_2\n",
    "print('TN: {}'.format(tn_1))\n",
    "fp_1 = one_0 + one_2\n",
    "print('FP: {}'.format(fp_1))\n",
    "fn_1 = zero_1 + two_1\n",
    "print('FN: {}'.format(fn_1))\n",
    "print()\n",
    "print('class 2:')\n",
    "print()\n",
    "tp_2 = two_2\n",
    "print('TP: {}'.format(tp_2))\n",
    "tn_2 = zero_0 + zero_1 + one_0 + one_1\n",
    "print('TN: {}'.format(tn_2))\n",
    "fp_2 = two_0 + two_1\n",
    "print('FP: {}'.format(fp_2))\n",
    "fn_2 = zero_2 + one_2\n",
    "print('FN: {}'.format(fn_2))\n",
    "print('###############')\n",
    "print()\n",
    "print('METRICS:')\n",
    "print()\n",
    "print('class 0:')\n",
    "print()\n",
    "prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
    "print('precision: {}'.format(round(prec_0, 3)))\n",
    "rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
    "print('recall: {}'.format(round(rec_0, 3)))\n",
    "acc_0 = (tp_0 + tn_0) / total * 100\n",
    "print('accuracy: {}'.format(round(acc_0, 3)))\n",
    "print()\n",
    "print('class 1:')\n",
    "print()\n",
    "prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
    "print('precision: {}'.format(round(prec_1, 3)))\n",
    "rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
    "print('recall: {}'.format(round(rec_1, 3)))\n",
    "acc_1 = (tp_1 + tn_1) / total * 100\n",
    "print('accuracy: {}'.format(round(acc_1, 3)))\n",
    "print()\n",
    "print('class 2:')\n",
    "print()\n",
    "prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
    "print('precision: {}'.format(round(prec_2, 3)))\n",
    "rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
    "print('recall: {}'.format(round(rec_2, 3)))\n",
    "acc_2 = (tp_2 + tn_2) / total * 100\n",
    "print('accuracy: {}'.format(round(acc_2, 3)))\n",
    "print()\n",
    "print('Average Metrics:')\n",
    "print()\n",
    "print('precision: {}'.format(round((prec_1 + prec_2 + prec_0) / 3), 3))\n",
    "print('recall: {}'.format(round((rec_1 + rec_2 + rec_0) / 3), 3))\n",
    "print('accuracy: {}'.format(round((acc_1 + acc_2 + acc_0) / 3), 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
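The last code cell of the notebook builds the 3x3 confusion matrix and the per-class metrics by hand. As a cross-check only, the same numbers could be obtained from sklearn.metrics, which the notebook already imports; the following sketch is not part of the committed notebook and assumes a frame like testing_data above with integer 'Label' and 'Estimated' columns.

# Sketch only: cross-check of the hand-built confusion matrix and metrics,
# assuming a DataFrame with integer 'Label' and 'Estimated' columns as
# produced by the cell above.
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

def metrics_report(frame: pd.DataFrame) -> None:
    y_true = frame['Label'].astype(int)
    y_pred = frame['Estimated'].astype(int)
    # rows are true classes 0/1/2, columns are estimated classes 0/1/2
    print(confusion_matrix(y_true, y_pred, labels=[0, 1, 2]))
    # per-class precision/recall plus macro averages (fractions, not percent)
    print(classification_report(y_true, y_pred, labels=[0, 1, 2], digits=3))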
@@ -1,6 +1,8 @@
+import csv
 import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
 import pickle
 
 class LabelingPlotter():
@@ -54,7 +56,7 @@ class LabelingPlotter():
 
     def plot_cumulative():
         # load pickle object
-        with open('../obj/array_class_probs_round_9.pkl', 'rb') as input:
+        with open('../obj/array_class_probs_round_11.pkl', 'rb') as input:
             list = pickle.load(input)
 
         # sort list in descending order
@@ -79,14 +81,43 @@ class LabelingPlotter():
 
 
         ax.grid(True)
-        ax.legend(loc='right')
+        #ax.legend(loc='right')
         #ax.set_title('Cumulative distribution of highest estimated probability')
         ax.set_xlabel('Highest estimated probability')
         ax.set_ylabel('Fraction of articles with this highest estimated probability')
-        #plt.axis([0.5, 0.99, 0, 0.006])
+        #plt.axis([0.5, 0.99, 0, 0.006]) #round 9
+        #plt.axis([0.65, 1, 0, 0.003]) # round 10
+        plt.axis([0.7, 1, 0, 0.002]) # round 11
         #ax.set_xbound(lower=0.5, upper=0.99)
+        plt.savefig('..\\visualization\\proba_round_11.png')
+        plt.savefig('..\\visualization\\proba_round_11.eps')
 
         plt.show()
 
+    def plot_correlation():
+        m = 10
+        df = pd.read_csv('../data/interactive_labeling_round_{}_temp.csv'.format(m),
+                         sep='|',
+                         usecols=range(1,13), # drop first column 'unnamed'
+                         encoding='utf-8',
+                         quoting=csv.QUOTE_NONNUMERIC,
+                         quotechar='\'')
+
+        # add boolean, if estimation was true
+        df['EstCorrect'] = np.nan
+        df.loc[(df['Label'] != -1) & (df['Label'] == df['Estimated']), 'EstCorrect'] = 1
+        df.loc[(df['Label'] != -1) & (df['Label'] != df['Estimated']), 'EstCorrect'] = 0
+
+        print('estimation was correct: {}'.format(len(df.loc[df['EstCorrect'] == 1])))
+        print('estimation was wrong: {}'.format(len(df.loc[df['EstCorrect'] == 0])))
+
+        x = df.loc[df['Label'] != -1, 'Probability'].tolist()
+        y = df.loc[df['Label'] != -1, 'EstCorrect'].tolist()
+
+        plt.plot(x, y, 'bo')
+        plt.axis([0.4, 1, -0.1, 1.1])
+        plt.show()
+
 if __name__ == '__main__':
-    LabelingPlotter.plot_cumulative()
+    #LabelingPlotter.plot_correlation()
+    LabelingPlotter.plot_cumulative()
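Most of the body of plot_cumulative() lies outside the hunks shown above. Purely as an illustration of the kind of cumulative view it produces, the sketch below draws an empirical distribution of the pickled probabilities; the pickle path and axis labels are taken from the diff, while the plotting code itself is an assumption, not the committed implementation.

# Sketch only: an empirical cumulative view of the pickled probabilities.
# File name and axis labels mirror the diff above; the rest is assumed.
import pickle
import numpy as np
import matplotlib.pyplot as plt

with open('../obj/array_class_probs_round_11.pkl', 'rb') as infile:
    probs = np.sort(np.asarray(pickle.load(infile)).ravel())

# fraction of articles whose highest estimated probability is at or below each value
fraction = np.arange(1, len(probs) + 1) / len(probs)
fig, ax = plt.subplots()
ax.step(probs, fraction, where='post')
ax.set_xlabel('Highest estimated probability')
ax.set_ylabel('Fraction of articles with this highest estimated probability')
ax.grid(True)
plt.show()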
@@ -24,7 +24,7 @@ class MNBInteractive:
         '''fits naive bayes model
         '''
 
-        print('# MNB: starting multinomial naives bayes...')
+        print('# MNB: starting interactive multinomial naives bayes...')
        print()
 
         # split labeled data into text and label set
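The notebook above consumes the triple returned by MNBInteractive.estimate_mnb with a nested loop over the probability rows. A compact equivalent is sketched below; it assumes that class_probs rows line up with indices_estimated exactly as in that notebook cell and is an illustration, not part of the commit.

# Sketch only: assign the most probable class per row with numpy instead of
# the nested loop used in the notebook. Assumes classes, class_probs and
# indices_estimated as derived in the notebook cell above.
import numpy as np

def assign_estimates(testing_data, classes, class_probs, indices_estimated):
    probs = np.asarray(class_probs)
    best = probs.argmax(axis=1)  # column of the highest probability per row
    for row, index in enumerate(indices_estimated):
        testing_data.loc[index, 'Estimated'] = classes[best[row]]
        testing_data.loc[index, 'Probability'] = probs[row, best[row]]
    return testing_data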
@@ -0,0 +1,175 @@
'''
Multinomial Naive Bayes Classifier
======================
'''

from BagOfWords import BagOfWords

import csv

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB

class MultinomialNaiveBayes:

    def make_mnb(dataset, sklearn_cv=True, percentile=100):
        '''fits naive bayes model with StratifiedKFold
        '''
        print('# starting classical multinomial naive bayes')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            cv = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)

        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        #class_prob = []
        # counts number of training samples observed in each class
        #class_counts = []

        # for each fold
        n = 0
        for train, test in skf.split(X,y):

            n += 1
            print('# split no. ' + str(n))

            if sklearn_cv:
                # use sklearn CountVectorizer
                # fit the training data and then return the matrix
                training_data = cv.fit_transform(X[train], y[train]).toarray()
                # transform testing data and return the matrix
                testing_data = cv.transform(X[test]).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X[train])
                vocab = BagOfWords.make_vocab(extracted_words)

                # fit the training data and then return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq, stemming)
                # transform testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq, stemming)

            # apply select percentile
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            #fit classifier
            classifier.fit(training_data_r, y[train])
            #predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)
            # print('train:')
            # print(y[train])
            # print('test:')
            # print(y[test])
            # print()
            # print('pred')
            # print(predictions_test)

            #print and store metrics
            rec = recall_score(y[test], predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test, average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))

            #class_prob.append(classifier.class_prior_)
            #class_counts.append(classifier.class_count_)

        ##########################
        # probability estimates for the test vector (testing_data)
        class_probs = classifier.predict_proba(testing_data)

        # number of samples encountered for each class during fitting
        # this value is weighted by the sample weight when provided
        class_count = classifier.class_count_

        # classes in order used
        classes = classifier.classes_

        # return classes and vector of class estimates
        return recall_scores, precision_scores, f1_scores

    ######## only needed for the resubstitution error ########
    def analyze_errors(training, testing):
        '''calculates resubstitution error
        shows indices of false classified articles
        uses Gaussian Bayes with train test split
        '''
        X_train = training['Title'] + ' ' + training['Text']
        y_train = training['Label']

        X_test = testing['Title'] + ' ' + testing['Text']
        y_test = testing['Label']

        count_vector = CountVectorizer()

        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train).toarray()

        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_test).toarray()

        # Naive Bayes
        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)
        # fit classifier
        classifier.fit(training_data, y_train)

        # Predict class
        predictions = classifier.predict(testing_data)

        print(type(y_test))
        print(len(y_test))
        print(type(predictions))
        print(len(predictions))

        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_test)):
            if y_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_test[i]))
                print(X_test[i])
                print(y_test[i])
                print()
        #print metrics
        print('F1 score: ', format(f1_score(y_test, predictions)))
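The new file above ships no usage example. A possible way to call it on the manually labeled rows, assuming the same DataFrame layout ('Title', 'Text', 'Label') the methods index into and the CSV path used in the notebook:

# Sketch only: run the 10-fold cross-validation on the manually labeled rows.
# Column names follow the code above; the CSV path matches the notebook.
import csv
import pandas as pd
from MultinomialNaiveBayes import MultinomialNaiveBayes

df = pd.read_csv('../data/interactive_labeling_round_11.csv', sep='|',
                 usecols=range(1, 13), encoding='utf-8',
                 quoting=csv.QUOTE_NONNUMERIC, quotechar='\'')
# keep only labeled articles and reset to a positional index for the fold splits
labeled = df.loc[df['Label'] != -1].reset_index(drop=True)

recall_scores, precision_scores, f1_scores = MultinomialNaiveBayes.make_mnb(labeled)
print('mean recall:    {:.3f}'.format(sum(recall_scores) / len(recall_scores)))
print('mean precision: {:.3f}'.format(sum(precision_scores) / len(precision_scores)))
print('mean f1:        {:.3f}'.format(sum(f1_scores) / len(f1_scores)))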
@@ -0,0 +1,6 @@
{
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 2
}