update tri-model and try word2vec

This commit is contained in:
annealias 2019-03-25 12:41:10 +01:00
parent c87a85b818
commit ea0a132bd6
10 changed files with 21173 additions and 199 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -24,7 +24,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 20,
+    "execution_count": 1,
     "metadata": {},
     "outputs": [],
     "source": [
@ -209,7 +209,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 7,
+    "execution_count": 4,
     "metadata": {},
     "outputs": [],
     "source": [
@ -218,7 +218,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 8,
+    "execution_count": 6,
     "metadata": {},
     "outputs": [
     {
@ -233,7 +233,7 @@
    ],
    "source": [
     "# read current data set from csv\n",
-    "df = pd.read_csv('../data/interactive_labeling_round_{}_temp.csv'.format(m),\n",
+    "df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n",
     "                 sep='|',\n",
     "                 usecols=range(1,13), # drop first column 'unnamed'\n",
     "                 encoding='utf-8',\n",
@ -725,27 +725,15 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 1,
+    "execution_count": 4,
     "metadata": {},
-    "outputs": [
-     {
-      "ename": "NameError",
-      "evalue": "name 'pd' is not defined",
-      "output_type": "error",
-      "traceback": [
-       "---------------------------------------------------------------------------",
-       "NameError                                 Traceback (most recent call last)",
-       "<ipython-input-1-a51d7411db70> in <module>",
-       "      3 # read current data set from csv",
-       "      4 m = 9",
-       "----> 5 df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),",
-       "      6                  sep='|',",
-       "      7                  usecols=range(1,13), # drop first column 'unnamed'",
-       "NameError: name 'pd' is not defined"
-      ]
-     }
-    ],
+    "outputs": [],
     "source": [
     "# THIS CELL IS OPTIONAL\n",
     "\n",
     "# read current data set from csv\n",
-    "m = 9\n",
-    "df = pd.read_csv('../data/interactive_labeling_round_{}_corrected.csv'.format(m),\n",
+    "m = 11\n",
+    "df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n",
     "                 sep='|',\n",
     "                 usecols=range(1,13), # drop first column 'unnamed'\n",
     "                 encoding='utf-8',\n",

View File

@ -0,0 +1,744 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import operator\n",
"import pickle\n",
"import random\n",
"\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"from IPython.core.interactiveshell import InteractiveShell\n",
"from IPython.display import display\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
"from MNBInteractive import MNBInteractive\n",
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
"from NaiveBayes import NaiveBayes"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# initialize random => reproducible sequence\n",
"random.seed(5)\n",
"\n",
"# set up wider display area\n",
"pd.set_option('display.max_colwidth', -1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Last round number: 11\n",
"Number of manually labeled articles: 1082\n",
"Number of manually unlabeled articles: 8918\n"
]
}
],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_round_11.csv',\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')\n",
"\n",
"# find current iteration/round number\n",
"m = int(df['Round'].max())\n",
"print('Last round number: {}'.format(m))\n",
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def show_next(index):\n",
" ''' this method displays an article's text and an interactive slider to set its label manually\n",
" '''\n",
" print('News article no. {}:'.format(index))\n",
" print()\n",
" print('HEADLINE:')\n",
" print(df.loc[df['Index'] == index, 'Title'])\n",
" print()\n",
" print('TEXT:')\n",
" print(df.loc[df['Index'] == index, 'Text'])\n",
" print()\n",
" print('ESTIMATED_0:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_0'])\n",
" print()\n",
" print('ESTIMATED_1:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_1'])\n",
" print()\n",
" print('ESTIMATED_2:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_2'])\n",
" \n",
" def f(x):\n",
" # save user input\n",
" df.loc[df['Index'] == index, 'Label'] = x\n",
"\n",
" # create slider widget for labels\n",
" interact(f, x = widgets.IntSlider(min=-1, max=2, step=1, value=df.loc[df['Index'] == index, 'Label']))\n",
" print('0: Other/Unrelated news, 1: Merger,')\n",
" print('2: Topics related to deals, investments and mergers')\n",
" print('___________________________________________________________________________________________________________')\n",
" print()\n",
" print()\n",
"\n",
"# list of article indices that will be shown next\n",
"label_next = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How to find a better model:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A) Multinomial Naive Bayes Algorithm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# toDo: läuft noch nicht\n",
"\n",
"# series of indices of recently estimated articles \n",
"indices_estimated = df.loc[df['Label'] != -1, 'Index'].tolist()\n",
"\n",
"# annotate probability\n",
"n = 0\n",
"for row in class_probs[0]:\n",
" index = indices_estimated[n]\n",
" # save estimated label\n",
" df.loc[index, 'Estimated_2'] = row[1]\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
"print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n",
"print()\n",
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
"print(\"Precision (Average): {}\".format(sum(precision_scores)/10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('confusion matrix:')\n",
"print('###############')\n",
"zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n",
"zero_0\n",
"zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])\n",
"zero_1\n",
"zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])\n",
"zero_2\n",
"print('/')\n",
"one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])\n",
"one_0\n",
"one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])\n",
"one_1\n",
"one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])\n",
"one_2\n",
"print('/')\n",
"\n",
"two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])\n",
"two_0\n",
"two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n",
"two_1\n",
"two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n",
"two_2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Building three separate models:\n",
"\n",
"B) One model per class: Funktioniert es besser wenn man 3 Modelle hat.\n",
"Begründung: wir sind interessiert an Klasse 1\n",
"Pro Klasse 1 Modell bauen (Hier ist das ziel das beste Modell zu finden. Dafür nehmen wir 1082 gelabelte Daten.)\n",
"3 Modelle => Ergebnis für 1 Sample: (70%, 40%, 80%) unklar => überprüfen\n",
"=> (90%, 90%, 90%) => überprüfen\n",
"liefert das bessere ambiguity Samples als oben\n",
"Stratified sample: (50 + 50 (1/2 von der anderen Klasse 1/2 der dritten Klasse))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"847"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_0)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"50"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"185"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_2)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# add three new columns for the three models, initialize with nans\n",
"df['Estimated_0'] = np.nan\n",
"df['Estimated_1'] = np.nan\n",
"df['Estimated_2'] = np.nan"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0_0 = labeled_pos_0.sample(n=100, random_state=5)\n",
"sampling_class0_1 = labeled_pos_1.sample(n=50, random_state=5)\n",
"sampling_class0_2 = labeled_pos_2.sample(n=50, random_state=5)\n",
"\n",
"sampling_class1_0 = labeled_pos_0.sample(n=25, random_state=5)\n",
"sampling_class1_1 = sampling_class0_1\n",
"sampling_class1_2 = labeled_pos_2.sample(n=25, random_state=5)\n",
"\n",
"sampling_class2_0 = labeled_pos_0.sample(n=50, random_state=5)\n",
"sampling_class2_1 = sampling_class0_1\n",
"sampling_class2_2 = labeled_pos_2.sample(n=100, random_state=5)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0_complement = pd.concat([sampling_class0_1, sampling_class0_2])\n",
"sampling_class1_complement = pd.concat([sampling_class1_0, sampling_class1_2])\n",
"sampling_class2_complement = pd.concat([sampling_class2_0, sampling_class2_1])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# prepare for binary classification:\n",
"# pos_label = 3\n",
"sampling_class0_0['Label'] = 3\n",
"sampling_class1_1['Label'] = 3\n",
"sampling_class2_2['Label'] = 3\n",
"# neg_label = 4\n",
"sampling_class0_complement['Label'] = 4\n",
"sampling_class1_complement['Label'] = 4\n",
"sampling_class2_complement['Label'] = 4"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0 = pd.concat([sampling_class0_0, sampling_class0_complement]).reset_index(drop=True)\n",
"sampling_class1 = pd.concat([sampling_class1_1, sampling_class1_complement]).reset_index(drop=True)\n",
"sampling_class2 = pd.concat([sampling_class2_2, sampling_class2_complement]).reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Apply Naive Bayes Model to estimate all labeled articles (1082 samples):"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data = sampling_class2\n",
"indices_train = train_data['Index'].tolist()\n",
"len(indices_train)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"882"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_train))].reset_index(drop=True)\n",
"len(test_data)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"test_data.loc[(test_data['Label'] == 0), 'Label'] = 3\n",
"test_data.loc[(test_data['Label'] == 1) | (test_data['Label'] == 2), 'Label'] = 4"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# split training data into text and label set\n",
"# join title and text\n",
"X = train_data['Title'] + '. ' + train_data['Text']\n",
"y = train_data['Label']"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# split testing data into text and label set\n",
"U = test_data['Title'] + '. ' + test_data['Text']\n",
"v = test_data['Label']"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"recall: 0.19949811794228356\n",
"precision: 0.803030303030303\n"
]
}
],
"source": [
"classifier = GaussianNB()\n",
"\n",
"cv = CountVectorizer()\n",
"\n",
"# probabilities of each class\n",
"class_probs = []\n",
"\n",
"# use sklearn CountVectorizer\n",
"# fit the training data and then return the matrix\n",
"training_data = cv.fit_transform(X, y).toarray()\n",
"# transform testing data and return the matrix\n",
"testing_data = cv.transform(U).toarray()\n",
"\n",
"#fit classifier\n",
"classifier.fit(training_data, y)\n",
"\n",
"#predict class\n",
"predictions_test = classifier.predict(testing_data)\n",
"\n",
"class_probs = classifier.predict_proba(testing_data)\n",
"\n",
"#print and store metrics\n",
"rec = recall_score(v, predictions_test, pos_label=3)\n",
"print('recall: ' + str(rec))\n",
"prec = precision_score(v, predictions_test, pos_label=3)\n",
"print('precision: ' + str(prec))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class_probs[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# series of indices of recently estimated articles \n",
"indices_estimated_2 = test_data['Index'].tolist()\n",
"\n",
"# annotate probability\n",
"n = 0\n",
"for row in class_probs:\n",
" index = indices_estimated_2[n]\n",
" # save estimated label\n",
" df.loc[index, 'Estimated_2'] = row[0]\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Apply Naive Bayes Model (10-fold-cross validation):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = sampling_class0\n",
"\n",
"X = dataset['Title'] + '. ' + dataset['Text']\n",
"y = dataset['Label']\n",
"\n",
"cv = CountVectorizer()\n",
"\n",
"# use stratified k-fold cross-validation as split method\n",
"skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)\n",
"\n",
"classifier = GaussianNB()\n",
"\n",
"# metrics\n",
"recall_scores = []\n",
"precision_scores = []\n",
"\n",
"# probabilities of each class (of each fold)\n",
"class_probs = []\n",
"# counts number of training samples observed in each class \n",
"class_counts = []\n",
"\n",
"# for each fold\n",
"for train, test in skf.split(X,y):\n",
" \n",
" # fit the training data and then return the matrix\n",
" training_data = cv.fit_transform(X[train], y[train]).toarray()\n",
" # transform testing data and return the matrix\n",
" testing_data = cv.transform(X[test]).toarray()\n",
"\n",
" #fit classifier\n",
" classifier.fit(training_data, y[train])\n",
" #predict class\n",
" predictions_train = classifier.predict(training_data)\n",
" predictions_test = classifier.predict(testing_data)\n",
"\n",
" #print and store metrics\n",
" rec = recall_score(y[test], predictions_test)\n",
" recall_scores.append(rec)\n",
" prec = precision_score(y[test], predictions_test)\n",
" precision_scores.append(prec)\n",
"\n",
" class_probs.append(classifier.class_prior_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# series of indices of recently estimated articles \n",
"indices_estimated_0 = sampling_class0['Index'].tolist()\n",
"\n",
"# annotate probability\n",
"n = 0\n",
"for row in class_probs:\n",
" index = indices_estimated_0[n]\n",
" # save estimated label\n",
" df.loc[index, 'Estimated_0'] = row[1]\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
"print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n",
"print()\n",
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
"print(\"Precision (Average): {}\".format(sum(precision_scores)/10))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of used samples:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"indices_all_samples = set((indices_estimated_0 + indices_estimated_1) + indices_estimated_2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(indices_all_samples)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check if there are samples where more than one class was marked with 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"indices = df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0), 'Index'].tolist()"
]
},
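{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (illustration only) of an alternative ambiguity criterion over the three probability columns created above: flag a labeled sample when no single model is confident, or when not exactly one model claims it. The 0.9 and 0.5 thresholds are arbitrary choices for demonstration, not tuned values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustration only: flag a sample as ambiguous if no model is confident,\n",
"# or if not exactly one model claims it (thresholds are arbitrary)\n",
"est = df.loc[df['Label'] != -1, ['Estimated_0', 'Estimated_1', 'Estimated_2']]\n",
"ambiguous = est[(est.max(axis=1) < 0.9) | ((est > 0.5).sum(axis=1) != 1)]\n",
"print('{} ambiguous samples to check manually'.format(len(ambiguous)))"
]
},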
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# save tri-model to csv \n",
"df.to_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n",
" sep='|',\n",
" mode='w',\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n",
" sep='|',\n",
" usecols=range(1,16), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for index in indices:\n",
" show_next(index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -53,7 +53,7 @@ class BagOfWords:
             for word in words:
                 word = word.lower()
                 # check if alphabetic and not stop word
-                if (word.isalpha()):# and word not in stop_words):
+                if (word.isalpha() and word not in stop_words):
                     if stemming:
                         # reduce word to its stem
                         word = stemmer.stem(word)

View File

@ -1,6 +1,6 @@
 '''
 Multinomial Naive Bayes Classifier
-======================
+==================================
 '''
 from BagOfWords import BagOfWords
@ -11,6 +11,7 @@ import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import recall_score, precision_score
+import sklearn
 from sklearn.model_selection import StratifiedKFold
 from sklearn.naive_bayes import MultinomialNB
@ -19,19 +20,13 @@ class MultinomialNaiveBayes:
     def make_mnb(dataset, sklearn_cv=True, percentile=100):
         '''fits naive bayes model with StratifiedKFold
         '''
-        print('# starting classical multinomial naive bayes')
+        print('# starting multinomial naive bayes')
         print('# ...')
         # split data into text and label set
         # join title and text
         X = dataset['Title'] + '. ' + dataset['Text']
-        print(X[:12])
         y = dataset['Label']
-        print(y[:12])
-        # everything is still correct up to this point...
         if sklearn_cv:
             cv = CountVectorizer()
@ -63,14 +58,6 @@ class MultinomialNaiveBayes:
         if sklearn_cv:
             # use sklearn CountVectorizer
             # fit the training data and then return the matrix
-            print('Title + Text von train')
-            # but something goes wrong from here on: somehow NaNs slip in...
-            print(X[train])
-            print('Label von train')
-            print(y[train])
             training_data = cv.fit_transform(X[train], y[train]).toarray()
             # transform testing data and return the matrix
@ -136,8 +123,12 @@ class MultinomialNaiveBayes:
         # classes in order used
         classes = classifier.classes_
+        print('average: recall, precision, f1 score')
+        print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
         # return classes and vector of class estimates
-        return recall_scores, precision_scores, f1_scores
+        return recall_scores, precision_scores, f1_scores, class_probs
     ######## only needed for resubstitution error ########
     def analyze_errors(training, testing):
@ -204,7 +195,4 @@ if __name__ == '__main__':
                      quotechar='\'')
     # select only labeled articles
-    #print('Anzahl aller gelabelten:')
-    #print(len(df.loc[df['Label'] != -1]))
-    #print(df.loc[df['Label'] != -1][:5])
-    MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reindex(), sklearn_cv=True, percentile=100)
+    MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=True, percentile=100)

View File

@ -0,0 +1,135 @@
'''
Multinomial Naive Bayes Classifier (doc2vec features)
======================================================
'''
from BagOfWords import BagOfWords

import csv
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

class MultinomialNaiveBayes:

    def make_mnb(dataset, sklearn_cv=True, percentile=100):
        '''fits naive bayes model with StratifiedKFold
        '''
        vector_size = 150

        def read_corpus(data, tokens_only=False):
            list_of_lists = []
            for i, text in enumerate(data):
                if tokens_only:
                    list_of_lists.append(BagOfWords.extract_words(text))
                else:
                    # for training data, add tags
                    list_of_lists.append(gensim.models.doc2vec.TaggedDocument(BagOfWords.extract_words(text), [i]))
            return list_of_lists

        print('# starting multinomial naive bayes')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)

        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        #class_prob = []
        # counts number of training samples observed in each class
        #class_counts = []

        # for each fold
        n = 0
        for train, test in skf.split(X,y):
            n += 1
            print('# split no. ' + str(n))

            # train document vectors with gensim
            training_data = read_corpus(X[train], tokens_only=False)
            testing_data = read_corpus(X[test], tokens_only=True)

            # instantiate a Doc2Vec object and embed both sets
            doc2vec_model = Doc2Vec(training_data, vector_size=vector_size, window=2, min_count=1, workers=4)
            training_data = [doc2vec_model.docvecs[i] for i in range(len(training_data))]
            testing_data = [doc2vec_model.infer_vector(vector) for vector in testing_data]

            # added fix: MultinomialNB requires non-negative features, but
            # doc2vec components can be negative => rescale to [0, 1]
            scaler = MinMaxScaler()
            training_data = scaler.fit_transform(training_data)
            testing_data = np.clip(scaler.transform(testing_data), 0, 1)

            #fit classifier
            classifier.fit(training_data, y[train])
            #predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            #print and store metrics
            rec = recall_score(y[test], predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test, average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))

        ##########################
        # probability estimates for the test vectors of the last fold
        class_probs = classifier.predict_proba(testing_data)

        # number of samples encountered for each class during fitting
        # this value is weighted by the sample weight when provided
        class_count = classifier.class_count_

        # classes in order used
        classes = classifier.classes_

        print('average: recall, precision, f1 score')
        print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)

        # return metrics and vector of class estimates
        return recall_scores, precision_scores, f1_scores, class_probs

if __name__ == '__main__':

    # read csv file
    print('# reading dataset')
    print('# ...')

    # read current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1,13), # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # select only labeled articles
    MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1][:100].reset_index(drop=True), sklearn_cv=False, percentile=100)

View File

@ -25,182 +25,184 @@ from sklearn.naive_bayes import GaussianNB
 class NaiveBayes:
     def make_naive_bayes(dataset, sklearn_cv=True, percentile=100):
         '''fits naive bayes model with StratifiedKFold,
         uses my BOW
         '''
         print('# fitting model')
         print('# ...')
         # split data into text and label set
         # join title and text
         X = dataset['Title'] + '. ' + dataset['Text']
         y = dataset['Label']
         if sklearn_cv:
             cv = CountVectorizer()
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
         classifier = GaussianNB()
         # metrics
         recall_scores = []
         precision_scores = []
-        f1_scores = []
+        #f1_scores = []
         # probabilities of each class (of each fold)
         class_prob = []
         # counts number of training samples observed in each class
         class_counts = []
         # for each fold
         n = 0
         for train, test in skf.split(X,y):
             n += 1
             print('# split no. ' + str(n))
             if sklearn_cv:
                 # use sklearn CountVectorizer
                 # fit the training data and then return the matrix
                 training_data = cv.fit_transform(X[train], y[train]).toarray()
                 # transform testing data and return the matrix
                 testing_data = cv.transform(X[test]).toarray()
             else:
                 # use my own BagOfWords python implementation
                 stemming = True
                 rel_freq = True
                 extracted_words = BagOfWords.extract_all_words(X[train])
                 vocab = BagOfWords.make_vocab(extracted_words)
                 # fit the training data and then return the matrix
                 training_data = BagOfWords.make_matrix(extracted_words,
                                                        vocab, rel_freq, stemming)
                 # transform testing data and return the matrix
                 extracted_words = BagOfWords.extract_all_words(X[test])
                 testing_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq, stemming)
             # apply select percentile
             selector = SelectPercentile(percentile=percentile)
             selector.fit(training_data, y[train])
             # new reduced data sets
             training_data_r = selector.transform(training_data)
             testing_data_r = selector.transform(testing_data)
             #fit classifier
             classifier.fit(training_data_r, y[train])
             #predict class
             predictions_train = classifier.predict(training_data_r)
             predictions_test = classifier.predict(testing_data_r)
             #print and store metrics
             rec = recall_score(y[test], predictions_test)
             print('rec: ' + str(rec))
             recall_scores.append(rec)
             prec = precision_score(y[test], predictions_test)
             print('prec: ' + str(prec))
             print('#')
             precision_scores.append(prec)
             # equation for f1 score
-            f1_scores.append(2 * (prec * rec)/(prec + rec))
+            #f1_scores.append(2 * (prec * rec)/(prec + rec))
             class_prob.append(classifier.class_prior_)
             class_counts.append(classifier.class_count_)
         ##########################
         #print metrics of test set
-        print('-------------------------')
-        print('prediction of testing set:')
-        print('Precision score: min = {}, max = {}, average = {}'
-              .format(min(precision_scores),
-                      max(precision_scores),
-                      sum(precision_scores)/float(len(precision_scores))))
-        print('Recall score: min = {}, max = {}, average = {}'
-              .format(min(recall_scores),
-                      max(recall_scores),
-                      sum(recall_scores)/float(len(recall_scores))))
-        print('F1 score: min = {}, max = {}, average = {}'
-              .format(min(f1_scores),
-                      max(f1_scores),
-                      sum(f1_scores)/float(len(f1_scores))))
-        print()
-        # print probability of each class
-        print('probability of each class:')
-        print()
-        print(class_prob)
-        print()
-        print('number of samples of each class:')
-        print()
-        print(class_counts)
-        print()
+        # print('-------------------------')
+        # print('prediction of testing set:')
+        # print('Precision score: min = {}, max = {}, average = {}'
+        #       .format(min(precision_scores),
+        #               max(precision_scores),
+        #               sum(precision_scores)/float(len(precision_scores))))
+        # print('Recall score: min = {}, max = {}, average = {}'
+        #       .format(min(recall_scores),
+        #               max(recall_scores),
+        #               sum(recall_scores)/float(len(recall_scores))))
+        # print('F1 score: min = {}, max = {}, average = {}'
+        #       .format(min(f1_scores),
+        #               max(f1_scores),
+        #               sum(f1_scores)/float(len(f1_scores))))
+        # print()
+        # # print probability of each class
+        # print('probability of each class:')
+        # print()
+        # print(class_prob)
+        # print()
+        # print('number of samples of each class:')
+        # print()
+        # print(class_counts)
+        # print()
+        return class_prob, class_counts, recall_scores, precision_scores#, f1_scores
 
         ##### only needed for overfit testing ###########
         #print('overfit testing: prediction of training set')
         #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
         #format(min(f1_scores_train), max(f1_scores_train),
         #sum(f1_scores_train)/float(len(f1_scores_train))))
         #print()
 
     ######## only needed for resubstitution error ########
     def analyze_errors(dataset):
         '''calculates resubstitution error
         shows indices of false classified articles
         uses Gaussian Bayes with train test split
         '''
         X_train_test = dataset['Title'] + ' ' + dataset['Text']
         y_train_test = dataset['Label']
         count_vector = CountVectorizer()
         # fit the training data and then return the matrix
         training_data = count_vector.fit_transform(X_train_test).toarray()
         # transform testing data and return the matrix
         testing_data = count_vector.transform(X_train_test).toarray()
         # Naive Bayes
         classifier = GaussianNB()
         # fit classifier
         classifier.fit(training_data, y_train_test)
         # Predict class
         predictions = classifier.predict(testing_data)
         print('Errors at index:')
         print()
         n = 0
         for i in range(len(y_train_test)):
             if y_train_test[i] != predictions[i]:
                 n += 1
                 print('error no.{}'.format(n))
                 print('prediction at index {} is: {}, but actual is: {}'
                       .format(i, predictions[i], y_train_test[i]))
                 print(X_train_test[i])
                 print(y_train_test[i])
                 print()
         #print metrics
         print('F1 score: ', format(f1_score(y_train_test, predictions)))
 
 if __name__ == '__main__':
 
     print('# starting naive bayes')
     print('# ...')
 
     file = '..\\data\\classification_labelled_corrected.csv'
 
     # read csv file
     print('# reading dataset')
     print('# ...')
 
     data = pd.read_csv(file,
                        sep='|',
                        engine='python',
                        decimal='.',
                        quotechar='\'',
                        quoting=csv.QUOTE_NONE)
 
     make_naive_bayes(data)
 
     print('#')
     print('# ending naive bayes')

src/SVM_multiclass.py (new file, 121 lines)
View File

@ -0,0 +1,121 @@
'''
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
from BagOfWords import BagOfWords

import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

class SVM:

    def make_svm(dataset, sklearn_cv=True):

        print('# fitting model')
        print('# ...')

        # split data into text and label set
        # articles' text (title + text)
        X = dataset['Title'] + '. ' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        matrix = pd.DataFrame()

        # fit the training data and then return the matrix
        if sklearn_cv:
            # use sklearn CountVectorizer
            matrix = CountVectorizer().fit_transform(X).toarray()
        else:
            # use own BOW implementation
            matrix = BagOfWords.fit_transform(X)

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True)

        # use only most important features; the percentile itself
        # is tuned via the grid below
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

        grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75, 100],
                                       'SVC__kernel': ['linear'],
                                       'SVC__gamma': [0.00001, 0.0001],
                                       'SVC__C': [0.1, 1]},
                            cv=skf,
                            scoring=make_scorer(f1_score, average='micro'))

        print('# fit classifier')
        print('# ...')

        grid.fit(matrix, y)

        # dict of results
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()

if __name__ == '__main__':

    print('# starting svm')
    print('# ...')

    #file = '..\\data\\classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')

    # data = pd.read_csv(file,
    #                    sep='|',
    #                    engine='python',
    #                    decimal='.',
    #                    quotechar='\'',
    #                    quoting=csv.QUOTE_NONE)

    # read current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1,13), # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    data = df.loc[df['Label'] != -1].reset_index(drop=True)

    use_count_vectorizer = True
    # call via the class, since make_svm is defined inside SVM
    SVM.make_svm(data, use_count_vectorizer)

    print('# ending svm')

View File

@ -1,6 +0,0 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}