thesis-anne/src/2019-03-12-al-model-evaluation.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import operator\n",
    "import pickle\n",
    "import random\n",
    "\n",
    "from ipywidgets import interact, interactive, fixed, interact_manual\n",
    "import ipywidgets as widgets\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "from IPython.display import display\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "from MNBInteractive import MNBInteractive\n",
    "from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
    "from NaiveBayes import NaiveBayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed Python's built-in `random` module so that anything drawn from it is reproducible.\n",
    "# NOTE(review): this does not seed NumPy; the pandas `.sample(...)` calls further down\n",
    "# pass their own `random_state` explicitly.\n",
    "random.seed(5)\n",
    "\n",
    "# Do not truncate long cell contents (article texts) when frames/series are printed.\n",
    "# NOTE(review): pandas >= 1.0 deprecates -1 for this option in favour of None - confirm\n",
    "# against the pandas version this notebook is run with before changing the value.\n",
    "pd.set_option('display.max_colwidth', -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Last round number: 11\n",
      "Number of manually labeled articles: 1082\n",
      "Number of manually unlabeled articles: 8918\n"
     ]
    }
   ],
   "source": [
    "# load the data set as it was saved after labeling round 11\n",
    "df = pd.read_csv(\n",
    "    '../data/interactive_labeling_round_11.csv',\n",
    "    sep='|',\n",
    "    usecols=range(1,13), # skip the first, unnamed column\n",
    "    encoding='utf-8',\n",
    "    quoting=csv.QUOTE_NONNUMERIC,\n",
    "    quotechar='\\'',\n",
    ")\n",
    "\n",
    "# highest iteration/round number present in the data\n",
    "m = int(df['Round'].max())\n",
    "# articles that still carry the placeholder label -1 have not been labeled by hand\n",
    "is_unlabeled = df['Label'] == -1\n",
    "print('Last round number: {}'.format(m))\n",
    "print('Number of manually labeled articles: {}'.format(int((~is_unlabeled).sum())))\n",
    "print('Number of manually unlabeled articles: {}'.format(int(is_unlabeled.sum())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_next(index):\n",
    "    '''Display one article together with an interactive slider for setting its label.\n",
    "\n",
    "    Prints headline, text and the three per-class estimates of the article whose\n",
    "    'Index' column equals `index`, then renders an IntSlider (range -1..2) whose\n",
    "    value is written back into the 'Label' column of the global data frame `df`.\n",
    "\n",
    "    index: value of the 'Index' column that identifies the article\n",
    "    raises KeyError: if no row of `df` carries that 'Index' value\n",
    "    '''\n",
    "    selected = df['Index'] == index\n",
    "    if not selected.any():\n",
    "        raise KeyError('no article with Index {}'.format(index))\n",
    "    # take the matching row once, so that plain values are printed instead of\n",
    "    # Series reprs (which add the row label and a 'Name: ..., dtype: ...' footer)\n",
    "    article = df.loc[selected].iloc[0]\n",
    "\n",
    "    print('News article no. {}:'.format(index))\n",
    "    print()\n",
    "    print('HEADLINE:')\n",
    "    print(article['Title'])\n",
    "    print()\n",
    "    print('TEXT:')\n",
    "    print(article['Text'])\n",
    "    for column in ('Estimated_0', 'Estimated_1', 'Estimated_2'):\n",
    "        print()\n",
    "        print('{}:'.format(column.upper()))\n",
    "        print(article[column])\n",
    "    \n",
    "    def f(x):\n",
    "        # save user input; `df` is looked up at call time because later cells rebind it\n",
    "        df.loc[df['Index'] == index, 'Label'] = x\n",
    "\n",
    "    # the slider needs a plain int, not a one-element Series; a missing label counts as unlabeled (-1)\n",
    "    current_label = article['Label']\n",
    "    initial_label = -1 if pd.isna(current_label) else int(current_label)\n",
    "\n",
    "    # create slider widget for labels\n",
    "    interact(f, x = widgets.IntSlider(min=-1, max=2, step=1, value=initial_label))\n",
    "    print('0: Other/Unrelated news, 1: Merger,')\n",
    "    print('2: Topics related to deals, investments and mergers')\n",
    "    print('___________________________________________________________________________________________________________')\n",
    "    print()\n",
    "    print()\n",
    "\n",
    "# list of article indices that will be shown next\n",
    "label_next = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## How to find a better model:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A) Multinomial Naive Bayes Algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: does not run yet\n",
    "# NOTE(review): only the first element of `class_probs` is iterated while `indices_estimated`\n",
    "# covers every labeled article - presumably `class_probs[0]` belongs to a single fold; verify\n",
    "# against MultinomialNaiveBayes.make_mnb before relying on the pairing below.\n",
    "\n",
    "# 'Index' values of all manually labeled articles\n",
    "indices_estimated = df.loc[df['Label'] != -1, 'Index'].tolist()\n",
    "\n",
    "# pair the n-th probability row with the n-th labeled article\n",
    "n = 0\n",
    "for row in class_probs[0]:\n",
    "    index = indices_estimated[n]\n",
    "    # store the second probability column of this row as the article's estimate\n",
    "    # NOTE(review): `index` comes from the 'Index' column but is used as a row label here;\n",
    "    # this only addresses the right row if both coincide - TODO confirm\n",
    "    df.loc[index, 'Estimated_2'] = row[1]\n",
    "    n += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# summary of the per-fold scores; the averages divide by the number of scores that were\n",
    "# actually collected, so they stay correct if the number of folds changes\n",
    "print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
    "print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
    "print(\"Recall (Average): {}\".format(sum(recall_scores)/len(recall_scores)))\n",
    "print()\n",
    "print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
    "print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
    "print(\"Precision (Average): {}\".format(sum(precision_scores)/len(precision_scores)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Counts the nine combinations of estimated class (0-2) and true label (0-2) in `testing_data`.\n",
    "# NOTE(review): no cell above this one defines `testing_data`; further down the name is bound to\n",
    "# the dense CountVectorizer matrix, which has neither `.loc` nor an 'Estimated' column. This cell\n",
    "# needs a DataFrame holding both 'Estimated' and 'Label' - TODO confirm where that should come from.\n",
    "# NOTE(review): with IPython's default display setting only the last bare expression of a cell\n",
    "# is shown, so the stand-alone `zero_0`, `zero_1`, ... lines produce no output.\n",
    "print('confusion matrix:')\n",
    "print('###############')\n",
    "# estimated 0, true label 0 / 1 / 2\n",
    "zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n",
    "zero_0\n",
    "zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])\n",
    "zero_1\n",
    "zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])\n",
    "zero_2\n",
    "print('/')\n",
    "# estimated 1, true label 0 / 1 / 2\n",
    "one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])\n",
    "one_0\n",
    "one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])\n",
    "one_1\n",
    "one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])\n",
    "one_2\n",
    "print('/')\n",
    "\n",
    "# estimated 2, true label 0 / 1 / 2\n",
    "two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])\n",
    "two_0\n",
    "two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n",
    "two_1\n",
    "two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n",
    "two_2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building three separate models:\n",
    "\n",
    "B) One model per class: Funktioniert es besser, wenn man 3 Modelle hat?\n",
    "Begründung: Wir sind interessiert an Klasse 1.\n",
    "Pro Klasse 1 Modell bauen (Hier ist das Ziel, das beste Modell zu finden. Dafür nehmen wir 1082 gelabelte Daten.)\n",
    "3 Modelle => Ergebnis für 1 Sample: (70%, 40%, 80%) unklar => überprüfen\n",
    "=> (90%, 90%, 90%) => überprüfen\n",
    "liefert das bessere ambiguity Samples als oben\n",
    "Stratified sample: (50 + 50 (1/2 von der anderen Klasse 1/2 der dritten Klasse))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one frame per manually assigned class (0, 1, 2), each re-indexed from 0\n",
    "labeled_pos_0, labeled_pos_1, labeled_pos_2 = (\n",
    "    df[df['Label'] == label].reset_index(drop=True) for label in (0, 1, 2)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "847"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(labeled_pos_0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(labeled_pos_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "185"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(labeled_pos_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add three new columns for the three models, initialize with nans\n",
    "df['Estimated_0'] = np.nan\n",
    "df['Estimated_1'] = np.nan\n",
    "df['Estimated_2'] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw the per-model samples: sampling_class<model>_<class> holds the rows of <class>\n",
    "# that are used for the binary model of <model>. All draws use random_state=5.\n",
    "\n",
    "# model for class 0\n",
    "sampling_class0_0 = labeled_pos_0.sample(n=100, random_state=5)\n",
    "sampling_class0_1 = labeled_pos_1.sample(n=50, random_state=5)\n",
    "sampling_class0_2 = labeled_pos_2.sample(n=50, random_state=5)\n",
    "\n",
    "# model for class 1: same class-1 draw as above, but as an independent copy, so that\n",
    "# overwriting 'Label' on one model's frame cannot silently change another model's frame\n",
    "sampling_class1_0 = labeled_pos_0.sample(n=25, random_state=5)\n",
    "sampling_class1_1 = sampling_class0_1.copy()\n",
    "sampling_class1_2 = labeled_pos_2.sample(n=25, random_state=5)\n",
    "\n",
    "# model for class 2: again an independent copy of the class-1 draw\n",
    "sampling_class2_0 = labeled_pos_0.sample(n=50, random_state=5)\n",
    "sampling_class2_1 = sampling_class0_1.copy()\n",
    "sampling_class2_2 = labeled_pos_2.sample(n=100, random_state=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "sampling_class0_complement = pd.concat([sampling_class0_1, sampling_class0_2])\n",
    "sampling_class1_complement = pd.concat([sampling_class1_0, sampling_class1_2])\n",
    "sampling_class2_complement = pd.concat([sampling_class2_0, sampling_class2_1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# prepare for binary classification:\n",
    "# pos_label = 3\n",
    "sampling_class0_0['Label'] = 3\n",
    "sampling_class1_1['Label'] = 3\n",
    "sampling_class2_2['Label'] = 3\n",
    "# neg_label = 4\n",
    "sampling_class0_complement['Label'] = 4\n",
    "sampling_class1_complement['Label'] = 4\n",
    "sampling_class2_complement['Label'] = 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "sampling_class0 = pd.concat([sampling_class0_0, sampling_class0_complement]).reset_index(drop=True)\n",
    "sampling_class1 = pd.concat([sampling_class1_1, sampling_class1_complement]).reset_index(drop=True)\n",
    "sampling_class2 = pd.concat([sampling_class2_2, sampling_class2_complement]).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Apply Naive Bayes Model to estimate all labeled articles (1082 samples):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "200"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data = sampling_class2\n",
    "indices_train = train_data['Index'].tolist()\n",
    "len(indices_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "882"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_train))].reset_index(drop=True)\n",
    "len(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recode the held-out labels with the same binary coding as the training frame:\n",
    "# `train_data` is `sampling_class2`, where class 2 carries the positive label 3 and the\n",
    "# other classes the negative label 4, so the test labels must mark class 2 as positive too.\n",
    "test_data.loc[(test_data['Label'] == 2), 'Label'] = 3\n",
    "test_data.loc[(test_data['Label'] == 0) | (test_data['Label'] == 1), 'Label'] = 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# split training data into text and label set\n",
    "# join title and text\n",
    "X = train_data['Title'] + '. ' + train_data['Text']\n",
    "y = train_data['Label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# split testing data into text and label set\n",
    "U = test_data['Title'] + '. ' + test_data['Text']\n",
    "v = test_data['Label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "recall: 0.19949811794228356\n",
      "precision: 0.803030303030303\n"
     ]
    }
   ],
   "source": [
    "# bag-of-words features: the vocabulary is learned on the training texts only,\n",
    "# the test texts are merely transformed with it\n",
    "cv = CountVectorizer()\n",
    "training_data = cv.fit_transform(X, y).toarray()\n",
    "testing_data = cv.transform(U).toarray()\n",
    "\n",
    "# fit the classifier on the training matrix\n",
    "classifier = GaussianNB()\n",
    "classifier.fit(training_data, y)\n",
    "\n",
    "# predicted class and per-class probabilities for every test article\n",
    "predictions_test = classifier.predict(testing_data)\n",
    "class_probs = classifier.predict_proba(testing_data)\n",
    "\n",
    "# report recall and precision with respect to the positive label 3\n",
    "rec = recall_score(v, predictions_test, pos_label=3)\n",
    "print('recall: {}'.format(rec))\n",
    "prec = precision_score(v, predictions_test, pos_label=3)\n",
    "print('precision: {}'.format(prec))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class_probs[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'Index' values of the articles that were just scored, in the row order of `class_probs`\n",
    "indices_estimated_2 = test_data['Index'].tolist()\n",
    "\n",
    "# store the first probability column of every scored article as its estimate;\n",
    "# rows are addressed through the 'Index' column rather than through the frame's row\n",
    "# labels, so the assignment can neither hit a wrong row nor append new rows to df\n",
    "for index, row in zip(indices_estimated_2, class_probs):\n",
    "    df.loc[df['Index'] == index, 'Estimated_2'] = row[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Apply Naive Bayes Model (10-fold-cross validation):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 10-fold stratified cross-validation of the binary class-0 model\n",
    "dataset = sampling_class0\n",
    "\n",
    "# join title and text\n",
    "X = dataset['Title'] + '. ' + dataset['Text']\n",
    "y = dataset['Label']\n",
    "\n",
    "cv = CountVectorizer()\n",
    "\n",
    "# use stratified k-fold cross-validation as split method\n",
    "skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)\n",
    "\n",
    "classifier = GaussianNB()\n",
    "\n",
    "# metrics (one entry per fold)\n",
    "recall_scores = []\n",
    "precision_scores = []\n",
    "\n",
    "# class priors of the fitted classifier (one entry per fold)\n",
    "# NOTE(review): these are priors per fold, not probabilities per article\n",
    "class_probs = []\n",
    "# counts number of training samples observed in each class (currently never filled)\n",
    "class_counts = []\n",
    "\n",
    "# for each fold\n",
    "for train, test in skf.split(X,y):\n",
    "    # the split yields positions, so select by position instead of by index label\n",
    "    X_train, X_test = X.iloc[train], X.iloc[test]\n",
    "    y_train, y_test = y.iloc[train], y.iloc[test]\n",
    "\n",
    "    # fit the training data and then return the matrix\n",
    "    training_data = cv.fit_transform(X_train, y_train).toarray()\n",
    "    # transform testing data and return the matrix\n",
    "    testing_data = cv.transform(X_test).toarray()\n",
    "\n",
    "    #fit classifier\n",
    "    classifier.fit(training_data, y_train)\n",
    "    #predict class\n",
    "    predictions_train = classifier.predict(training_data)\n",
    "    predictions_test = classifier.predict(testing_data)\n",
    "\n",
    "    # store metrics; the binary labels are 3 (positive) and 4 (negative), so the positive\n",
    "    # label has to be named explicitly - the default pos_label=1 is not among the labels\n",
    "    rec = recall_score(y_test, predictions_test, pos_label=3)\n",
    "    recall_scores.append(rec)\n",
    "    prec = precision_score(y_test, predictions_test, pos_label=3)\n",
    "    precision_scores.append(prec)\n",
    "\n",
    "    class_probs.append(classifier.class_prior_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'Index' values of the articles in the class-0 sample\n",
    "indices_estimated_0 = sampling_class0['Index'].tolist()\n",
    "\n",
    "# store the second column of every `class_probs` row as the estimate of the article at the\n",
    "# same position; rows are addressed through the 'Index' column rather than through the\n",
    "# frame's row labels, so the assignment can neither hit a wrong row nor append rows to df\n",
    "# NOTE(review): the cross-validation cell appends one `class_probs` entry per fold, so only\n",
    "# as many articles as there are folds get a value here - confirm that this is intended\n",
    "for index, row in zip(indices_estimated_0, class_probs):\n",
    "    df.loc[df['Index'] == index, 'Estimated_0'] = row[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# summary of the per-fold scores; the averages divide by the number of scores that were\n",
    "# actually collected, so they stay correct if the number of folds changes\n",
    "print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
    "print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
    "print(\"Recall (Average): {}\".format(sum(recall_scores)/len(recall_scores)))\n",
    "print()\n",
    "print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
    "print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
    "print(\"Precision (Average): {}\".format(sum(precision_scores)/len(precision_scores)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of used samples:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# distinct article indices touched by the three per-class models\n",
    "# NOTE(review): `indices_estimated_1` is not assigned in any visible cell of this notebook, so\n",
    "# this line raises a NameError on a fresh kernel until the class-1 model has been run - TODO\n",
    "indices_all_samples = set((indices_estimated_0 + indices_estimated_1) + indices_estimated_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(indices_all_samples)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check if there are samples where more than one class was marked with 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# number of labeled articles whose three estimates add up to more than 1\n",
    "# NOTE(review): a row with NaN in any of the three estimate columns has a NaN sum and is never\n",
    "# counted, so this stays 0 until all three columns are filled for the same rows\n",
    "len(df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'Index' values of the labeled articles whose three estimates add up to more than 1;\n",
    "# these are the articles handed to show_next() for manual review further down\n",
    "# NOTE(review): a row with NaN in any of the three estimate columns has a NaN sum and is never selected\n",
    "indices = df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0), 'Index'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save tri-model to csv \n",
    "df.to_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n",
    "      sep='|',\n",
    "      mode='w',\n",
    "      encoding='utf-8',\n",
    "      quoting=csv.QUOTE_NONNUMERIC,\n",
    "      quotechar='\\'')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read current data set from csv\n",
    "df = pd.read_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n",
    "          sep='|',\n",
    "          usecols=range(1,16), # drop first column 'unnamed'\n",
    "          encoding='utf-8',\n",
    "          quoting=csv.QUOTE_NONNUMERIC,\n",
    "          quotechar='\\'')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for index in indices:\n",
    "    show_next(index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}