{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Model Evaluation" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import csv\n", "import operator\n", "import pickle\n", "import random\n", "\n", "from ipywidgets import interact, interactive, fixed, interact_manual\n", "import ipywidgets as widgets\n", "from IPython.core.interactiveshell import InteractiveShell\n", "from IPython.display import display\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.naive_bayes import MultinomialNB\n", "\n", "from MNBInteractive import MNBInteractive\n", "from MultinomialNaiveBayes import MultinomialNaiveBayes\n", "from NaiveBayes import NaiveBayes" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# initialize random => reproducible sequence\n", "random.seed(5)\n", "\n", "# set up wider display area\n", "pd.set_option('display.max_colwidth', -1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Last round number: 11\n", "Number of manually labeled articles: 1082\n", "Number of manually unlabeled articles: 8918\n" ] } ], "source": [ "# read current data set from csv\n", "df = pd.read_csv('../data/interactive_labeling_round_11.csv',\n", " sep='|',\n", " usecols=range(1,13), # drop first column 'unnamed'\n", " encoding='utf-8',\n", " quoting=csv.QUOTE_NONNUMERIC,\n", " quotechar='\\'')\n", "\n", "# find current iteration/round number\n", "m = int(df['Round'].max())\n", "print('Last round number: {}'.format(m))\n", "print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n", "print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def show_next(index):\n", " ''' this method displays an article's text and an interactive slider to set its label manually\n", " '''\n", " print('News article no. {}:'.format(index))\n", " print()\n", " print('HEADLINE:')\n", " print(df.loc[df['Index'] == index, 'Title'])\n", " print()\n", " print('TEXT:')\n", " print(df.loc[df['Index'] == index, 'Text'])\n", " print()\n", " print('ESTIMATED_0:')\n", " print(df.loc[df['Index'] == index, 'Estimated_0'])\n", " print()\n", " print('ESTIMATED_1:')\n", " print(df.loc[df['Index'] == index, 'Estimated_1'])\n", " print()\n", " print('ESTIMATED_2:')\n", " print(df.loc[df['Index'] == index, 'Estimated_2'])\n", " \n", " def f(x):\n", " # save user input\n", " df.loc[df['Index'] == index, 'Label'] = x\n", "\n", " # create slider widget for labels\n", " interact(f, x = widgets.IntSlider(min=-1, max=2, step=1, value=df.loc[df['Index'] == index, 'Label']))\n", " print('0: Other/Unrelated news, 1: Merger,')\n", " print('2: Topics related to deals, investments and mergers')\n", " print('___________________________________________________________________________________________________________')\n", " print()\n", " print()\n", "\n", "# list of article indices that will be shown next\n", "label_next = []" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How to find a better model:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A) Multinomial Naive Bayes Algorithm" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# toDo: läuft noch nicht\n", "\n", "# series of indices of recently estimated articles \n", "indices_estimated = df.loc[df['Label'] != -1, 'Index'].tolist()\n", "\n", "# annotate probability\n", "n = 0\n", "for row in class_probs[0]:\n", " index = indices_estimated[n]\n", " # save estimated label\n", " df.loc[index, 'Estimated_2'] = row[1]\n", " n += 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Recall (Min): {}\".format(min(recall_scores)))\n", "print(\"Recall (Max): {}\".format(max(recall_scores)))\n", "print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n", "print()\n", "print(\"Precision (Min): {}\".format(min(precision_scores)))\n", "print(\"Precision (Max): {}\".format(max(precision_scores)))\n", "print(\"Precision (Average): {}\".format(sum(precision_scores)/10))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print('confusion matrix:')\n", "print('###############')\n", "zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n", "zero_0\n", "zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])\n", "zero_1\n", "zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])\n", "zero_2\n", "print('/')\n", "one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])\n", "one_0\n", "one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])\n", "one_1\n", "one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])\n", "one_2\n", "print('/')\n", "\n", "two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])\n", "two_0\n", "two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n", "two_1\n", "two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n", "two_2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Building three separate models:\n", "\n", "B) One model per class: Funktioniert es besser wenn man 3 Modelle hat.\n", "Begründung: wir sind interessiert an Klasse 1\n", "Pro Klasse 1 Modell bauen (Hier ist das ziel das beste Modell zu finden. Dafür nehmen wir 1082 gelabelte Daten.)\n", "3 Modelle => Ergebnis für 1 Sample: (70%, 40%, 80%) unklar => überprüfen\n", "=> (90%, 90%, 90%) => überprüfen\n", "liefert das bessere ambiguity Samples als oben\n", "Stratified sample: (50 + 50 (1/2 von der anderen Klasse 1/2 der dritten Klasse))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n", "labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n", "labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "847" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(labeled_pos_0)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "50" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(labeled_pos_1)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "185" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(labeled_pos_2)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# add three new columns for the three models, initialize with nans\n", "df['Estimated_0'] = np.nan\n", "df['Estimated_1'] = np.nan\n", "df['Estimated_2'] = np.nan" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "sampling_class0_0 = labeled_pos_0.sample(n=100, random_state=5)\n", "sampling_class0_1 = labeled_pos_1.sample(n=50, random_state=5)\n", "sampling_class0_2 = labeled_pos_2.sample(n=50, random_state=5)\n", "\n", "sampling_class1_0 = labeled_pos_0.sample(n=25, random_state=5)\n", "sampling_class1_1 = sampling_class0_1\n", "sampling_class1_2 = labeled_pos_2.sample(n=25, random_state=5)\n", "\n", "sampling_class2_0 = labeled_pos_0.sample(n=50, random_state=5)\n", "sampling_class2_1 = sampling_class0_1\n", "sampling_class2_2 = labeled_pos_2.sample(n=100, random_state=5)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "sampling_class0_complement = pd.concat([sampling_class0_1, sampling_class0_2])\n", "sampling_class1_complement = pd.concat([sampling_class1_0, sampling_class1_2])\n", "sampling_class2_complement = pd.concat([sampling_class2_0, sampling_class2_1])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# prepare for binary classification:\n", "# pos_label = 3\n", "sampling_class0_0['Label'] = 3\n", "sampling_class1_1['Label'] = 3\n", "sampling_class2_2['Label'] = 3\n", "# neg_label = 4\n", "sampling_class0_complement['Label'] = 4\n", "sampling_class1_complement['Label'] = 4\n", "sampling_class2_complement['Label'] = 4" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "sampling_class0 = pd.concat([sampling_class0_0, sampling_class0_complement]).reset_index(drop=True)\n", "sampling_class1 = pd.concat([sampling_class1_1, sampling_class1_complement]).reset_index(drop=True)\n", "sampling_class2 = pd.concat([sampling_class2_2, sampling_class2_complement]).reset_index(drop=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Apply Naive Bayes Model to estimate all labeled articles (1082 samples):" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "200" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_data = sampling_class2\n", "indices_train = train_data['Index'].tolist()\n", "len(indices_train)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "882" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_train))].reset_index(drop=True)\n", "len(test_data)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "test_data.loc[(test_data['Label'] == 0), 'Label'] = 3\n", "test_data.loc[(test_data['Label'] == 1) | (test_data['Label'] == 2), 'Label'] = 4" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# split training data into text and label set\n", "# join title and text\n", "X = train_data['Title'] + '. ' + train_data['Text']\n", "y = train_data['Label']" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# split testing data into text and label set\n", "U = test_data['Title'] + '. ' + test_data['Text']\n", "v = test_data['Label']" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "recall: 0.19949811794228356\n", "precision: 0.803030303030303\n" ] } ], "source": [ "classifier = GaussianNB()\n", "\n", "cv = CountVectorizer()\n", "\n", "# probabilities of each class\n", "class_probs = []\n", "\n", "# use sklearn CountVectorizer\n", "# fit the training data and then return the matrix\n", "training_data = cv.fit_transform(X, y).toarray()\n", "# transform testing data and return the matrix\n", "testing_data = cv.transform(U).toarray()\n", "\n", "#fit classifier\n", "classifier.fit(training_data, y)\n", "\n", "#predict class\n", "predictions_test = classifier.predict(testing_data)\n", "\n", "class_probs = classifier.predict_proba(testing_data)\n", "\n", "#print and store metrics\n", "rec = recall_score(v, predictions_test, pos_label=3)\n", "print('recall: ' + str(rec))\n", "prec = precision_score(v, predictions_test, pos_label=3)\n", "print('precision: ' + str(prec))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class_probs[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# series of indices of recently estimated articles \n", "indices_estimated_2 = test_data['Index'].tolist()\n", "\n", "# annotate probability\n", "n = 0\n", "for row in class_probs:\n", " index = indices_estimated_2[n]\n", " # save estimated label\n", " df.loc[index, 'Estimated_2'] = row[0]\n", " n += 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Apply Naive Bayes Model (10-fold-cross validation):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset = sampling_class0\n", "\n", "X = dataset['Title'] + '. ' + dataset['Text']\n", "y = dataset['Label']\n", "\n", "cv = CountVectorizer()\n", "\n", "# use stratified k-fold cross-validation as split method\n", "skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)\n", "\n", "classifier = GaussianNB()\n", "\n", "# metrics\n", "recall_scores = []\n", "precision_scores = []\n", "\n", "# probabilities of each class (of each fold)\n", "class_probs = []\n", "# counts number of training samples observed in each class \n", "class_counts = []\n", "\n", "# for each fold\n", "for train, test in skf.split(X,y):\n", " \n", " # fit the training data and then return the matrix\n", " training_data = cv.fit_transform(X[train], y[train]).toarray()\n", " # transform testing data and return the matrix\n", " testing_data = cv.transform(X[test]).toarray()\n", "\n", " #fit classifier\n", " classifier.fit(training_data, y[train])\n", " #predict class\n", " predictions_train = classifier.predict(training_data)\n", " predictions_test = classifier.predict(testing_data)\n", "\n", " #print and store metrics\n", " rec = recall_score(y[test], predictions_test)\n", " recall_scores.append(rec)\n", " prec = precision_score(y[test], predictions_test)\n", " precision_scores.append(prec)\n", "\n", " class_probs.append(classifier.class_prior_)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# series of indices of recently estimated articles \n", "indices_estimated_0 = sampling_class0['Index'].tolist()\n", "\n", "# annotate probability\n", "n = 0\n", "for row in class_probs:\n", " index = indices_estimated_0[n]\n", " # save estimated label\n", " df.loc[index, 'Estimated_0'] = row[1]\n", " n += 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Recall (Min): {}\".format(min(recall_scores)))\n", "print(\"Recall (Max): {}\".format(max(recall_scores)))\n", "print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n", "print()\n", "print(\"Precision (Min): {}\".format(min(precision_scores)))\n", "print(\"Precision (Max): {}\".format(max(precision_scores)))\n", "print(\"Precision (Average): {}\".format(sum(precision_scores)/10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Number of used samples:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "indices_all_samples = set((indices_estimated_0 + indices_estimated_1) + indices_estimated_2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(indices_all_samples)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check if there are samples where more than one class was marked with 1." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0)])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "indices = df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0), 'Index'].tolist()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# save tri-model to csv \n", "df.to_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n", " sep='|',\n", " mode='w',\n", " encoding='utf-8',\n", " quoting=csv.QUOTE_NONNUMERIC,\n", " quotechar='\\'')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# read current data set from csv\n", "df = pd.read_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n", " sep='|',\n", " usecols=range(1,16), # drop first column 'unnamed'\n", " encoding='utf-8',\n", " quoting=csv.QUOTE_NONNUMERIC,\n", " quotechar='\\'')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for index in indices:\n", " show_next(index)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }