thesis-anne/src/working notebooks/2019-05-04-al-three-model-a...

714 lines
26 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Last round number: 17\n",
"Number of manually labeled articles: 1412\n",
"Number of manually unlabeled articles: 8588\n"
]
}
],
"source": [
"import csv\n",
"import operator\n",
"import pickle\n",
"import random\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
"# seed Python's RNG; random_state is the seed handed to pandas' sample() further down\n",
"random_state = 5\n",
"random.seed(random_state)\n",
"\n",
"# set up wider display area for text columns\n",
"pd.set_option('display.max_colwidth', -1)\n",
"\n",
"# read current data set from csv (column 0, the unnamed one, is not loaded)\n",
"df = pd.read_csv(\n",
"    '../../data/interactive_labeling_round_17_20190502.csv',\n",
"    sep='|',\n",
"    usecols=range(1, 13),\n",
"    encoding='utf-8',\n",
"    quoting=csv.QUOTE_NONNUMERIC,\n",
"    quotechar='\\'',\n",
")\n",
"\n",
"# find current iteration/round number and count labeled / unlabeled rows (Label -1 = unlabeled)\n",
"m = int(df['Round'].max())\n",
"is_unlabeled = df['Label'] == -1\n",
"print('Last round number: {}'.format(m))\n",
"print('Number of manually labeled articles: {}'.format(len(df.loc[~is_unlabeled])))\n",
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[is_unlabeled])))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Rewind the labeling state: every article that was labeled in round m or later\n",
"# is treated as unlabeled again (Label -1, no round number).\n",
"m = 10\n",
"# evaluated once, before the 'Round' column is overwritten below\n",
"labeled_in_later_rounds = df['Round'] >= m\n",
"df.loc[labeled_in_later_rounds, 'Label'] = -1\n",
"df.loc[labeled_in_later_rounds, 'Round'] = np.nan\n",
"\n",
"# one pool per class, holding the articles that still carry a label\n",
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"737\n",
"35\n",
"128\n",
"655\n",
"31\n",
"114\n",
"573\n",
"27\n",
"100\n",
"491\n",
"23\n",
"86\n",
"409\n",
"19\n",
"72\n",
"327\n",
"15\n",
"58\n",
"245\n",
"11\n",
"44\n",
"163\n",
"7\n",
"30\n",
"81\n",
"3\n",
"16\n",
"0\n",
"0\n",
"0\n"
]
}
],
"source": [
"def draw_fold(pool, n):\n",
"    \"\"\"Split one test fold off a class pool.\n",
"\n",
"    Draws n rows from pool without replacement (seeded with random_state) and\n",
"    returns (fold, rest), where rest is pool without the rows whose 'Index'\n",
"    value occurs in the fold.\n",
"    \"\"\"\n",
"    fold = pool.sample(n=n, replace=False, random_state=random_state)\n",
"    rest = pool.loc[~pool['Index'].isin(fold['Index'].tolist())]\n",
"    return fold, rest\n",
"\n",
"\n",
"# rows drawn per fold from class 0 / class 1 / class 2: nine equal folds and a\n",
"# last fold of 81 / 3 / 16 rows (the sizes are hard-coded for this data snapshot)\n",
"fold_sizes = [(82, 4, 14)] * 9 + [(81, 3, 16)]\n",
"\n",
"folds = []\n",
"for n_class0, n_class1, n_class2 in fold_sizes:\n",
"    fold_class0, labeled_pos_0 = draw_fold(labeled_pos_0, n_class0)\n",
"    fold_class1, labeled_pos_1 = draw_fold(labeled_pos_1, n_class1)\n",
"    fold_class2, labeled_pos_2 = draw_fold(labeled_pos_2, n_class2)\n",
"    folds.append((fold_class0, fold_class1, fold_class2))\n",
"    # rows left in each class pool after this fold has been removed\n",
"    print(len(labeled_pos_0))\n",
"    print(len(labeled_pos_1))\n",
"    print(len(labeled_pos_2))\n",
"\n",
"# keep the per-fold names that the fold-selection cell refers to\n",
"sampling_0_class0, sampling_0_class1, sampling_0_class2 = folds[0]\n",
"sampling_1_class0, sampling_1_class1, sampling_1_class2 = folds[1]\n",
"sampling_2_class0, sampling_2_class1, sampling_2_class2 = folds[2]\n",
"sampling_3_class0, sampling_3_class1, sampling_3_class2 = folds[3]\n",
"sampling_4_class0, sampling_4_class1, sampling_4_class2 = folds[4]\n",
"sampling_5_class0, sampling_5_class1, sampling_5_class2 = folds[5]\n",
"sampling_6_class0, sampling_6_class1, sampling_6_class2 = folds[6]\n",
"sampling_7_class0, sampling_7_class1, sampling_7_class2 = folds[7]\n",
"sampling_8_class0, sampling_8_class1, sampling_8_class2 = folds[8]\n",
"sampling_9_class0, sampling_9_class1, sampling_9_class2 = folds[9]"
]
},
{
"cell_type": "code",
"execution_count": 238,
"metadata": {},
"outputs": [],
"source": [
"# TESTING DATA\n",
"# Exactly one fold serves as test set. To evaluate another fold k (0..8), put\n",
"# sampling_<k>_class0, sampling_<k>_class1 and sampling_<k>_class2 into the list.\n",
"test_fold_parts = [sampling_9_class0, sampling_9_class1, sampling_9_class2]\n",
"testing_data = pd.concat(test_fold_parts)"
]
},
{
"cell_type": "code",
"execution_count": 239,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 239,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 'Index' values of the test rows, in the row order of testing_data\n",
"indices_testing_data = testing_data['Index'].tolist()\n",
"testing_data.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 240,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"900"
]
},
"execution_count": 240,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# TRAINING DATA: every row that carries a label (Label != -1) and is not in the test fold\n",
"is_labeled = df['Label'] != -1\n",
"in_test_fold = df['Index'].isin(indices_testing_data)\n",
"training_data = df.loc[is_labeled & ~in_test_fold].reset_index(drop=True)\n",
"indices_training_data = training_data['Index'].tolist()\n",
"len(training_data)"
]
},
{
"cell_type": "code",
"execution_count": 241,
"metadata": {},
"outputs": [],
"source": [
"# Model 2: one-vs-rest task for class 2 (class 2 -> label 1, classes 0 and 1 -> label 0)\n",
"labeled_pos_0 = training_data[training_data['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = training_data[training_data['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = training_data[training_data['Label'] == 2].reset_index(drop=True)\n",
"\n",
"# 18 + 18 rows for the rest label and 35 rows for the target class\n",
"sampling_class0 = labeled_pos_0.sample(n=18, random_state=random_state)\n",
"sampling_class1 = labeled_pos_1.sample(n=18, random_state=random_state)\n",
"sampling_class2 = labeled_pos_2.sample(n=35, random_state=random_state)\n",
"sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 0\n",
"sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 0\n",
"sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 1\n",
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"\n",
"# the test fold gets the same relabeling, applied one class after the other\n",
"for original_label, binary_label in ((0, 0), (1, 0), (2, 1)):\n",
"    testing_data.loc[testing_data['Label'] == original_label, 'Label'] = binary_label\n",
"classifier = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [],
"source": [
"# Model 1: one-vs-rest task for class 1 (class 1 -> label 1, classes 0 and 2 -> label 0)\n",
"labeled_pos_0 = training_data[training_data['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = training_data[training_data['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = training_data[training_data['Label'] == 2].reset_index(drop=True)\n",
"\n",
"# 18 + 18 rows for the rest label and 35 rows for the target class\n",
"sampling_class0 = labeled_pos_0.sample(n=18, random_state=random_state)\n",
"sampling_class1 = labeled_pos_1.sample(n=35, random_state=random_state)\n",
"sampling_class2 = labeled_pos_2.sample(n=18, random_state=random_state)\n",
"sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 0\n",
"sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 1\n",
"sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 0\n",
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"\n",
"# the test fold gets the same relabeling, applied one class after the other\n",
"for original_label, binary_label in ((0, 0), (1, 1), (2, 0)):\n",
"    testing_data.loc[testing_data['Label'] == original_label, 'Label'] = binary_label\n",
"classifier = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"# Model 0: one-vs-rest task for class 0 (class 0 -> label 1, classes 1 and 2 -> label 0)\n",
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n",
"\n",
"# 35 rows of the target class and 18 + 18 rows for the rest label\n",
"sampling_class0 = labeled_pos_0.sample(n=35, random_state=random_state)\n",
"sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 1\n",
"sampling_class1 = labeled_pos_1.sample(n=18, random_state=random_state)\n",
"sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 0\n",
"sampling_class2 = labeled_pos_2.sample(n=18, random_state=random_state)\n",
"sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 0\n",
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"\n",
"# Relabel the test fold from a snapshot of the original classes. Relabeling in\n",
"# sequence on the live column would first turn class 0 into 1 and then turn\n",
"# those same rows back into 0, leaving the test fold without any positive row.\n",
"original_test_labels = testing_data['Label'].values.copy()\n",
"testing_data.loc[original_test_labels == 0, 'Label'] = 1\n",
"testing_data.loc[original_test_labels == 1, 'Label'] = 0\n",
"testing_data.loc[original_test_labels == 2, 'Label'] = 0\n",
"classifier = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"737\n",
"36\n",
"126\n"
]
}
],
"source": [
"# MNB: three-class model trained on a balanced sample of 24 rows per class\n",
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n",
"\n",
"# size of each class pool before sampling\n",
"print(len(labeled_pos_0))\n",
"print(len(labeled_pos_1))\n",
"print(len(labeled_pos_2))\n",
"\n",
"sampling_class0 = labeled_pos_0.sample(n=24, random_state=random_state)\n",
"sampling_class1 = labeled_pos_1.sample(n=24, random_state=random_state)\n",
"sampling_class2 = labeled_pos_2.sample(n=24, random_state=random_state)\n",
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"indices_training_data = training_data['Index'].tolist()\n",
"classifier = MultinomialNB()"
]
},
{
"cell_type": "code",
"execution_count": 242,
"metadata": {},
"outputs": [],
"source": [
"# training input: title and text joined into one string per article, plus the labels\n",
"X = training_data['Title'] + '. ' + training_data['Text']\n",
"y = training_data['Label']\n",
"\n",
"# test input, built the same way\n",
"U = testing_data['Title'] + '. ' + testing_data['Text']\n",
"v = testing_data['Label']\n",
"\n",
"# Token counts: the vectorizer is fitted on the training texts and then applied\n",
"# to the test texts. The count matrices get names of their own, so that\n",
"# training_data / testing_data stay DataFrames and this cell can be re-run.\n",
"cv = CountVectorizer()\n",
"training_counts = cv.fit_transform(X, y).toarray()\n",
"testing_counts = cv.transform(U).toarray()\n",
"\n",
"# fit classifier and predict the class of every test row\n",
"classifier.fit(training_counts, y)\n",
"predictions_test = classifier.predict(testing_counts)\n",
"\n",
"# annotate estimated labels on the matching rows of df (NaN for all other rows)\n",
"estimated_by_index = dict(zip(indices_testing_data, predictions_test))\n",
"df['Estimated'] = df['Index'].map(estimated_by_index)"
]
},
{
"cell_type": "code",
"execution_count": 243,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n",
"69\n",
"1\n",
"###############\n",
"12\n",
"2\n",
"###############\n",
"metrics:\n",
"\n",
"69\n",
"2\n",
"1\n",
"12\n",
"###############\n",
"2\n",
"69\n",
"12\n",
"1\n",
"###############\n",
"98.57142857142858\n",
"85.18518518518519\n",
"84.52380952380952\n",
"###############\n",
"14.285714285714285\n",
"66.66666666666666\n",
"84.52380952380952\n",
"###############\n",
"56.42857142857143\n",
"75.92592592592592\n",
"84.52380952380952\n"
]
}
],
"source": [
"# Model 0-2: evaluation of a binary (one-vs-rest) model.\n",
"# The predictions are 0 / 1, so they are compared with the binarized test labels v.\n",
"# df['Label'] still holds the original classes 0 / 1 / 2 and would drop every\n",
"# class-2 row from the counts below.\n",
"true_labels = v.values\n",
"\n",
"\n",
"def count_pairs(estimated, actual):\n",
"    \"\"\"Count the test rows predicted as estimated whose binary label is actual.\"\"\"\n",
"    return int(((predictions_test == estimated) & (true_labels == actual)).sum())\n",
"\n",
"\n",
"def percent(part, whole):\n",
"    \"\"\"Return part / whole in percent, or NaN when whole is 0.\"\"\"\n",
"    return part / whole * 100 if whole else float('nan')\n",
"\n",
"\n",
"# confusion counts, named <estimated>_<actual>\n",
"print('###############')\n",
"zero_0 = count_pairs(0, 0)\n",
"print(zero_0)\n",
"zero_1 = count_pairs(0, 1)\n",
"print(zero_1)\n",
"print('###############')\n",
"one_0 = count_pairs(1, 0)\n",
"print(one_0)\n",
"one_1 = count_pairs(1, 1)\n",
"print(one_1)\n",
"print('###############')\n",
"\n",
"print('metrics:')\n",
"print()\n",
"\n",
"total = zero_0 + zero_1 + one_0 + one_1\n",
"\n",
"# label 0 taken as the positive class\n",
"tp_0 = zero_0\n",
"print(tp_0)\n",
"tn_0 = one_1\n",
"print(tn_0)\n",
"fp_0 = zero_1\n",
"print(fp_0)\n",
"fn_0 = one_0\n",
"print(fn_0)\n",
"print('###############')\n",
"\n",
"# label 1 taken as the positive class\n",
"tp_1 = one_1\n",
"print(tp_1)\n",
"tn_1 = zero_0\n",
"print(tn_1)\n",
"fp_1 = one_0\n",
"print(fp_1)\n",
"fn_1 = zero_1\n",
"print(fn_1)\n",
"print('###############')\n",
"\n",
"# precision, recall and accuracy in percent, first for label 0 ...\n",
"prec_0 = percent(tp_0, tp_0 + fp_0)\n",
"print(prec_0)\n",
"rec_0 = percent(tp_0, tp_0 + fn_0)\n",
"print(rec_0)\n",
"acc_0 = percent(tp_0 + tn_0, total)\n",
"print(acc_0)\n",
"print('###############')\n",
"\n",
"# ... then for label 1\n",
"prec_1 = percent(tp_1, tp_1 + fp_1)\n",
"print(prec_1)\n",
"rec_1 = percent(tp_1, tp_1 + fn_1)\n",
"print(rec_1)\n",
"acc_1 = percent(tp_1 + tn_1, total)\n",
"print(acc_1)\n",
"print('###############')\n",
"\n",
"# unweighted mean over both labels\n",
"print((prec_1 + prec_0) / 2)\n",
"print((rec_1 + rec_0) / 2)\n",
"print((acc_1 + acc_0) / 2)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"confusion matrix:\n",
"###############\n",
"62\n",
"0\n",
"0\n",
"/\n",
"12\n",
"3\n",
"11\n",
"/\n",
"8\n",
"0\n",
"5\n",
"###############\n",
"\n",
"class 0:\n",
"\n",
"TP: 62\n",
"TN: 19\n",
"FP: 0\n",
"FN: 20\n",
"\n",
"class 1:\n",
"\n",
"TP: 3\n",
"TN: 75\n",
"FP: 23\n",
"FN: 0\n",
"\n",
"class 2:\n",
"\n",
"TP: 5\n",
"TN: 77\n",
"FP: 8\n",
"FN: 11\n",
"###############\n",
"\n",
"METRICS:\n",
"\n",
"class 0:\n",
"\n",
"precision: 100.0\n",
"recall: 75.61\n",
"accuracy: 80.2\n",
"\n",
"class 1:\n",
"\n",
"precision: 11.54\n",
"recall: 100.0\n",
"accuracy: 77.23\n",
"\n",
"class 2:\n",
"\n",
"precision: 38.46\n",
"recall: 31.25\n",
"accuracy: 81.19\n",
"\n",
"Average Metrics:\n",
"\n",
"precision: 50.0\n",
"recall: 68.95325203252033\n",
"accuracy: 79.53795379537955\n"
]
}
],
"source": [
"# MNB: three-class evaluation of df['Estimated'] against df['Label']\n",
"classes = (0, 1, 2)\n",
"\n",
"# counts[estimated, actual] = number of rows of df with that prediction / label pair\n",
"counts = {}\n",
"print('confusion matrix:')\n",
"print('###############')\n",
"for estimated in classes:\n",
"    if estimated != 0:\n",
"        print('/')\n",
"    for actual in classes:\n",
"        hits = df.loc[(df['Estimated'] == estimated) & (df['Label'] == actual)]\n",
"        counts[estimated, actual] = len(hits)\n",
"        print(counts[estimated, actual])\n",
"\n",
"print('###############')\n",
"print()\n",
"total = sum(counts.values())\n",
"\n",
"# per class: true / false positives and negatives, with that class as the positive one\n",
"tp, tn, fp, fn = {}, {}, {}, {}\n",
"for c in classes:\n",
"    tp[c] = counts[c, c]\n",
"    tn[c] = sum(n for (estimated, actual), n in counts.items() if estimated != c and actual != c)\n",
"    fp[c] = sum(n for (estimated, actual), n in counts.items() if estimated == c and actual != c)\n",
"    fn[c] = sum(n for (estimated, actual), n in counts.items() if estimated != c and actual == c)\n",
"    if c != 0:\n",
"        print()\n",
"    print('class {}:'.format(c))\n",
"    print()\n",
"    print('TP: {}'.format(tp[c]))\n",
"    print('TN: {}'.format(tn[c]))\n",
"    print('FP: {}'.format(fp[c]))\n",
"    print('FN: {}'.format(fn[c]))\n",
"print('###############')\n",
"print()\n",
"print('METRICS:')\n",
"print()\n",
"\n",
"# per class: precision, recall and accuracy in percent (printed rounded to 2 digits)\n",
"prec, rec, acc = {}, {}, {}\n",
"for c in classes:\n",
"    print('class {}:'.format(c))\n",
"    print()\n",
"    prec[c] = tp[c] / (tp[c] + fp[c]) * 100\n",
"    print('precision: {}'.format(round(prec[c], 2)))\n",
"    rec[c] = tp[c] / (tp[c] + fn[c]) * 100\n",
"    print('recall: {}'.format(round(rec[c], 2)))\n",
"    acc[c] = (tp[c] + tn[c]) / total * 100\n",
"    print('accuracy: {}'.format(round(acc[c], 2)))\n",
"    print()\n",
"\n",
"# unweighted mean over the three classes\n",
"print('Average Metrics:')\n",
"print()\n",
"print('precision: {}'.format((prec[1] + prec[2] + prec[0]) / 3))\n",
"print('recall: {}'.format((rec[1] + rec[2] + rec[0]) / 3))\n",
"print('accuracy: {}'.format((acc[1] + acc[2] + acc[0]) / 3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}