interactive labeling: documentation round 0-9
parent 213bb148de
commit a2c7a7279e
@@ -0,0 +1,689 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Notebook for calculation of the resubstitution error...\n",
    "\n",
    "Note:\n",
    "\n",
    "class 0: unrelated news\n",
    "\n",
    "class 1: mergers\n",
    "\n",
    "class 2: other deals, non-mergers, etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import operator\n",
    "import pickle\n",
    "import random\n",
    "\n",
    "from ipywidgets import interact, interactive, fixed, interact_manual\n",
    "import ipywidgets as widgets\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "from IPython.display import display\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_selection import SelectPercentile\n",
    "from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.semi_supervised import label_propagation\n",
    "\n",
    "from BagOfWords import BagOfWords\n",
    "from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
    "from MNBInteractive import MNBInteractive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# initialize random => reproducible sequence\n",
    "random_state = 5\n",
    "\n",
    "# set up wider display area\n",
    "pd.set_option('display.max_colwidth', -1)\n",
    "\n",
    "# show full text for print statement\n",
    "InteractiveShell.ast_node_interactivity = \"all\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Last iteration number: 11\n",
      "Number of manually labeled articles: 1082\n",
      "Number of manually unlabeled articles: 8918\n"
     ]
    }
   ],
   "source": [
    "# read current data set from csv\n",
    "df = pd.read_csv('../data/interactive_labeling_round_11.csv',\n",
    "                 sep='|',\n",
    "                 usecols=range(1,13), # drop first column 'unnamed'\n",
    "                 encoding='utf-8',\n",
    "                 quoting=csv.QUOTE_NONNUMERIC,\n",
    "                 quotechar='\\'')\n",
    "\n",
    "# find current iteration/round number\n",
    "m = int(df['Round'].max())\n",
    "print('Last iteration number: {}'.format(m))\n",
    "print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
    "print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate the resubstitution error for iteration 0-9 with stratified sampling.\n",
    "Start with iteration number 0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m = 3\n",
    "m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "# select all samples that were labeled with 0/1/2\n",
    "set_0 = df.loc[(df['Round'] == m) & (df['Label'] == 0)]\n",
    "set_1 = df.loc[(df['Round'] == m) & (df['Label'] == 1)]\n",
    "set_2 = df.loc[(df['Round'] == m) & (df['Label'] == 2)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of labeled samples by class (0/1/2): 82/4/14\n",
      "minimum of new labeled samples: 4\n",
      "length of current data set for resubstitution error: 12\n"
     ]
    }
   ],
   "source": [
    "# find minimum\n",
    "print('number of labeled samples by class (0/1/2): {}/{}/{}'.format(len(set_0), len(set_1), len(set_2)))\n",
    "strat_len = min(len(set_0), len(set_1), len(set_2))\n",
    "print('minimum of new labeled samples: {}'.format(strat_len))\n",
    "# length of current data set for resubstitution error\n",
    "print('length of current data set for resubstitution error: {}'.format(strat_len * 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "# random sampling for selection\n",
    "selec_0 = set_0.sample(n=strat_len, random_state=random_state)\n",
    "selec_1 = set_1.sample(n=strat_len, random_state=random_state)\n",
    "selec_2 = set_2.sample(n=strat_len, random_state=random_state)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "# newly added training data of the current round\n",
    "# training_data_0 = pd.concat([selec_0, selec_1, selec_2])\n",
    "# training_data_1 = pd.concat([selec_0, selec_1, selec_2])\n",
    "# training_data_2 = pd.concat([selec_0, selec_1, selec_2])\n",
    "# training_data_3 = pd.concat([selec_0, selec_1, selec_2])\n",
    "training_data_4 = pd.concat([selec_0, selec_1, selec_2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[5789.0,\n",
       " 4237.0,\n",
       " 2202.0,\n",
       " 4913.0,\n",
       " 821.0,\n",
       " 5973.0,\n",
       " 6198.0,\n",
       " 8490.0,\n",
       " 4815.0,\n",
       " 2386.0,\n",
       " 5177.0,\n",
       " 2482.0]"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# indices of training samples\n",
    "# idx_0 = training_data_0['Index'].tolist()\n",
    "# idx_1 = training_data_1['Index'].tolist()\n",
    "# idx_2 = training_data_2['Index'].tolist()\n",
    "# idx_3 = training_data_3['Index'].tolist()\n",
    "idx_4 = training_data_4['Index'].tolist()\n",
    "\n",
    "train_all = train_all.append(training_data_4)\n",
    "idx_all = train_all['Index'].tolist()\n",
    "idx_4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "36"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_0_2 = train_0_1.append(training_data_2)\n",
    "len(train_0_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "stratified number in round 3: 12\n",
      "stratified number in total: 48\n"
     ]
    }
   ],
   "source": [
    "print('stratified number in round {}: {}'.format(m, len(idx_3)))\n",
    "print('stratified number in total: {}'.format(len(idx_all)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "# STEP 1:\n",
    "# resubstitution error round\n",
    "training_data = training_data_3\n",
    "testing_data = training_data_3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "400"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# STEP 2: \n",
    "# resubstitution error all labeled articles in round\n",
    "training_data = training_data_3\n",
    "testing_data = df.loc[(df['Round'] <= m)]\n",
    "len(testing_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "# STEP 3:\n",
    "training_data = train_all\n",
    "testing_data = df.loc[(df['Round'] <= m)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "# STEP 4:\n",
    "training_data = train_0_2\n",
    "testing_data = training_data_3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# MNB: starting interactive multinomial naives bayes...\n",
      "\n",
      "# MNB: ending multinomial naive bayes\n"
     ]
    }
   ],
   "source": [
    "# call script\n",
    "classes, class_count, class_probs = MNBInteractive.estimate_mnb(training_data, testing_data, True)\n",
    "\n",
    "# series of indices of recently estimated articles \n",
    "indices_estimated = testing_data['Index'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "confusion matrix:\n",
      "###############\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "###############\n",
      "\n",
      "class 0:\n",
      "\n",
      "TP: 1\n",
      "TN: 7\n",
      "FP: 1\n",
      "FN: 3\n",
      "\n",
      "class 1:\n",
      "\n",
      "TP: 4\n",
      "TN: 3\n",
      "FP: 5\n",
      "FN: 0\n",
      "\n",
      "class 2:\n",
      "\n",
      "TP: 0\n",
      "TN: 7\n",
      "FP: 1\n",
      "FN: 4\n",
      "###############\n",
      "\n",
      "METRICS:\n",
      "\n",
      "class 0:\n",
      "\n",
      "precision: 50.0\n",
      "recall: 25.0\n",
      "accuracy: 66.667\n",
      "\n",
      "class 1:\n",
      "\n",
      "precision: 44.444\n",
      "recall: 100.0\n",
      "accuracy: 58.333\n",
      "\n",
      "class 2:\n",
      "\n",
      "precision: 0.0\n",
      "recall: 0.0\n",
      "accuracy: 58.333\n",
      "\n",
      "Average Metrics:\n",
      "\n",
      "precision: 31\n",
      "recall: 42\n",
      "accuracy: 61\n"
     ]
    }
   ],
   "source": [
    "n = 0 \n",
    "for row in class_probs:\n",
    "    for i in range(0, len(classes)):\n",
    "        index = indices_estimated[n]\n",
    "        # save estimated label\n",
    "        if np.amax(row) == row[i]:\n",
    "            testing_data.loc[index, 'Estimated'] = classes[i]\n",
    "            # annotate probability\n",
    "            testing_data.loc[index, 'Probability'] = row[i]\n",
    "    n += 1\n",
    "\n",
    "print('confusion matrix:')\n",
    "print('###############')\n",
    "zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n",
    "zero_0\n",
    "zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])\n",
    "zero_1\n",
    "zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])\n",
    "zero_2\n",
    "print('/')\n",
    "one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])\n",
    "one_0\n",
    "one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])\n",
    "one_1\n",
    "one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])\n",
    "one_2\n",
    "print('/')\n",
    "\n",
    "two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])\n",
    "two_0\n",
    "two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n",
    "two_1\n",
    "two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n",
    "two_2\n",
    "print('###############')\n",
    "print()\n",
    "total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n",
    "print('class 0:')\n",
    "print()\n",
    "tp_0 = zero_0\n",
    "print('TP: {}'.format(tp_0))\n",
    "tn_0 = one_1 + one_2 + two_1 + two_2\n",
    "print('TN: {}'.format(tn_0))\n",
    "fp_0 = zero_1 + zero_2\n",
    "print('FP: {}'.format(fp_0))\n",
    "fn_0 = one_0 + two_0\n",
    "print('FN: {}'.format(fn_0))\n",
    "print()\n",
    "print('class 1:')\n",
    "print()\n",
    "tp_1 = one_1\n",
    "print('TP: {}'.format(tp_1))\n",
    "tn_1 = zero_0 + zero_2 + two_0 + two_2\n",
    "print('TN: {}'.format(tn_1))\n",
    "fp_1 = one_0 + one_2\n",
    "print('FP: {}'.format(fp_1))\n",
    "fn_1 = zero_1 + two_1\n",
    "print('FN: {}'.format(fn_1))\n",
    "print()\n",
    "print('class 2:')\n",
    "print()\n",
    "tp_2 = two_2\n",
    "print('TP: {}'.format(tp_2))\n",
    "tn_2 = zero_0 + zero_1 + one_0 + one_1\n",
    "print('TN: {}'.format(tn_2))\n",
    "fp_2 = two_0 + two_1\n",
    "print('FP: {}'.format(fp_2))\n",
    "fn_2 = zero_2 + one_2\n",
    "print('FN: {}'.format(fn_2))\n",
    "print('###############')\n",
    "print()\n",
    "print('METRICS:')\n",
    "print()\n",
    "print('class 0:')\n",
    "print()\n",
    "prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
    "print('precision: {}'.format(round(prec_0, 3)))\n",
    "rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
    "print('recall: {}'.format(round(rec_0, 3)))\n",
    "acc_0 = (tp_0 + tn_0) / total * 100\n",
    "print('accuracy: {}'.format(round(acc_0, 3)))\n",
    "print()\n",
    "print('class 1:')\n",
    "print()\n",
    "prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
    "print('precision: {}'.format(round(prec_1, 3)))\n",
    "rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
    "print('recall: {}'.format(round(rec_1, 3)))\n",
    "acc_1 = (tp_1 + tn_1) / total * 100\n",
    "print('accuracy: {}'.format(round(acc_1, 3)))\n",
    "print()\n",
    "print('class 2:')\n",
    "print()\n",
    "prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
    "print('precision: {}'.format(round(prec_2, 3)))\n",
    "rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
    "print('recall: {}'.format(round(rec_2, 3)))\n",
    "acc_2 = (tp_2 + tn_2) / total * 100\n",
    "print('accuracy: {}'.format(round(acc_2, 3)))\n",
    "print()\n",
    "print('Average Metrics:')\n",
    "print()\n",
    "print('precision: {}'.format(round((prec_1 + prec_2 + prec_0) / 3), 3))\n",
    "print('recall: {}'.format(round((rec_1 + rec_2 + rec_0) / 3), 3))\n",
    "print('accuracy: {}'.format(round((acc_1 + acc_2 + acc_0) / 3), 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
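The last code cell of the notebook builds the 3x3 confusion matrix and the per-class metrics by hand. As a cross-check only, the same numbers could be obtained from sklearn.metrics, which the notebook already imports; the following sketch is not part of the committed notebook and assumes a frame like testing_data above with integer 'Label' and 'Estimated' columns.

# Sketch only: cross-check of the hand-built confusion matrix and metrics,
# assuming a DataFrame with integer 'Label' and 'Estimated' columns as
# produced by the cell above.
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

def metrics_report(frame: pd.DataFrame) -> None:
    y_true = frame['Label'].astype(int)
    y_pred = frame['Estimated'].astype(int)
    # rows are true classes 0/1/2, columns are estimated classes 0/1/2
    print(confusion_matrix(y_true, y_pred, labels=[0, 1, 2]))
    # per-class precision/recall plus macro averages (fractions, not percent)
    print(classification_report(y_true, y_pred, labels=[0, 1, 2], digits=3))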
@@ -1,6 +1,8 @@
+import csv
 import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
 import pickle
 
 class LabelingPlotter():
@@ -54,7 +56,7 @@ class LabelingPlotter():
 
     def plot_cumulative():
         # load pickle object
-        with open('../obj/array_class_probs_round_9.pkl', 'rb') as input:
+        with open('../obj/array_class_probs_round_11.pkl', 'rb') as input:
             list = pickle.load(input)
 
         # sort list in descending order
@@ -79,14 +81,43 @@ class LabelingPlotter():
 
 
         ax.grid(True)
-        ax.legend(loc='right')
+        #ax.legend(loc='right')
         #ax.set_title('Cumulative distribution of highest estimated probability')
         ax.set_xlabel('Highest estimated probability')
         ax.set_ylabel('Fraction of articles with this highest estimated probability')
-        #plt.axis([0.5, 0.99, 0, 0.006])
+        #plt.axis([0.5, 0.99, 0, 0.006]) #round 9
+        #plt.axis([0.65, 1, 0, 0.003]) # round 10
+        plt.axis([0.7, 1, 0, 0.002]) # round 11
         #ax.set_xbound(lower=0.5, upper=0.99)
+        plt.savefig('..\\visualization\\proba_round_11.png')
+        plt.savefig('..\\visualization\\proba_round_11.eps')
 
         plt.show()
 
+    def plot_correlation():
+        m = 10
+        df = pd.read_csv('../data/interactive_labeling_round_{}_temp.csv'.format(m),
+                         sep='|',
+                         usecols=range(1,13), # drop first column 'unnamed'
+                         encoding='utf-8',
+                         quoting=csv.QUOTE_NONNUMERIC,
+                         quotechar='\'')
+
+        # add boolean, if estimation was true
+        df['EstCorrect'] = np.nan
+        df.loc[(df['Label'] != -1) & (df['Label'] == df['Estimated']), 'EstCorrect'] = 1
+        df.loc[(df['Label'] != -1) & (df['Label'] != df['Estimated']), 'EstCorrect'] = 0
+
+        print('estimation was correct: {}'.format(len(df.loc[df['EstCorrect'] == 1])))
+        print('estimation was wrong: {}'.format(len(df.loc[df['EstCorrect'] == 0])))
+
+        x = df.loc[df['Label'] != -1, 'Probability'].tolist()
+        y = df.loc[df['Label'] != -1, 'EstCorrect'].tolist()
+
+        plt.plot(x, y, 'bo')
+        plt.axis([0.4, 1, -0.1, 1.1])
+        plt.show()
+
 if __name__ == '__main__':
-    LabelingPlotter.plot_cumulative()
+    #LabelingPlotter.plot_correlation()
+    LabelingPlotter.plot_cumulative()
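Most of the body of plot_cumulative() lies outside the hunks shown above. Purely as an illustration of the kind of cumulative view it produces, the sketch below draws an empirical distribution of the pickled probabilities; the pickle path and axis labels are taken from the diff, while the plotting code itself is an assumption, not the committed implementation.

# Sketch only: an empirical cumulative view of the pickled probabilities.
# File name and axis labels mirror the diff above; the rest is assumed.
import pickle
import numpy as np
import matplotlib.pyplot as plt

with open('../obj/array_class_probs_round_11.pkl', 'rb') as infile:
    probs = np.sort(np.asarray(pickle.load(infile)).ravel())

# fraction of articles whose highest estimated probability is at or below each value
fraction = np.arange(1, len(probs) + 1) / len(probs)
fig, ax = plt.subplots()
ax.step(probs, fraction, where='post')
ax.set_xlabel('Highest estimated probability')
ax.set_ylabel('Fraction of articles with this highest estimated probability')
ax.grid(True)
plt.show()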
@@ -24,7 +24,7 @@ class MNBInteractive:
         '''fits naive bayes model
         '''
 
-        print('# MNB: starting multinomial naives bayes...')
+        print('# MNB: starting interactive multinomial naives bayes...')
        print()
 
         # split labeled data into text and label set
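The notebook above consumes the triple returned by MNBInteractive.estimate_mnb with a nested loop over the probability rows. A compact equivalent is sketched below; it assumes that class_probs rows line up with indices_estimated exactly as in that notebook cell and is an illustration, not part of the commit.

# Sketch only: assign the most probable class per row with numpy instead of
# the nested loop used in the notebook. Assumes classes, class_probs and
# indices_estimated as derived in the notebook cell above.
import numpy as np

def assign_estimates(testing_data, classes, class_probs, indices_estimated):
    probs = np.asarray(class_probs)
    best = probs.argmax(axis=1)  # column of the highest probability per row
    for row, index in enumerate(indices_estimated):
        testing_data.loc[index, 'Estimated'] = classes[best[row]]
        testing_data.loc[index, 'Probability'] = probs[row, best[row]]
    return testing_data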
@@ -0,0 +1,175 @@
'''
Multinomial Naive Bayes Classifier
======================
'''

from BagOfWords import BagOfWords

import csv

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB

class MultinomialNaiveBayes:

    def make_mnb(dataset, sklearn_cv=True, percentile=100):
        '''fits naive bayes model with StratifiedKFold
        '''
        print('# starting classical multinomial naive bayes')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            cv = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)

        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        #class_prob = []
        # counts number of training samples observed in each class
        #class_counts = []

        # for each fold
        n = 0
        for train, test in skf.split(X,y):

            n += 1
            print('# split no. ' + str(n))

            if sklearn_cv:
                # use sklearn CountVectorizer
                # fit the training data and then return the matrix
                training_data = cv.fit_transform(X[train], y[train]).toarray()
                # transform testing data and return the matrix
                testing_data = cv.transform(X[test]).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X[train])
                vocab = BagOfWords.make_vocab(extracted_words)

                # fit the training data and then return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq, stemming)
                # transform testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq, stemming)

            # apply select percentile
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            #fit classifier
            classifier.fit(training_data_r, y[train])
            #predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)
            # print('train:')
            # print(y[train])
            # print('test:')
            # print(y[test])
            # print()
            # print('pred')
            # print(predictions_test)

            #print and store metrics
            rec = recall_score(y[test], predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test, average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))

            #class_prob.append(classifier.class_prior_)
            #class_counts.append(classifier.class_count_)

        ##########################
        # probability estimates for the test vector (testing_data)
        class_probs = classifier.predict_proba(testing_data)

        # number of samples encountered for each class during fitting
        # this value is weighted by the sample weight when provided
        class_count = classifier.class_count_

        # classes in order used
        classes = classifier.classes_

        # return classes and vector of class estimates
        return recall_scores, precision_scores, f1_scores

    ######## only needed for the resubstitution error ########
    def analyze_errors(training, testing):
        '''calculates resubstitution error
        shows indices of false classified articles
        uses Gaussian Bayes with train test split
        '''
        X_train = training['Title'] + ' ' + training['Text']
        y_train = training['Label']

        X_test = testing['Title'] + ' ' + testing['Text']
        y_test = testing['Label']

        count_vector = CountVectorizer()

        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train).toarray()

        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_test).toarray()

        # Naive Bayes
        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)
        # fit classifier
        classifier.fit(training_data, y_train)

        # Predict class
        predictions = classifier.predict(testing_data)

        print(type(y_test))
        print(len(y_test))
        print(type(predictions))
        print(len(predictions))

        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_test)):
            if y_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_test[i]))
                print(X_test[i])
                print(y_test[i])
                print()
        #print metrics
        print('F1 score: ', format(f1_score(y_test, predictions)))
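The new file above ships no usage example. A possible way to call it on the manually labeled rows, assuming the same DataFrame layout ('Title', 'Text', 'Label') the methods index into and the CSV path used in the notebook:

# Sketch only: run the 10-fold cross-validation on the manually labeled rows.
# Column names follow the code above; the CSV path matches the notebook.
import csv
import pandas as pd
from MultinomialNaiveBayes import MultinomialNaiveBayes

df = pd.read_csv('../data/interactive_labeling_round_11.csv', sep='|',
                 usecols=range(1, 13), encoding='utf-8',
                 quoting=csv.QUOTE_NONNUMERIC, quotechar='\'')
# keep only labeled articles and reset to a positional index for the fold splits
labeled = df.loc[df['Label'] != -1].reset_index(drop=True)

recall_scores, precision_scores, f1_scores = MultinomialNaiveBayes.make_mnb(labeled)
print('mean recall:    {:.3f}'.format(sum(recall_scores) / len(recall_scores)))
print('mean precision: {:.3f}'.format(sum(precision_scores) / len(precision_scores)))
print('mean f1:        {:.3f}'.format(sum(f1_scores) / len(f1_scores)))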
@@ -0,0 +1,6 @@
{
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 2
}