diff --git a/obj/array_class_probs_stratified_round_9.pkl b/obj/array_class_probs_stratified_round_9.pkl
new file mode 100644
index 0000000..08ca500
Binary files /dev/null and b/obj/array_class_probs_stratified_round_9.pkl differ
diff --git a/src/2019-02-24-al-resubstitution-error.ipynb b/src/2019-02-24-al-resubstitution-error.ipynb
index cc32a5a..5036fe2 100644
--- a/src/2019-02-24-al-resubstitution-error.ipynb
+++ b/src/2019-02-24-al-resubstitution-error.ipynb
@@ -105,37 +105,28 @@
},
{
"cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "m = 0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 131,
+ "execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "3"
+ "9"
]
},
- "execution_count": 131,
+ "execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "m = 3\n",
+ "m += 1\n",
"m"
]
},
{
"cell_type": "code",
- "execution_count": 132,
+ "execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
@@ -147,16 +138,16 @@
},
{
"cell_type": "code",
- "execution_count": 133,
+ "execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "number of labeled samples by class (0/1/2): 82/4/14\n",
- "minimum of new labeled samples: 4\n",
- "length of current data set for resubstitution error: 12\n"
+ "number of labeled samples by class (0/1/2): 80/2/18\n",
+ "minimum of new labeled samples: 2\n",
+ "length of current data set for resubstitution error: 6\n"
]
}
],
@@ -171,7 +162,7 @@
},
{
"cell_type": "code",
- "execution_count": 111,
+ "execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
@@ -183,122 +174,136 @@
},
{
"cell_type": "code",
- "execution_count": 112,
+ "execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"# newly added training data of the current round\n",
- "# training_data_0 = pd.concat([selec_0, selec_1, selec_2])\n",
- "# training_data_1 = pd.concat([selec_0, selec_1, selec_2])\n",
- "# training_data_2 = pd.concat([selec_0, selec_1, selec_2])\n",
- "# training_data_3 = pd.concat([selec_0, selec_1, selec_2])\n",
- "training_data_4 = pd.concat([selec_0, selec_1, selec_2])"
+ "#training_data_0 = pd.concat([selec_0, selec_1, selec_2])\n",
+ "#training_data_1 = pd.concat([selec_0, selec_1, selec_2])\n",
+ "#training_data_2 = pd.concat([selec_0, selec_1, selec_2])\n",
+ "#training_data_3 = pd.concat([selec_0, selec_1, selec_2])\n",
+ "#training_data_4 = pd.concat([selec_0, selec_1, selec_2])\n",
+ "#training_data_5 = pd.concat([selec_0, selec_1, selec_2])\n",
+ "#training_data_6 = pd.concat([selec_0, selec_1, selec_2])\n",
+ "#training_data_7 = pd.concat([selec_0, selec_1, selec_2])\n",
+ "#training_data_8 = pd.concat([selec_0, selec_1, selec_2])\n",
+ "training_data_9 = pd.concat([selec_0, selec_1, selec_2])"
]
},
{
"cell_type": "code",
- "execution_count": 113,
+ "execution_count": 75,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[5789.0,\n",
- " 4237.0,\n",
- " 2202.0,\n",
- " 4913.0,\n",
- " 821.0,\n",
- " 5973.0,\n",
- " 6198.0,\n",
- " 8490.0,\n",
- " 4815.0,\n",
- " 2386.0,\n",
- " 5177.0,\n",
- " 2482.0]"
- ]
- },
- "execution_count": 113,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"# indices of training samples\n",
"# idx_0 = training_data_0['Index'].tolist()\n",
"# idx_1 = training_data_1['Index'].tolist()\n",
"# idx_2 = training_data_2['Index'].tolist()\n",
"# idx_3 = training_data_3['Index'].tolist()\n",
- "idx_4 = training_data_4['Index'].tolist()\n",
- "\n",
- "train_all = train_all.append(training_data_4)\n",
- "idx_all = train_all['Index'].tolist()\n",
- "idx_4"
+ "# idx_4 = training_data_4['Index'].tolist()\n",
+ "# idx_5 = training_data_5['Index'].tolist()\n",
+ "# idx_6 = training_data_6['Index'].tolist()\n",
+ "# idx_7 = training_data_7['Index'].tolist()\n",
+ "# idx_8 = training_data_8['Index'].tolist()\n",
+ "idx_9 = training_data_9['Index'].tolist()"
]
},
{
"cell_type": "code",
- "execution_count": 140,
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#train_all = training_data_0\n",
+ "train_0_8 = training_data_0.append([training_data_1, training_data_2, training_data_3, training_data_4, training_data_5, training_data_6, training_data_7, training_data_8])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#idx_all = idx_0\n",
+ "idx_all = train_all['Index'].tolist()\n",
+ "#idx_9"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "36"
+ "117"
]
},
- "execution_count": 140,
+ "execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "train_0_2 = train_0_1.append(training_data_2)\n",
- "len(train_0_2)"
+ "len(train_all)"
]
},
{
"cell_type": "code",
- "execution_count": 114,
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_0_9 = train_0_2.append(training_data_3)\n",
+ "len(train_0_3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "stratified number in round 3: 12\n",
- "stratified number in total: 48\n"
+ "stratified number in round 9: 6\n",
+ "stratified number in total: 138\n"
]
}
],
"source": [
- "print('stratified number in round {}: {}'.format(m, len(idx_3)))\n",
+ "print('stratified number in round {}: {}'.format(m, len(idx_9)))\n",
"print('stratified number in total: {}'.format(len(idx_all)))"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 114,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# STEP 1:\n",
+ "# resubstitution error round\n",
+ "training_data = train_0_8\n",
+ "testing_data = training_data_9"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
- "outputs": [],
- "source": [
- "# STEP 1:\n",
- "# resubstitution error round\n",
- "training_data = training_data_3\n",
- "testing_data = training_data_3"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 116,
- "metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "3"
+ "9"
]
},
- "execution_count": 116,
+ "execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
@@ -309,16 +314,16 @@
},
{
"cell_type": "code",
- "execution_count": 119,
+ "execution_count": 160,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "400"
+ "1082"
]
},
- "execution_count": 119,
+ "execution_count": 160,
"metadata": {},
"output_type": "execute_result"
}
@@ -326,36 +331,38 @@
"source": [
"# STEP 2: \n",
"# resubstitution error all labeled articles in round\n",
- "training_data = training_data_3\n",
- "testing_data = df.loc[(df['Round'] <= m)]\n",
+ "training_data = train_all\n",
+ "testing_data = df.loc[(df['Round'] <= 11)]# & (~df['Index'].isin(idx_all))]\n",
+ "#df[~df['Index'].isin(idx_all)]\n",
+ "#df.loc[(df['Label'] == -1) | (df['Round'] >= 10)]\n",
"len(testing_data)"
]
},
{
"cell_type": "code",
- "execution_count": 137,
+ "execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"# STEP 3:\n",
"training_data = train_all\n",
- "testing_data = df.loc[(df['Round'] <= m)]"
+ "testing_data = train_all"
]
},
{
"cell_type": "code",
- "execution_count": 147,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# STEP 4:\n",
- "training_data = train_0_2\n",
- "testing_data = training_data_3"
+ "training_data = train_all\n",
+ "testing_data = train_all"
]
},
{
"cell_type": "code",
- "execution_count": 148,
+ "execution_count": 161,
"metadata": {},
"outputs": [
{
@@ -378,174 +385,19 @@
},
{
"cell_type": "code",
- "execution_count": 149,
+ "execution_count": 162,
"metadata": {},
"outputs": [
{
- "name": "stdout",
+ "name": "stderr",
"output_type": "stream",
"text": [
- "confusion matrix:\n",
- "###############\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "1"
- ]
- },
- "execution_count": 149,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "text/plain": [
- "0"
- ]
- },
- "execution_count": 149,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "text/plain": [
- "1"
- ]
- },
- "execution_count": 149,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "/\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "2"
- ]
- },
- "execution_count": 149,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "text/plain": [
- "4"
- ]
- },
- "execution_count": 149,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "text/plain": [
- "3"
- ]
- },
- "execution_count": 149,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "/\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "1"
- ]
- },
- "execution_count": 149,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "text/plain": [
- "0"
- ]
- },
- "execution_count": 149,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "text/plain": [
- "0"
- ]
- },
- "execution_count": 149,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "###############\n",
+ "C:\\Users\\Anne\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:543: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
- "class 0:\n",
- "\n",
- "TP: 1\n",
- "TN: 7\n",
- "FP: 1\n",
- "FN: 3\n",
- "\n",
- "class 1:\n",
- "\n",
- "TP: 4\n",
- "TN: 3\n",
- "FP: 5\n",
- "FN: 0\n",
- "\n",
- "class 2:\n",
- "\n",
- "TP: 0\n",
- "TN: 7\n",
- "FP: 1\n",
- "FN: 4\n",
- "###############\n",
- "\n",
- "METRICS:\n",
- "\n",
- "class 0:\n",
- "\n",
- "precision: 50.0\n",
- "recall: 25.0\n",
- "accuracy: 66.667\n",
- "\n",
- "class 1:\n",
- "\n",
- "precision: 44.444\n",
- "recall: 100.0\n",
- "accuracy: 58.333\n",
- "\n",
- "class 2:\n",
- "\n",
- "precision: 0.0\n",
- "recall: 0.0\n",
- "accuracy: 58.333\n",
- "\n",
- "Average Metrics:\n",
- "\n",
- "precision: 31\n",
- "recall: 42\n",
- "accuracy: 61\n"
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ " self.obj[item] = s\n"
]
}
],
@@ -559,8 +411,302 @@
" testing_data.loc[index, 'Estimated'] = classes[i]\n",
" # annotate probability\n",
" testing_data.loc[index, 'Probability'] = row[i]\n",
- " n += 1\n",
+ " n += 1"
+ ]
+ },
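+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The SettingWithCopyWarning above appears because testing_data is a slice of df, so the .loc assignments write into a copy. A minimal sketch of one way to avoid it (copying the slice before annotating; an assumption for illustration, not what the cell above currently does):\n",
+    "\n",
+    "```python\n",
+    "# detach the slice from df before writing 'Estimated' and 'Probability'\n",
+    "testing_data = testing_data.copy()\n",
+    "```"
+   ]
+  },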
+ {
+ "cell_type": "code",
+ "execution_count": 130,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#testing_data[:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 131,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of articles that were estimated as class 0:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "7140"
+ ]
+ },
+ "execution_count": 131,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of articles that were estimated as class 1 (merger):\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "2007"
+ ]
+ },
+ "execution_count": 131,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of articles that were estimated as class 2:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "736"
+ ]
+ },
+ "execution_count": 131,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print('Number of articles that were estimated as class 0:')\n",
+ "len(testing_data.loc[(testing_data['Estimated'] == 0)])\n",
+ "print('Number of articles that were estimated as class 1 (merger):')\n",
+ "len(testing_data.loc[(testing_data['Estimated'] == 1)])\n",
+ "print('Number of articles that were estimated as class 2:')\n",
+ "len(testing_data.loc[(testing_data['Estimated'] == 2)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 133,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of articles that actually are class 0:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "847"
+ ]
+ },
+ "execution_count": 133,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of articles that actually are class 1 (merger):\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "50"
+ ]
+ },
+ "execution_count": 133,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of articles that actually are class 2:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "185"
+ ]
+ },
+ "execution_count": 133,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print('Number of articles that actually are class 0:')\n",
+ "len(df.loc[(df['Label'] == 0)])\n",
+ "print('Number of articles that actually are class 1 (merger):')\n",
+ "len(df.loc[(df['Label'] == 1)])\n",
+ "print('Number of articles that actually are class 2:')\n",
+ "len(df.loc[(df['Label'] == 2)])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Confusion matrix:\n",
"\n",
+ "| Predicted \\ Actual | 0 | 1 | 2 | \n",
+ "|--------------------|--------|--------|--------|\n",
+ "| 0 | zero_0 | zero_1 | zero_2 | \n",
+ "| 1 | one_0 | one_1 | one_2 | \n",
+ "| 2 | two_0 | two_1 | two_2 | \n"
+ ]
+ },
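+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The cells below fill this table by counting rows with len(...). A minimal sketch of how the same counts could be cross-checked with scikit-learn (an assumption for illustration; the notebook itself counts manually):\n",
+    "\n",
+    "```python\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "\n",
+    "# rows of the result are actual labels, columns are estimated classes;\n",
+    "# transpose so that rows = predicted and columns = actual, matching the table above\n",
+    "cm = confusion_matrix(testing_data['Label'], testing_data['Estimated'], labels=[0, 1, 2]).T\n",
+    "```"
+   ]
+  },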
+ {
+ "cell_type": "code",
+ "execution_count": 181,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Nachberechnung fürs Latex:\n",
+ "zero_0 = 1\n",
+ "zero_1 = 1\n",
+ "zero_2 = 0\n",
+ "\n",
+ "one_0 = 4\n",
+ "one_1 = 3\n",
+ "one_2 = 4\n",
+ "\n",
+ "two_0 = 0\n",
+ "two_1 = 1\n",
+ "two_2 = 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 163,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "confusion matrix:\n",
+ "###############\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "701"
+ ]
+ },
+ "execution_count": 163,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 163,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "41"
+ ]
+ },
+ "execution_count": 163,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "99"
+ ]
+ },
+ "execution_count": 163,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "49"
+ ]
+ },
+ "execution_count": 163,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "74"
+ ]
+ },
+ "execution_count": 163,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "47"
+ ]
+ },
+ "execution_count": 163,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "1"
+ ]
+ },
+ "execution_count": 163,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "70"
+ ]
+ },
+ "execution_count": 163,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
"print('confusion matrix:')\n",
"print('###############')\n",
"zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n",
@@ -583,7 +729,73 @@
"two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n",
"two_1\n",
"two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n",
- "two_2\n",
+ "two_2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 182,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "###############\n",
+ "\n",
+ "class 0:\n",
+ "\n",
+ "TP: 1\n",
+ "TN: 9\n",
+ "FP: 1\n",
+ "FN: 4\n",
+ "\n",
+ "class 1:\n",
+ "\n",
+ "TP: 3\n",
+ "TN: 2\n",
+ "FP: 8\n",
+ "FN: 2\n",
+ "\n",
+ "class 2:\n",
+ "\n",
+ "TP: 1\n",
+ "TN: 9\n",
+ "FP: 1\n",
+ "FN: 4\n",
+ "###############\n",
+ "\n",
+ "METRICS:\n",
+ "\n",
+ "class 0:\n",
+ "\n",
+ "precision: 50.0\n",
+ "recall: 20.0\n",
+ "accuracy: 66.67\n",
+ "\n",
+ "class 1:\n",
+ "\n",
+ "precision: 27.27\n",
+ "recall: 60.0\n",
+ "accuracy: 33.33\n",
+ "\n",
+ "class 2:\n",
+ "\n",
+ "precision: 50.0\n",
+ "recall: 20.0\n",
+ "accuracy: 66.67\n",
+ "\n",
+ "Average Metrics:\n",
+ "\n",
+ "precision: 42.42424242424242\n",
+ "recall: 33.333333333333336\n",
+ "accuracy: 55.55555555555554\n"
+ ]
+ }
+ ],
+ "source": [
"print('###############')\n",
"print()\n",
"total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n",
@@ -626,35 +838,180 @@
"print('class 0:')\n",
"print()\n",
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
- "print('precision: {}'.format(round(prec_0, 3)))\n",
+ "print('precision: {}'.format(round(prec_0, 2)))\n",
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
- "print('recall: {}'.format(round(rec_0, 3)))\n",
+ "print('recall: {}'.format(round(rec_0, 2)))\n",
"acc_0 = (tp_0 + tn_0) / total * 100\n",
- "print('accuracy: {}'.format(round(acc_0, 3)))\n",
+ "print('accuracy: {}'.format(round(acc_0, 2)))\n",
"print()\n",
"print('class 1:')\n",
"print()\n",
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
- "print('precision: {}'.format(round(prec_1, 3)))\n",
+ "print('precision: {}'.format(round(prec_1, 2)))\n",
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
- "print('recall: {}'.format(round(rec_1, 3)))\n",
+ "print('recall: {}'.format(round(rec_1, 2)))\n",
"acc_1 = (tp_1 + tn_1) / total * 100\n",
- "print('accuracy: {}'.format(round(acc_1, 3)))\n",
+ "print('accuracy: {}'.format(round(acc_1, 2)))\n",
"print()\n",
"print('class 2:')\n",
"print()\n",
"prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
- "print('precision: {}'.format(round(prec_2, 3)))\n",
+ "print('precision: {}'.format(round(prec_2, 2)))\n",
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
- "print('recall: {}'.format(round(rec_2, 3)))\n",
+ "print('recall: {}'.format(round(rec_2, 2)))\n",
"acc_2 = (tp_2 + tn_2) / total * 100\n",
- "print('accuracy: {}'.format(round(acc_2, 3)))\n",
+ "print('accuracy: {}'.format(round(acc_2, 2)))\n",
"print()\n",
"print('Average Metrics:')\n",
"print()\n",
- "print('precision: {}'.format(round((prec_1 + prec_2 + prec_0) / 3), 3))\n",
- "print('recall: {}'.format(round((rec_1 + rec_2 + rec_0) / 3), 3))\n",
- "print('accuracy: {}'.format(round((acc_1 + acc_2 + acc_0) / 3), 3))"
+ "print('precision: {}'.format((prec_1 + prec_2 + prec_0) / 3))\n",
+ "print('recall: {}'.format((rec_1 + rec_2 + rec_0) / 3))\n",
+ "print('accuracy: {}'.format((acc_1 + acc_2 + acc_0) / 3))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 126,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# annotate highest estimated probability for every instance\n",
+ "maxima = []\n",
+ "\n",
+ "for row in class_probs:\n",
+ " maxima.append(np.amax(row))\n",
+ " \n",
+ "# save class_probs array\n",
+ "with open('../obj/'+ 'array_class_probs_stratified_round_9' + '.pkl', 'wb') as f:\n",
+ " pickle.dump(maxima, f, pickle.HIGHEST_PROTOCOL)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Vorgehensweise: Wie finden wir ein besseres Model?\n",
+ "\n",
+ "1) Vorschlag:\n",
+ "\n",
+ "K-Cross-Validation auf 1000 Daten, 10% holdout\n",
+ "1000 Daten => stratified sample gebildet\n",
+ "Mittelwert, Min/Max berechnet"
+ ]
+ },
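+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of the proposed procedure, assuming scikit-learn's StratifiedKFold plus the CountVectorizer/MultinomialNB combination used in MultinomialNaiveBayes.py (the estimator and metric here are assumptions for illustration):\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.metrics import recall_score\n",
+    "from sklearn.model_selection import StratifiedKFold\n",
+    "from sklearn.naive_bayes import MultinomialNB\n",
+    "\n",
+    "# ~1000 labeled articles, stratified 10-fold => 10% holdout per split\n",
+    "labeled = df.loc[df['Label'] != -1].reset_index(drop=True)\n",
+    "X_text = labeled['Title'] + '. ' + labeled['Text']\n",
+    "y = labeled['Label']\n",
+    "\n",
+    "scores = []\n",
+    "for train, test in StratifiedKFold(n_splits=10).split(X_text, y):\n",
+    "    cv = CountVectorizer()\n",
+    "    clf = MultinomialNB().fit(cv.fit_transform(X_text[train]), y[train])\n",
+    "    scores.append(recall_score(y[test], clf.predict(cv.transform(X_text[test])), average='macro'))\n",
+    "\n",
+    "print(np.mean(scores), np.min(scores), np.max(scores))\n",
+    "```"
+   ]
+  },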
+ {
+ "cell_type": "code",
+ "execution_count": 139,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+       "[HTML table rendering omitted; the same two rows are shown in text/plain below]"
+ ],
+ "text/plain": [
+ " Text \\\n",
+ "10 Industrials - Wed Jan 25, 2017 - 11:51pm EST China stocks climb to new 6-week highs; Hong Kong firmer * SSEC +0.1 pct, CSI300 +0.3 pct, HSI +1.4 pct * China's Dec industrial profits grow at sharply slower pace SHANGHAI Jan 26 China stocks are set for a five-day winning streak, hitting a fresh six-week high on Thursday morning, but gains were curbed after profits earned by industrial firms grew at a sharply slower pace last month. Market turnover stayed thin on the last trading day before the Lunar New Year, China's biggest holiday, starting on Friday. Markets will be closed for a week and will reopen on Feb. 3. Hong Kong stocks rallied and were poised for four days of gains, drawing inspiration from the Dow Jones Industrial Average breaching the 20,000-point level for the first time on Wednesday. Sentiment was also helped by a weaker U.S. dollar, easing fears of capital outflows from the city. In China, the blue-chip CSI300 index rose 0.3 percent, to 3,387.16 points at the end of the morning session, while the Shanghai Composite Index gained 0.1 percent, to 3,153.77 points. Blue chip shares have gained almost 1 percent so far this week. \"Investors are in a holiday mood now,\" said Cao Xuefeng, head of research at Huaxi Securities in Chengdu, noting the market is traditionally firm ahead of the Lunar New year. But bullish sentiment was partly offset by China's profit growth earned by industrial firms in December, which eased sharply to 2.3 percent compared with November's 14.5 percent. Cao said the slower pace was due to a cooling property market and seasonal factors as many workers had already left the factories for their home towns ahead of the new year. \"The path of U.S. interest rate rises, Trump's policies to China, whether he will brand China a currency manipulator, is there going to be a trade war - all these will affect the economy in China this year,\" Cao said, adding that it was hard to predict Trump's next move. \"He plays against the rules. He isn't like former U.S. presidents.\" Sector performance was mixed in China. An index tracking the industrial sector lost 0.1 percent at midday after briefly hitting a two-week high in early trade. Banks were among best gainers on the mainland. An index tracking the sector was up nearly 0.8 percent, after China's banking regulator reported that commercial banks' non-performing loan (NPL) ratio stood at 1.74 percent at the end of 2016, basically flat from end of the third quarter. In Hong Kong, the Hang Seng index added 1.4 percent, to 23,365.01 points, while the Hong Kong China Enterprises Index gained 1.4 percent, to 9,875.77 points. The Dow surged on Wednesday as solid earnings and optimism over President Donald Trump's pro-growth initiatives revitalised a post-election rally. Sectors gained across the board at midday, with tech stocks and real estate developers among the best performers. Hong Kong exchanges will be closed on Jan. 30 and 31 for the Lunar New Year. (Reporting by Jackie Cai and John Ruwitch; Editing by Jacqueline Wong) Next In Industrials \n",
+ "16 Financials 1:09am EST Australia shares end higher on materials and financials; NZ up (Updates to close) Jan 25 Australian shares closed modestly higher on Wednesday, supported by financial stocks that rose on positive leads from U.S. counterparts, and by materials that were underpinned by higher commodity prices. The S&P/ASX 200 index rose 0.4 percent, or 21.40 points, to end at 5,671.50. The S&P 500 and Nasdaq set record highs on Tuesday in a broad rally led by financial and technology stocks. Australia's local financial index snapped six sessions of losses, with the 'Big four' up between 0.4 percent and 1.1 percent. The metals and mining index rose as much as 2.4 percent to its highest in over two years. Steel and iron ore futures in China rose for a second day on Wednesday, supported by hopes that demand for both commodities will strengthen after the Lunar New Year holiday. Mining giant BHP Billiton Ltd rose as much as 3.5 percent to its highest since June 2015. The world's biggest miner reported a 9 percent rise in iron ore output in its fiscal second quarter. Rival Rio Tinto Ltd gained as much as 3.8 percent, its highest in over two and a half years, after it agreed to sell its unit Coal & Allied Industries Ltd to Yancoal Australia Ltd for up to $2.45 billion in cash. New Zealand's benchmark S&P/NZX 50 index closed 0.4 percent, or 26.75 points higher, at 7,090.91, its highest close in over three months. Industrials led gains, with Port Of Tauranga ending 2.3 percent higher. (Reporting by Sindhu Chandrasekaran in Bengaluru; Editing by Kim Coghill) Next In Financials \n",
+ "\n",
+ " Title Label \n",
+ "10 China stocks climb to new 6-week highs; Hong Kong firmer 0.0 \n",
+ "16 Australia shares end higher on materials and financials; NZ up 0.0 "
+ ]
+ },
+ "execution_count": 139,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.loc[(df['Label'] != -1), ['Text', 'Title', 'Label']][:2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 138,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "# starting classical multinomial naive bayes\n",
+ "# ...\n",
+ "# split no. 1\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\Anne\\Anaconda3\\lib\\site-packages\\pandas\\core\\series.py:842: FutureWarning: \n",
+ "Passing list-likes to .loc or [] with any missing label will raise\n",
+ "KeyError in the future, you can use .reindex() as an alternative.\n",
+ "\n",
+ "See the documentation here:\n",
+ "https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike\n",
+ " return self.loc[key]\n"
+ ]
+ },
+ {
+ "ename": "ValueError",
+ "evalue": "np.nan is an invalid document, expected byte or unicode string.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mrecall_scores\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprecision_scores\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mf1_scores\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mMultinomialNaiveBayes\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmake_mnb\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Label'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msklearn_cv\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[1;32m~\\BA\\Python\\src\\MultinomialNaiveBayes.py\u001b[0m in \u001b[0;36mmake_mnb\u001b[1;34m(dataset, sklearn_cv, percentile)\u001b[0m\n\u001b[0;32m 58\u001b[0m \u001b[1;31m# use sklearn CountVectorizer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 59\u001b[0m \u001b[1;31m# fit the training data and then return the matrix\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 60\u001b[1;33m \u001b[0mtraining_data\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 61\u001b[0m \u001b[1;31m# transform testing data and return the matrix\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 62\u001b[0m \u001b[0mtesting_data\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mtest\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py\u001b[0m in \u001b[0;36mfit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 1030\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1031\u001b[0m vocabulary, X = self._count_vocab(raw_documents,\n\u001b[1;32m-> 1032\u001b[1;33m self.fixed_vocabulary_)\n\u001b[0m\u001b[0;32m 1033\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1034\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbinary\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py\u001b[0m in \u001b[0;36m_count_vocab\u001b[1;34m(self, raw_documents, fixed_vocab)\u001b[0m\n\u001b[0;32m 940\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mdoc\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mraw_documents\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 941\u001b[0m \u001b[0mfeature_counter\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 942\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mfeature\u001b[0m \u001b[1;32min\u001b[0m \u001b[0manalyze\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 943\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 944\u001b[0m \u001b[0mfeature_idx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvocabulary\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mfeature\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py\u001b[0m in \u001b[0;36m\u001b[1;34m(doc)\u001b[0m\n\u001b[0;32m 326\u001b[0m tokenize)\n\u001b[0;32m 327\u001b[0m return lambda doc: self._word_ngrams(\n\u001b[1;32m--> 328\u001b[1;33m tokenize(preprocess(self.decode(doc))), stop_words)\n\u001b[0m\u001b[0;32m 329\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 330\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py\u001b[0m in \u001b[0;36mdecode\u001b[1;34m(self, doc)\u001b[0m\n\u001b[0;32m 141\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 142\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdoc\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 143\u001b[1;33m raise ValueError(\"np.nan is an invalid document, expected byte or \"\n\u001b[0m\u001b[0;32m 144\u001b[0m \"unicode string.\")\n\u001b[0;32m 145\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mValueError\u001b[0m: np.nan is an invalid document, expected byte or unicode string."
+ ]
+ }
+ ],
+ "source": [
+ "recall_scores, precision_scores, f1_scores = MultinomialNaiveBayes.make_mnb(df.loc[(df['Label'] != -1)], sklearn_cv=True)"
]
},
{
diff --git a/src/LabelingPlotter.py b/src/LabelingPlotter.py
index c86c2ba..48a7070 100644
--- a/src/LabelingPlotter.py
+++ b/src/LabelingPlotter.py
@@ -56,7 +56,7 @@ class LabelingPlotter():
def plot_cumulative():
# load pickle object
- with open('../obj/array_class_probs_round_11.pkl', 'rb') as input:
+ with open('../obj/array_class_probs_stratified_round_9.pkl', 'rb') as input:
list = pickle.load(input)
# sort list in descending order
@@ -86,11 +86,12 @@ class LabelingPlotter():
ax.set_xlabel('Highest estimated probability')
ax.set_ylabel('Fraction of articles with this highest estimated probability')
#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
+ plt.axis([0.5, 1, 0, 0.015]) # round 9 stratified
#plt.axis([0.65, 1, 0, 0.003]) # round 10
- plt.axis([0.7, 1, 0, 0.002]) # round 11
+ #plt.axis([0.7, 1, 0, 0.002]) # round 11
#ax.set_xbound(lower=0.5, upper=0.99)
- plt.savefig('..\\visualization\\proba_round_11.png')
- plt.savefig('..\\visualization\\proba_round_11.eps')
+ plt.savefig('..\\visualization\\proba_stratified_round_9.png')
+ plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
plt.show()
diff --git a/src/MultinomialNaiveBayes.py b/src/MultinomialNaiveBayes.py
index 942c25e..67709e7 100644
--- a/src/MultinomialNaiveBayes.py
+++ b/src/MultinomialNaiveBayes.py
@@ -25,7 +25,11 @@ class MultinomialNaiveBayes:
# split data into text and label set
# join title and text
X = dataset['Title'] + '. ' + dataset['Text']
+
+ print(X[:12])
+
y = dataset['Label']
+ print(y[:12])
if sklearn_cv:
cv = CountVectorizer()
@@ -57,6 +61,12 @@ class MultinomialNaiveBayes:
if sklearn_cv:
# use sklearn CountVectorizer
# fit the training data and then return the matrix
+            print('Title + Text of train')
+ print(X[train])
+
+            print('Label of train')
+ print(y[train])
+
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()
@@ -172,4 +182,24 @@ class MultinomialNaiveBayes:
print(y_test[i])
print()
#print metrics
- print('F1 score: ', format(f1_score(y_test, predictions)))
\ No newline at end of file
+ print('F1 score: ', format(f1_score(y_test, predictions)))
+
+if __name__ == '__main__':
+
+ # read csv file
+ print('# reading dataset')
+ print('# ...')
+
+ # read current data set from csv
+ df = pd.read_csv('../data/interactive_labeling_round_11.csv',
+ sep='|',
+ usecols=range(1,13), # drop first column 'unnamed'
+ encoding='utf-8',
+ quoting=csv.QUOTE_NONNUMERIC,
+ quotechar='\'')
+
+ # select only labeled articles
+    #print('Number of all labeled articles:')
+ #print(len(df.loc[df['Label'] != -1]))
+ #print(df.loc[df['Label'] != -1][:5])
+    # reset the index so positional fold indices line up with X and y (a bare .reindex() is a no-op)
+    MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=True, percentile=100)
\ No newline at end of file
diff --git a/visualization/proba_round_11.eps b/visualization/proba_round_11.eps
index 4552fe5..d194896 100644
--- a/visualization/proba_round_11.eps
+++ b/visualization/proba_round_11.eps
@@ -1,7 +1,7 @@
%!PS-Adobe-3.0 EPSF-3.0
%%Title: ..\visualization\proba_round_11.eps
%%Creator: matplotlib version 3.0.2, http://matplotlib.org/
-%%CreationDate: Thu Feb 21 14:11:04 2019
+%%CreationDate: Tue Mar 5 08:51:48 2019
%%Orientation: portrait
%%BoundingBox: 18 252 594 540
%%EndComments
diff --git a/visualization/proba_stratified_round_9.eps b/visualization/proba_stratified_round_9.eps
new file mode 100644
index 0000000..5c5517f
--- /dev/null
+++ b/visualization/proba_stratified_round_9.eps
@@ -0,0 +1,1528 @@
+%!PS-Adobe-3.0 EPSF-3.0
+%%Title: ..\visualization\proba_stratified_round_9.eps
+%%Creator: matplotlib version 3.0.2, http://matplotlib.org/
+%%CreationDate: Tue Mar 5 08:54:37 2019
+%%Orientation: portrait
+%%BoundingBox: 18 252 594 540
+%%EndComments
+%% [generated matplotlib PostScript content omitted: prolog, DejaVu Sans font definitions, and plot path data]
+0 setlinecap
+0.000 setgray
+gsave
+/o {
+gsave
+newpath
+translate
+0.8 setlinewidth
+1 setlinejoin
+0 setlinecap
+0 0 m
+0 -3.5 l
+
+gsave
+0.000 setgray
+fill
+grestore
+stroke
+grestore
+} bind def
+339.84 31.68 o
+grestore
+gsave
+331.886875 17.086250 translate
+0.000000 rotate
+0.000000 0.000000 m /zero glyphshow
+6.362305 0.000000 m /period glyphshow
+9.541016 0.000000 m /eight glyphshow
+grestore
+2 setlinecap
+0.690 setgray
+gsave
+446.4 221.8 72 31.68 clipbox
+429.12 31.68 m
+429.12 253.44 l
+stroke
+grestore
+0 setlinecap
+0.000 setgray
+gsave
+/o {
+gsave
+newpath
+translate
+0.8 setlinewidth
+1 setlinejoin
+0 setlinecap
+0 0 m
+0 -3.5 l
+
+gsave
+0.000 setgray
+fill
+grestore
+stroke
+grestore
+} bind def
+429.12 31.68 o
+grestore
+gsave
+421.166875 17.086250 translate
+0.000000 rotate
+0.000000 0.000000 m /zero glyphshow
+6.362305 0.000000 m /period glyphshow
+9.541016 0.000000 m /nine glyphshow
+grestore
+2 setlinecap
+0.690 setgray
+gsave
+446.4 221.8 72 31.68 clipbox
+518.4 31.68 m
+518.4 253.44 l
+stroke
+grestore
+0 setlinecap
+0.000 setgray
+gsave
+/o {
+gsave
+newpath
+translate
+0.8 setlinewidth
+1 setlinejoin
+0 setlinecap
+0 0 m
+0 -3.5 l
+
+gsave
+0.000 setgray
+fill
+grestore
+stroke
+grestore
+} bind def
+518.4 31.68 o
+grestore
+gsave
+510.446875 17.086250 translate
+0.000000 rotate
+0.000000 0.000000 m /one glyphshow
+6.362305 0.000000 m /period glyphshow
+9.541016 0.000000 m /zero glyphshow
+grestore
+gsave
+220.895312 3.414375 translate
+0.000000 rotate
+0.000000 0.000000 m /H glyphshow
+7.519531 0.000000 m /i glyphshow
+10.297852 0.000000 m /g glyphshow
+16.645508 0.000000 m /h glyphshow
+22.983398 0.000000 m /e glyphshow
+29.135742 0.000000 m /s glyphshow
+34.345703 0.000000 m /t glyphshow
+38.266602 0.000000 m /space glyphshow
+41.445312 0.000000 m /e glyphshow
+47.597656 0.000000 m /s glyphshow
+52.807617 0.000000 m /t glyphshow
+56.728516 0.000000 m /i glyphshow
+59.506836 0.000000 m /m glyphshow
+69.248047 0.000000 m /a glyphshow
+75.375977 0.000000 m /t glyphshow
+79.296875 0.000000 m /e glyphshow
+85.449219 0.000000 m /d glyphshow
+91.796875 0.000000 m /space glyphshow
+94.975586 0.000000 m /p glyphshow
+101.323242 0.000000 m /r glyphshow
+105.434570 0.000000 m /o glyphshow
+111.552734 0.000000 m /b glyphshow
+117.900391 0.000000 m /a glyphshow
+124.028320 0.000000 m /b glyphshow
+130.375977 0.000000 m /i glyphshow
+133.154297 0.000000 m /l glyphshow
+135.932617 0.000000 m /i glyphshow
+138.710938 0.000000 m /t glyphshow
+142.631836 0.000000 m /y glyphshow
+grestore
+2 setlinecap
+0.690 setgray
+gsave
+446.4 221.8 72 31.68 clipbox
+72 31.68 m
+518.4 31.68 l
+stroke
+grestore
+0 setlinecap
+0.000 setgray
+gsave
+/o {
+gsave
+newpath
+translate
+0.8 setlinewidth
+1 setlinejoin
+0 setlinecap
+0 0 m
+-3.5 0 l
+
+gsave
+0.000 setgray
+fill
+grestore
+stroke
+grestore
+} bind def
+72 31.68 o
+grestore
+gsave
+36.375000 27.883125 translate
+0.000000 rotate
+0.000000 0.000000 m /zero glyphshow
+6.362305 0.000000 m /period glyphshow
+9.541016 0.000000 m /zero glyphshow
+15.903320 0.000000 m /zero glyphshow
+22.265625 0.000000 m /zero glyphshow
+grestore
+2 setlinecap
+0.690 setgray
+gsave
+446.4 221.8 72 31.68 clipbox
+72 61.248 m
+518.4 61.248 l
+stroke
+grestore
+0 setlinecap
+0.000 setgray
+gsave
+/o {
+gsave
+newpath
+translate
+0.8 setlinewidth
+1 setlinejoin
+0 setlinecap
+0 0 m
+-3.5 0 l
+
+gsave
+0.000 setgray
+fill
+grestore
+stroke
+grestore
+} bind def
+72 61.248 o
+grestore
+gsave
+36.375000 57.451125 translate
+0.000000 rotate
+0.000000 0.000000 m /zero glyphshow
+6.362305 0.000000 m /period glyphshow
+9.541016 0.000000 m /zero glyphshow
+15.903320 0.000000 m /zero glyphshow
+22.265625 0.000000 m /two glyphshow
+grestore
+2 setlinecap
+0.690 setgray
+gsave
+446.4 221.8 72 31.68 clipbox
+72 90.816 m
+518.4 90.816 l
+stroke
+grestore
+0 setlinecap
+0.000 setgray
+gsave
+/o {
+gsave
+newpath
+translate
+0.8 setlinewidth
+1 setlinejoin
+0 setlinecap
+0 0 m
+-3.5 0 l
+
+gsave
+0.000 setgray
+fill
+grestore
+stroke
+grestore
+} bind def
+72 90.816 o
+grestore
+gsave
+36.375000 87.019125 translate
+0.000000 rotate
+0.000000 0.000000 m /zero glyphshow
+6.362305 0.000000 m /period glyphshow
+9.541016 0.000000 m /zero glyphshow
+15.903320 0.000000 m /zero glyphshow
+22.265625 0.000000 m /four glyphshow
+grestore
+2 setlinecap
+0.690 setgray
+gsave
+446.4 221.8 72 31.68 clipbox
+72 120.384 m
+518.4 120.384 l
+stroke
+grestore
+0 setlinecap
+0.000 setgray
+gsave
+/o {
+gsave
+newpath
+translate
+0.8 setlinewidth
+1 setlinejoin
+0 setlinecap
+0 0 m
+-3.5 0 l
+
+gsave
+0.000 setgray
+fill
+grestore
+stroke
+grestore
+} bind def
+72 120.384 o
+grestore
+gsave
+36.375000 116.587125 translate
+0.000000 rotate
+0.000000 0.000000 m /zero glyphshow
+6.362305 0.000000 m /period glyphshow
+9.541016 0.000000 m /zero glyphshow
+15.903320 0.000000 m /zero glyphshow
+22.265625 0.000000 m /six glyphshow
+grestore
+2 setlinecap
+0.690 setgray
+gsave
+446.4 221.8 72 31.68 clipbox
+72 149.952 m
+518.4 149.952 l
+stroke
+grestore
+0 setlinecap
+0.000 setgray
+gsave
+/o {
+gsave
+newpath
+translate
+0.8 setlinewidth
+1 setlinejoin
+0 setlinecap
+0 0 m
+-3.5 0 l
+
+gsave
+0.000 setgray
+fill
+grestore
+stroke
+grestore
+} bind def
+72 149.952 o
+grestore
+gsave
+36.375000 146.155125 translate
+0.000000 rotate
+0.000000 0.000000 m /zero glyphshow
+6.362305 0.000000 m /period glyphshow
+9.541016 0.000000 m /zero glyphshow
+15.903320 0.000000 m /zero glyphshow
+22.265625 0.000000 m /eight glyphshow
+grestore
+2 setlinecap
+0.690 setgray
+gsave
+446.4 221.8 72 31.68 clipbox
+72 179.52 m
+518.4 179.52 l
+stroke
+grestore
+0 setlinecap
+0.000 setgray
+gsave
+/o {
+gsave
+newpath
+translate
+0.8 setlinewidth
+1 setlinejoin
+0 setlinecap
+0 0 m
+-3.5 0 l
+
+gsave
+0.000 setgray
+fill
+grestore
+stroke
+grestore
+} bind def
+72 179.52 o
+grestore
+gsave
+36.375000 175.723125 translate
+0.000000 rotate
+0.000000 0.000000 m /zero glyphshow
+6.362305 0.000000 m /period glyphshow
+9.541016 0.000000 m /zero glyphshow
+15.903320 0.000000 m /one glyphshow
+22.265625 0.000000 m /zero glyphshow
+grestore
+2 setlinecap
+0.690 setgray
+gsave
+446.4 221.8 72 31.68 clipbox
+72 209.088 m
+518.4 209.088 l
+stroke
+grestore
+0 setlinecap
+0.000 setgray
+gsave
+/o {
+gsave
+newpath
+translate
+0.8 setlinewidth
+1 setlinejoin
+0 setlinecap
+0 0 m
+-3.5 0 l
+
+gsave
+0.000 setgray
+fill
+grestore
+stroke
+grestore
+} bind def
+72 209.088 o
+grestore
+gsave
+36.375000 205.291125 translate
+0.000000 rotate
+0.000000 0.000000 m /zero glyphshow
+6.362305 0.000000 m /period glyphshow
+9.541016 0.000000 m /zero glyphshow
+15.903320 0.000000 m /one glyphshow
+22.265625 0.000000 m /two glyphshow
+grestore
+2 setlinecap
+0.690 setgray
+gsave
+446.4 221.8 72 31.68 clipbox
+72 238.656 m
+518.4 238.656 l
+stroke
+grestore
+0 setlinecap
+0.000 setgray
+gsave
+/o {
+gsave
+newpath
+translate
+0.8 setlinewidth
+1 setlinejoin
+0 setlinecap
+0 0 m
+-3.5 0 l
+
+gsave
+0.000 setgray
+fill
+grestore
+stroke
+grestore
+} bind def
+72 238.656 o
+grestore
+gsave
+36.375000 234.859125 translate
+0.000000 rotate
+0.000000 0.000000 m /zero glyphshow
+6.362305 0.000000 m /period glyphshow
+9.541016 0.000000 m /zero glyphshow
+15.903320 0.000000 m /one glyphshow
+22.265625 0.000000 m /four glyphshow
+grestore
+gsave
+30.296875 -2.322812 translate
+90.000000 rotate
+0.000000 0.000000 m /F glyphshow
+5.751953 0.000000 m /r glyphshow
+9.863281 0.000000 m /a glyphshow
+15.991211 0.000000 m /c glyphshow
+21.489258 0.000000 m /t glyphshow
+25.410156 0.000000 m /i glyphshow
+28.188477 0.000000 m /o glyphshow
+34.306641 0.000000 m /n glyphshow
+40.644531 0.000000 m /space glyphshow
+43.823242 0.000000 m /o glyphshow
+49.941406 0.000000 m /f glyphshow
+53.461914 0.000000 m /space glyphshow
+56.640625 0.000000 m /a glyphshow
+62.768555 0.000000 m /r glyphshow
+66.879883 0.000000 m /t glyphshow
+70.800781 0.000000 m /i glyphshow
+73.579102 0.000000 m /c glyphshow
+79.077148 0.000000 m /l glyphshow
+81.855469 0.000000 m /e glyphshow
+88.007812 0.000000 m /s glyphshow
+93.217773 0.000000 m /space glyphshow
+96.396484 0.000000 m /w glyphshow
+104.575195 0.000000 m /i glyphshow
+107.353516 0.000000 m /t glyphshow
+111.274414 0.000000 m /h glyphshow
+117.612305 0.000000 m /space glyphshow
+120.791016 0.000000 m /t glyphshow
+124.711914 0.000000 m /h glyphshow
+131.049805 0.000000 m /i glyphshow
+133.828125 0.000000 m /s glyphshow
+139.038086 0.000000 m /space glyphshow
+142.216797 0.000000 m /h glyphshow
+148.554688 0.000000 m /i glyphshow
+151.333008 0.000000 m /g glyphshow
+157.680664 0.000000 m /h glyphshow
+164.018555 0.000000 m /e glyphshow
+170.170898 0.000000 m /s glyphshow
+175.380859 0.000000 m /t glyphshow
+179.301758 0.000000 m /space glyphshow
+182.480469 0.000000 m /e glyphshow
+188.632812 0.000000 m /s glyphshow
+193.842773 0.000000 m /t glyphshow
+197.763672 0.000000 m /i glyphshow
+200.541992 0.000000 m /m glyphshow
+210.283203 0.000000 m /a glyphshow
+216.411133 0.000000 m /t glyphshow
+220.332031 0.000000 m /e glyphshow
+226.484375 0.000000 m /d glyphshow
+232.832031 0.000000 m /space glyphshow
+236.010742 0.000000 m /p glyphshow
+242.358398 0.000000 m /r glyphshow
+246.469727 0.000000 m /o glyphshow
+252.587891 0.000000 m /b glyphshow
+258.935547 0.000000 m /a glyphshow
+265.063477 0.000000 m /b glyphshow
+271.411133 0.000000 m /i glyphshow
+274.189453 0.000000 m /l glyphshow
+276.967773 0.000000 m /i glyphshow
+279.746094 0.000000 m /t glyphshow
+283.666992 0.000000 m /y glyphshow
+grestore
+0 setlinejoin
+2 setlinecap
+[] 0 setdash
+gsave
+72 31.68 m
+72 253.44 l
+stroke
+grestore
+gsave
+518.4 31.68 m
+518.4 253.44 l
+stroke
+grestore
+gsave
+72 31.68 m
+518.4 31.68 l
+stroke
+grestore
+gsave
+72 253.44 m
+518.4 253.44 l
+stroke
+grestore
+
+end
+showpage
diff --git a/visualization/proba_stratified_round_9.png b/visualization/proba_stratified_round_9.png
new file mode 100644
index 0000000..584993f
Binary files /dev/null and b/visualization/proba_stratified_round_9.png differ
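
Note: the diff above only contains the rendered PostScript/PNG output of the figure, not the plotting code that produced it. The snippet below is a rough, hedged sketch of how a plot with these axis labels could be generated; the probability array (here a random placeholder), the bin count, and the figure size are assumptions, not the notebook's actual code or values.

# Hedged sketch (assumptions, not the notebook's code): distribution of the classifier's
# highest estimated class probability per article, matching the axis labels in the
# figure added above.
import matplotlib.pyplot as plt
import numpy as np

# Assumption: `probs` holds predicted class probabilities with shape (n_articles, n_classes),
# e.g. the output of clf.predict_proba(X). A random placeholder stands in for it here.
rng = np.random.default_rng(0)
probs = rng.dirichlet(alpha=[1.0, 1.0, 1.0], size=10_000)

highest = probs.max(axis=1)                           # highest estimated probability per article
weights = np.full_like(highest, 1.0 / len(highest))   # so bar heights are fractions of articles

fig, ax = plt.subplots(figsize=(8, 4))                # 8 x 4 in matches the 576 x 288 pt bounding box
ax.hist(highest, bins=50, weights=weights, histtype='step')
ax.set_xlabel('Highest estimated probability')
ax.set_ylabel('Fraction of articles with this highest estimated probability')
fig.savefig('visualization/proba_stratified_round_9.png')  # filename taken from the diff above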