thesis-anne/src/working notebooks/2019-03-12-al-model-evaluat...

2075 lines
75 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import operator\n",
"import pickle\n",
"import random\n",
"\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"from IPython.core.interactiveshell import InteractiveShell\n",
"from IPython.display import display\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.svm import SVC\n",
"from sklearn.svm import LinearSVC\n",
"\n",
"from BagOfWords import BagOfWords\n",
"from MNBInteractive import MNBInteractive\n",
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
"from NaiveBayes import NaiveBayes"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# initialize random => reproducible sequence\n",
"random.seed(5)\n",
"\n",
"# set up wider display area\n",
"# use None (not the deprecated -1 sentinel) to disable column width truncation\n",
"pd.set_option('display.max_colwidth', None)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Last round number: 11\n",
"Number of manually labeled articles: 1082\n",
"Number of manually unlabeled articles: 8918\n"
]
}
],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_round_11.csv',\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')\n",
"\n",
"# find current iteration/round number\n",
"m = int(df['Round'].max())\n",
"print('Last round number: {}'.format(m))\n",
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def show_next(index):\n",
" ''' this method displays an article's text and an interactive slider to set its label manually\n",
" '''\n",
" print('News article no. {}:'.format(index))\n",
" print()\n",
" print('HEADLINE:')\n",
" print(df.loc[df['Index'] == index, 'Title'])\n",
" print()\n",
" print('TEXT:')\n",
" print(df.loc[df['Index'] == index, 'Text'])\n",
" print()\n",
" print('ESTIMATED_0:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_0'])\n",
" print()\n",
" print('ESTIMATED_1:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_1'])\n",
" print()\n",
" print('ESTIMATED_2:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_2'])\n",
" \n",
" def f(x):\n",
" # save user input\n",
" df.loc[df['Index'] == index, 'Label'] = x\n",
"\n",
" # create slider widget for labels\n",
" interact(f, x = widgets.IntSlider(min=-1, max=2, step=1, value=df.loc[df['Index'] == index, 'Label']))\n",
" print('0: Other/Unrelated news, 1: Merger,')\n",
" print('2: Topics related to deals, investments and mergers')\n",
" print('___________________________________________________________________________________________________________')\n",
" print()\n",
" print()\n",
"\n",
"# list of article indices that will be shown next\n",
"label_next = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How to find a better model:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A) Multinomial Naive Bayes Algorithm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"B) Multinomial Naive Bayes with bigram"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), bigram=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use my own BOW implementation:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=False, bigram=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"min(recall_scores)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"max(recall_scores)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"min(precision_scores)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"max(precision_scores)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO: not working yet\n",
"\n",
"# series of indices of recently estimated articles \n",
"indices_estimated = df.loc[df['Label'] != -1, 'Index'].tolist()\n",
"\n",
"# annotate probability\n",
"n = 0\n",
"for row in class_probs[0]:\n",
" index = indices_estimated[n]\n",
" # save estimated label\n",
" df.loc[index, 'Estimated_2'] = row[1]\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
"# average over however many folds were actually run (was hard-coded /10)\n",
"print(\"Recall (Average): {}\".format(sum(recall_scores)/len(recall_scores)))\n",
"print()\n",
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
"print(\"Precision (Average): {}\".format(sum(precision_scores)/len(precision_scores)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('confusion matrix:')\n",
"print('###############')\n",
"zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n",
"zero_0\n",
"zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])\n",
"zero_1\n",
"zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])\n",
"zero_2\n",
"print('/')\n",
"one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])\n",
"one_0\n",
"one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])\n",
"one_1\n",
"one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])\n",
"one_2\n",
"print('/')\n",
"\n",
"two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])\n",
"two_0\n",
"two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n",
"two_1\n",
"two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n",
"two_2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Building three separate models:\n",
"\n",
"B) One model per class: does it work better with 3 models?\n",
"Rationale: we are interested in class 1.\n",
"Build one model per class (the goal here is to find the best model; for this we use the 1082 labeled samples).\n",
"3 models => result for 1 sample: (70%, 40%, 80%) unclear => review\n",
"=> (90%, 90%, 90%) => review\n",
"this yields better ambiguity samples than the approach above\n",
"Stratified sample: (50 + 50 (1/2 from one other class, 1/2 from the third class))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"847"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_0)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"50"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"185"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_2)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# add three new columns for the three models, initialize with nans\n",
"df['Estimated_0'] = np.nan\n",
"df['Estimated_1'] = np.nan\n",
"df['Estimated_2'] = np.nan"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0_0 = labeled_pos_0.sample(n=100, random_state=5)\n",
"sampling_class0_1 = labeled_pos_1.sample(n=50, random_state=5)\n",
"sampling_class0_2 = labeled_pos_2.sample(n=50, random_state=5)\n",
"\n",
"sampling_class1_0 = labeled_pos_0.sample(n=25, random_state=5)\n",
"sampling_class1_1 = sampling_class0_1\n",
"sampling_class1_2 = labeled_pos_2.sample(n=25, random_state=5)\n",
"\n",
"sampling_class2_0 = labeled_pos_0.sample(n=50, random_state=5)\n",
"sampling_class2_1 = sampling_class0_1\n",
"sampling_class2_2 = labeled_pos_2.sample(n=100, random_state=5)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0_complement = pd.concat([sampling_class0_1, sampling_class0_2])\n",
"sampling_class1_complement = pd.concat([sampling_class1_0, sampling_class1_2])\n",
"sampling_class2_complement = pd.concat([sampling_class2_0, sampling_class2_1])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# prepare for binary classification:\n",
"# pos_label = 3\n",
"sampling_class0_0['Label'] = 3\n",
"sampling_class1_1['Label'] = 3\n",
"sampling_class2_2['Label'] = 3\n",
"# neg_label = 4\n",
"sampling_class0_complement['Label'] = 4\n",
"sampling_class1_complement['Label'] = 4\n",
"sampling_class2_complement['Label'] = 4"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0 = pd.concat([sampling_class0_0, sampling_class0_complement]).reset_index(drop=True)\n",
"sampling_class1 = pd.concat([sampling_class1_1, sampling_class1_complement]).reset_index(drop=True)\n",
"sampling_class2 = pd.concat([sampling_class2_2, sampling_class2_complement]).reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Apply Algorithm to estimate all labeled articles (1082 samples):"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data = sampling_class2\n",
"indices_train = train_data['Index'].tolist()\n",
"len(indices_train)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1082"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#test_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_train))].reset_index(drop=True)\n",
"test_data = df.loc[(df['Label'] != -1)].reset_index(drop=True)\n",
"len(test_data)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"test_data.loc[(test_data['Label'] == 0), 'Label'] = 3\n",
"test_data.loc[(test_data['Label'] == 1) | (test_data['Label'] == 2), 'Label'] = 4"
]
},
{
"cell_type": "code",
"execution_count": 219,
"metadata": {},
"outputs": [],
"source": [
"# only for checking whether the 3-model approach is CLEARLY more correct than the normal one\n",
"train_data = df.loc[(df['Label'] != -1)].reset_index(drop=True)\n",
"test_data = df.loc[(df['Label'] != -1)].reset_index(drop=True)\n",
"classifier = LinearSVC()"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {},
"outputs": [],
"source": [
"# only for checking whether the normal model is 'more correct'\n",
"train_data = df.loc[df['Index'].isin(subset_indices)].reset_index(drop=True)\n",
"test_data = df.loc[df['Index'].isin(subset_indices)].reset_index(drop=True)\n",
"classifier = LinearSVC()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# predictions for the entire dataset (10,000):\n",
"train_data = df.loc[df['Label'] != -1].reset_index(drop=True)\n",
"X = train_data['Title'] + '. ' + train_data['Text']\n",
"y = train_data['Label']\n",
"U = test_data['Title'] + '. ' + test_data['Text']"
]
},
{
"cell_type": "code",
"execution_count": 220,
"metadata": {},
"outputs": [],
"source": [
"classifier = LinearSVC()\n",
"cv = CountVectorizer()\n",
"# fit the training data and then return the matrix\n",
"training_data = cv.fit_transform(X, y).toarray()\n",
"testing_data = cv.transform(U).toarray()\n",
"#fit classifier\n",
"classifier.fit(training_data, y)\n",
"\n",
"#predict class\n",
"predictions_test = classifier.predict(testing_data)"
]
},
{
"cell_type": "code",
"execution_count": 234,
"metadata": {},
"outputs": [],
"source": [
"# make prediction for the whole dataset\n",
"len(predictions_test)\n",
"indices_predicted = df.loc[df['Label'] != -1, 'Index'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 237,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[16.0, 17.0, 29.0, 33.0, 50.0, 57.0, 58.0, 64.0, 65.0, 88.0, 91.0, 98.0, 151.0, 162.0, 163.0, 167.0, 198.0, 213.0, 220.0, 230.0, 232.0, 247.0, 254.0, 279.0, 300.0, 307.0, 325.0, 328.0, 331.0, 356.0, 364.0, 379.0, 383.0, 388.0, 411.0, 417.0, 459.0, 470.0, 507.0, 522.0, 530.0, 558.0, 566.0, 573.0, 580.0, 583.0, 586.0, 590.0, 599.0, 613.0, 620.0, 634.0, 637.0, 654.0, 663.0, 688.0, 763.0, 789.0, 821.0, 824.0, 842.0, 866.0, 904.0, 920.0, 922.0, 925.0, 940.0, 958.0, 970.0, 978.0, 987.0, 998.0, 1001.0, 1017.0, 1029.0, 1089.0, 1095.0, 1111.0, 1120.0, 1142.0, 1146.0, 1161.0, 1164.0, 1177.0, 1188.0, 1255.0, 1291.0, 1321.0, 1343.0, 1369.0, 1376.0, 1394.0, 1422.0, 1475.0, 1482.0, 1524.0, 1536.0, 1548.0, 1568.0, 1578.0, 1579.0, 1586.0, 1596.0, 1612.0, 1620.0, 1672.0, 1700.0, 1760.0, 1762.0, 1770.0, 1779.0, 1815.0, 1836.0, 1856.0, 1858.0, 1859.0, 1871.0, 1879.0, 1886.0, 1887.0, 1903.0, 1910.0, 1914.0, 1931.0, 1954.0, 1960.0, 1967.0, 1998.0, 1999.0, 2075.0, 2081.0, 2105.0, 2114.0, 2117.0, 2156.0, 2173.0, 2202.0, 2203.0, 2241.0, 2254.0, 2261.0, 2265.0, 2279.0, 2298.0, 2309.0, 2314.0, 2315.0, 2316.0, 2343.0, 2345.0, 2362.0, 2388.0, 2399.0, 2402.0, 2412.0, 2417.0, 2419.0, 2435.0, 2441.0, 2447.0, 2451.0, 2465.0, 2480.0, 2482.0, 2495.0, 2498.0, 2512.0, 2526.0, 2548.0, 2564.0, 2571.0, 2585.0, 2586.0, 2597.0, 2612.0, 2618.0, 2621.0, 2637.0, 2651.0, 2659.0, 2667.0, 2670.0, 2673.0, 2675.0, 2718.0, 2723.0, 2730.0, 2734.0, 2741.0, 2773.0, 2782.0, 2786.0, 2789.0, 2793.0, 2817.0, 2826.0, 2861.0, 2877.0, 2906.0, 2911.0, 2917.0, 2918.0, 2932.0, 2954.0, 2957.0, 2966.0, 2988.0, 2990.0, 2993.0, 3001.0, 3005.0, 3006.0, 3031.0, 3041.0, 3042.0, 3053.0, 3055.0, 3056.0, 3070.0, 3087.0, 3091.0, 3101.0, 3103.0, 3129.0, 3130.0, 3140.0, 3186.0, 3188.0, 3195.0, 3223.0, 3234.0, 3237.0, 3249.0, 3251.0, 3254.0, 3256.0, 3274.0, 3293.0, 3318.0, 3340.0, 3342.0, 3372.0, 3389.0, 3398.0, 3445.0, 3470.0, 3501.0, 3513.0, 3517.0, 3527.0, 3619.0, 3631.0, 3682.0, 3700.0, 3702.0, 3718.0, 3731.0, 3738.0, 3741.0, 3778.0, 
3780.0, 3793.0, 3834.0, 3860.0, 3862.0, 3866.0, 3886.0, 3901.0, 3937.0, 3968.0, 4013.0, 4015.0, 4028.0, 4043.0, 4069.0, 4081.0, 4090.0, 4122.0, 4153.0, 4161.0, 4182.0, 4185.0, 4233.0, 4235.0, 4237.0, 4250.0, 4268.0, 4388.0, 4411.0, 4423.0, 4430.0, 4439.0, 4443.0, 4444.0, 4467.0, 4474.0, 4492.0, 4520.0, 4536.0, 4559.0, 4562.0, 4566.0, 4571.0, 4596.0, 4597.0, 4607.0, 4660.0, 4682.0, 4702.0, 4716.0, 4742.0, 4743.0, 4763.0, 4819.0, 4820.0, 4834.0, 4842.0, 4845.0, 4851.0, 4857.0, 4874.0, 4886.0, 4896.0, 4901.0, 4907.0, 4912.0, 4913.0, 4923.0, 4924.0, 4938.0, 4940.0, 4961.0, 4962.0, 4966.0, 4978.0, 4981.0, 4994.0, 4996.0, 4999.0, 5001.0, 5022.0, 5037.0, 5038.0, 5052.0, 5066.0, 5071.0, 5074.0, 5096.0, 5112.0, 5116.0, 5130.0, 5154.0, 5155.0, 5159.0, 5161.0, 5163.0, 5177.0, 5179.0, 5184.0, 5200.0, 5221.0, 5222.0, 5224.0, 5226.0, 5227.0, 5246.0, 5249.0, 5257.0, 5268.0, 5285.0, 5299.0, 5300.0, 5377.0, 5385.0, 5397.0, 5427.0, 5430.0, 5464.0, 5473.0, 5496.0, 5497.0, 5550.0, 5575.0, 5583.0, 5584.0, 5592.0, 5595.0, 5599.0, 5635.0, 5642.0, 5661.0, 5664.0, 5688.0, 5698.0, 5699.0, 5707.0, 5708.0, 5755.0, 5772.0, 5791.0, 5796.0, 5808.0, 5819.0, 5831.0, 5847.0, 5864.0, 5865.0, 5870.0, 5894.0, 5910.0, 5931.0, 5932.0, 5950.0, 5957.0, 5973.0, 6000.0, 6002.0, 6004.0, 6008.0, 6018.0, 6040.0, 6047.0, 6054.0, 6056.0, 6059.0, 6060.0, 6064.0, 6079.0, 6080.0, 6081.0, 6094.0, 6098.0, 6099.0, 6141.0, 6158.0, 6164.0, 6179.0, 6186.0, 6192.0, 6193.0, 6198.0, 6205.0, 6222.0, 6233.0, 6270.0, 6283.0, 6297.0, 6311.0, 6315.0, 6335.0, 6352.0, 6360.0, 6378.0, 6385.0, 6396.0, 6416.0, 6421.0, 6424.0, 6450.0, 6468.0, 6473.0, 6495.0, 6503.0, 6516.0, 6520.0, 6527.0, 6535.0, 6552.0, 6585.0, 6596.0, 6598.0, 6637.0, 6649.0, 6662.0, 6677.0, 6680.0, 6700.0, 6718.0, 6729.0, 6738.0, 6768.0, 6782.0, 6786.0, 6799.0, 6808.0, 6819.0, 6823.0, 6827.0, 6846.0, 6864.0, 6869.0, 6880.0, 6884.0, 6888.0, 6956.0, 6974.0, 6981.0, 6998.0, 7004.0, 7039.0, 7043.0, 7072.0, 7118.0, 7140.0, 7154.0, 7157.0, 7188.0, 7262.0, 7270.0, 7280.0, 
7288.0, 7309.0, 7347.0, 7351.0, 7370.0, 7371.0, 7376.0, 7395.0, 7415.0, 7439.0, 7447.0, 7460.0, 7485.0, 7488.0, 7503.0, 7506.0, 7515.0, 7538.0, 7562.0, 7574.0, 7588.0, 7596.0, 7650.0, 7660.0, 7694.0, 7705.0, 7708.0, 7723.0, 7761.0, 7799.0, 7827.0, 7842.0, 7857.0, 7871.0, 7892.0, 7893.0, 7905.0, 7914.0, 7931.0, 7942.0, 7968.0, 7969.0, 7986.0, 8016.0, 8020.0, 8023.0, 8029.0, 8037.0, 8066.0, 8070.0, 8081.0, 8087.0, 8112.0, 8123.0, 8126.0, 8140.0, 8142.0, 8152.0, 8157.0, 8177.0, 8180.0, 8192.0, 8195.0, 8197.0, 8215.0, 8233.0, 8250.0, 8263.0, 8278.0, 8304.0, 8307.0, 8317.0, 8352.0, 8355.0, 8380.0, 8381.0, 8393.0, 8445.0, 8452.0, 8475.0, 8490.0, 8495.0, 8504.0, 8510.0, 8547.0, 8551.0, 8583.0, 8610.0, 8615.0, 8650.0, 8682.0, 8707.0, 8728.0, 8777.0, 8784.0, 8790.0, 8818.0, 8825.0, 8835.0, 8837.0, 8845.0, 8850.0, 8853.0, 8857.0, 8859.0, 8870.0, 8871.0, 8891.0, 8898.0, 8921.0, 8926.0, 8929.0, 8952.0, 9021.0, 9027.0, 9047.0, 9109.0, 9139.0, 9141.0, 9144.0, 9187.0, 9201.0, 9215.0, 9218.0, 9233.0, 9234.0, 9243.0, 9253.0, 9265.0, 9312.0, 9319.0, 9360.0, 9362.0, 9375.0, 9377.0, 9380.0, 9392.0, 9395.0, 9414.0, 9417.0, 9418.0, 9422.0, 9423.0, 9446.0, 9472.0, 9485.0, 9491.0, 9498.0, 9511.0, 9544.0, 9549.0, 9553.0, 9555.0, 9559.0, 9606.0, 9610.0, 9622.0, 9636.0, 9642.0, 9656.0, 9674.0, 9691.0, 9720.0, 9736.0, 9746.0, 9753.0, 9768.0, 9789.0, 9799.0, 9854.0, 9885.0, 9908.0, 9919.0, 9927.0, 9934.0, 9953.0, 9962.0, 9963.0, 9984.0, 9986.0]\n"
]
}
],
"source": [
"print(subset_indices)\n"
]
},
{
"cell_type": "code",
"execution_count": 228,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2. 0. 0. ... 0. 0. 0.]\n",
"dict_keys([2.0, 0.0, 1.0])\n",
"dict_values([1108, 7619, 191])\n"
]
}
],
"source": [
"from collections import Counter\n",
"\n",
"print(Counter(predictions_test).keys()) # equals to list(set(words))\n",
"print(Counter(predictions_test).values()) # counts the elements' frequency"
]
},
{
"cell_type": "code",
"execution_count": 235,
"metadata": {},
"outputs": [],
"source": [
"df['Estimated'] = np.nan\n",
"for i, value in enumerate(indices_predicted):\n",
" df.loc[(df['Index'] == value), 'Estimated'] = predictions_test[i]"
]
},
{
"cell_type": "code",
"execution_count": 209,
"metadata": {},
"outputs": [],
"source": [
"# split training data into text and label set\n",
"# join title and text\n",
"X = train_data['Title'] + '. ' + train_data['Text']\n",
"y = train_data['Label']"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {},
"outputs": [],
"source": [
"# split testing data into text and label set\n",
"U = test_data['Title'] + '. ' + test_data['Text']\n",
"v = test_data['Label']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With CountVectorizer / own BOW:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"classifier = GaussianNB()\n",
"#classifier = SVC(probability = True,\n",
"# gamma = 'auto')"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"cv = CountVectorizer()\n",
"\n",
"# probabilities of each class\n",
"class_probs = []\n",
"\n",
"# use sklearn CountVectorizer\n",
"# fit the training data and then return the matrix\n",
"training_data = cv.fit_transform(X, y).toarray()\n",
"# transform testing data and return the matrix\n",
"testing_data = cv.transform(U).toarray()\n",
"\n",
"# use my BOW\n",
"#extracted_words = BagOfWords.extract_all_words(X)\n",
"#vocab = BagOfWords.make_vocab(extracted_words)\n",
"\n",
"# fit the training data and then return the matrix\n",
"#training_data = BagOfWords.make_matrix(extracted_words,\n",
"# vocab)\n",
"\n",
"# transform testing data and return the matrix\n",
"#extracted_words = BagOfWords.extract_all_words(U)\n",
"#testing_data = BagOfWords.make_matrix(extracted_words,\n",
"# vocab)\n",
"\n",
"#fit classifier\n",
"classifier.fit(training_data, y)\n",
"\n",
"#predict class\n",
"predictions_test = classifier.predict(testing_data)\n",
"\n",
"class_probs = classifier.predict_proba(testing_data)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0., 1.],\n",
" [0., 1.],\n",
" [0., 1.],\n",
" ...,\n",
" [1., 0.],\n",
" [0., 1.],\n",
" [0., 1.]])"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier.predict_proba(testing_data)"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"confusion matrix:\n",
"###############\n",
"465\n",
"0\n",
"10\n",
"/\n",
"57\n",
"50\n",
"23\n",
"/\n",
"36\n",
"0\n",
"49\n"
]
}
],
"source": [
"print('confusion matrix:')\n",
"print('###############')\n",
"zero_0 = len(test_data.loc[(test_data['NewLabel'] == 0) & (test_data['Label'] == 0)])\n",
"print(zero_0)\n",
"zero_1 = len(test_data.loc[(test_data['NewLabel'] == 0) & (test_data['Label'] == 1)])\n",
"print(zero_1)\n",
"zero_2 = len(test_data.loc[(test_data['NewLabel'] == 0) & (test_data['Label'] == 2)])\n",
"print(zero_2)\n",
"print('/')\n",
"one_0 = len(test_data.loc[(test_data['NewLabel'] == 1) & (test_data['Label'] == 0)])\n",
"print(one_0)\n",
"one_1 = len(test_data.loc[(test_data['NewLabel'] == 1) & (test_data['Label'] == 1)])\n",
"print(one_1)\n",
"one_2 = len(test_data.loc[(test_data['NewLabel'] == 1) & (test_data['Label'] == 2)])\n",
"print(one_2)\n",
"print('/')\n",
"\n",
"two_0 = len(test_data.loc[(test_data['NewLabel'] == 2) & (test_data['Label'] == 0)])\n",
"print(two_0)\n",
"two_1 = len(test_data.loc[(test_data['NewLabel'] == 2) & (test_data['Label'] == 1)])\n",
"print(two_1)\n",
"two_2 = len(test_data.loc[(test_data['NewLabel'] == 2) & (test_data['Label'] == 2)])\n",
"print(two_2)"
]
},
{
"cell_type": "code",
"execution_count": 236,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n",
"742\n",
"46\n",
"164\n",
"###############\n",
"14\n",
"1\n",
"4\n",
"###############\n",
"91\n",
"3\n",
"17\n",
"###############\n",
"metrics:\n",
"\n",
"742\n",
"25\n",
"210\n",
"105\n",
"###############\n",
"1\n",
"1014\n",
"18\n",
"49\n",
"###############\n",
"17\n",
"803\n",
"94\n",
"168\n",
"###############\n",
"77.94117647058823\n",
"87.60330578512396\n",
"70.88724584103512\n",
"###############\n",
"5.263157894736842\n",
"2.0\n",
"93.80776340110906\n",
"###############\n",
"15.315315315315313\n",
"9.18918918918919\n",
"75.78558225508318\n",
"###############\n",
"32.839883226880126\n",
"32.93083165810439\n",
"80.16019716574245\n"
]
}
],
"source": [
"print('###############')\n",
"zero_0 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 0)])\n",
"print(zero_0)\n",
"zero_1 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 1)])\n",
"print(zero_1)\n",
"zero_2 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 2)])\n",
"print(zero_2)\n",
"print('###############')\n",
"one_0 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 0)])\n",
"print(one_0)\n",
"one_1 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 1)])\n",
"print(one_1)\n",
"one_2 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 2)])\n",
"print(one_2)\n",
"print('###############')\n",
"\n",
"two_0 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 0)])\n",
"print(two_0)\n",
"two_1 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 1)])\n",
"print(two_1)\n",
"two_2 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 2)])\n",
"print(two_2)\n",
"print('###############')\n",
"print('metrics:')\n",
"print()\n",
"\n",
"total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n",
"\n",
"tp_0 = zero_0\n",
"print(tp_0)\n",
"tn_0 = one_1 + one_2 + two_1 + two_2\n",
"print(tn_0)\n",
"fp_0 = zero_1 + zero_2\n",
"print(fp_0)\n",
"fn_0 = one_0 + two_0\n",
"print(fn_0)\n",
"print('###############')\n",
"\n",
"tp_1 = one_1\n",
"print(tp_1)\n",
"tn_1 = zero_0 + zero_2 + two_0 + two_2\n",
"print(tn_1)\n",
"fp_1 = one_0 + one_2\n",
"print(fp_1)\n",
"fn_1 = zero_1 + two_1\n",
"print(fn_1)\n",
"print('###############')\n",
"\n",
"tp_2 = two_2\n",
"print(tp_2)\n",
"tn_2 = zero_0 + zero_1 + one_0 + one_1\n",
"print(tn_2)\n",
"fp_2 = two_0 + two_1\n",
"print(fp_2)\n",
"fn_2 = zero_2 + one_2\n",
"print(fn_2)\n",
"print('###############')\n",
"\n",
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
"print(prec_0)\n",
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
"print(rec_0)\n",
"acc_0 = (tp_0 + tn_0) / total * 100\n",
"print(acc_0)\n",
"print('###############')\n",
"\n",
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
"print(prec_1)\n",
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
"print(rec_1)\n",
"acc_1 = (tp_1 + tn_1) / total * 100\n",
"print(acc_1)\n",
"print('###############')\n",
"\n",
"prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
"print(prec_2)\n",
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
"print(rec_2)\n",
"acc_2 = (tp_2 + tn_2) / total * 100\n",
"print(acc_2)\n",
"print('###############')\n",
"\n",
"print((prec_1 + prec_2 + prec_0) / 3)\n",
"print((rec_1 + rec_2 + rec_0) / 3)\n",
"print((acc_1 + acc_2 + acc_0) / 3)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"recall: 0.18772136953955135\n",
"precision: 0.5335570469798657\n"
]
}
],
"source": [
"#print and store metrics\n",
"rec = recall_score(v, predictions_test, pos_label=3)\n",
"print('recall: ' + str(rec))\n",
"prec = precision_score(v, predictions_test, pos_label=3)\n",
"print('precision: ' + str(prec))"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [],
"source": [
"len(df.loc[(df['NewLabel'] == 0) | (df['NewLabel'] == 1) | (df['NewLabel'] == 2)])\n",
"subset_indices = df.loc[((df['NewLabel'] == 0) | (df['NewLabel'] == 1) | (df['NewLabel'] == 2)), 'Index'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"690"
]
},
"execution_count": 183,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(subset_indices)"
]
},
{
"cell_type": "code",
"execution_count": 205,
"metadata": {},
"outputs": [],
"source": [
"#df.loc[(df['Label'] == 1), ['Index', 'Title', 'Text']][:10]\n",
"#df.loc[3860]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Annotate Labels:"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"# series of indices of recently estimated articles \n",
"indices_estimated_2 = test_data['Index'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# annotate probability\n",
"n = 0\n",
"for row in class_probs:\n",
" index = indices_estimated_0[n]\n",
" # save estimated label\n",
" df.loc[index, 'Estimated_0'] = row[0]\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
"df = df.rename(columns={'Estimated_0': 'Model_0', 'Estimated_1': 'Model_1', 'Estimated_2': 'Model_2'})"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"df = df.round(2)"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Model_0</th>\n",
" <th>Model_1</th>\n",
" <th>Model_2</th>\n",
" <th>Label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Model_0 Model_1 Model_2 Label\n",
"0 1.0 1.0 0.0 0.0 \n",
"1 1.0 0.0 0.0 0.0 \n",
"2 0.0 1.0 0.0 2.0 \n",
"3 1.0 0.0 0.0 0.0 \n",
"4 1.0 1.0 0.0 0.0 \n",
"5 0.0 1.0 0.0 0.0 \n",
"6 1.0 0.0 0.0 0.0 \n",
"7 1.0 0.0 1.0 0.0 \n",
"8 1.0 0.0 0.0 0.0 \n",
"9 1.0 0.0 0.0 0.0 "
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[(df['Label'] != -1), ['Model_0', 'Model_1', 'Model_2', 'Label']][:10].reset_index(drop=True)#.to_latex()"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\\\begin{tabular}{lrrr}\\n\\\\toprule\\n{} & Model\\\\_0 & Model\\\\_1 & Model\\\\_2 \\\\\\\\\\n\\\\midrule\\n0 & 0.83 & 0.51 & 0.53 \\\\\\\\\\n1 & 0.42 & 0.50 & 0.46 \\\\\\\\\\n2 & 0.35 & 0.50 & 0.42 \\\\\\\\\\n3 & 0.46 & 0.50 & 0.44 \\\\\\\\\\n4 & 0.51 & 0.50 & 0.46 \\\\\\\\\\n5 & 0.32 & 0.50 & 0.42 \\\\\\\\\\n6 & 0.94 & 0.51 & 0.62 \\\\\\\\\\n7 & 0.80 & 0.50 & 0.61 \\\\\\\\\\n8 & 0.32 & 0.50 & 0.42 \\\\\\\\\\n9 & 0.87 & 0.51 & 0.62 \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[(df['Label'] != -1), ['Model_0', 'Model_1', 'Model_2']][:10].reset_index(drop=True).to_latex()"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"378"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#len(df.loc[(df['Label'] != -1) & ((df['Model_0'] + df['Model_1'] + df['Model_2']) > 1.0), ['Estimated_0', 'Estimated_1', 'Estimated_2']])\n",
"indices_ambiguous = df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0), 'Index'].reset_index(drop=True)\n",
"len(indices_ambiguous)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"# annotate class probability for each recently estimated article\n",
"for n, row in enumerate(class_probs):\n",
"    index = indices_estimated_1[n]\n",
"    # save estimated label\n",
"    df.loc[index, 'Estimated_1'] = row[0]"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"# annotate class probability for each recently estimated article\n",
"for n, row in enumerate(class_probs):\n",
"    index = indices_estimated_2[n]\n",
"    # save estimated label\n",
"    df.loc[index, 'Estimated_2'] = row[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.loc[index, 'Estimated_2']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Apply Naive Bayes Model (10-fold-cross validation):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = sampling_class0\n",
"\n",
"X = dataset['Title'] + '. ' + dataset['Text']\n",
"y = dataset['Label']\n",
"\n",
"cv = CountVectorizer()\n",
"\n",
"# use stratified k-fold cross-validation as split method\n",
"skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)\n",
"\n",
"classifier = GaussianNB()\n",
"\n",
"# metrics\n",
"recall_scores = []\n",
"precision_scores = []\n",
"\n",
"# probabilities of each class (of each fold)\n",
"class_probs = []\n",
"# counts number of training samples observed in each class \n",
"class_counts = []\n",
"\n",
"# for each fold\n",
"for train, test in skf.split(X,y):\n",
" \n",
"    # fit the vectorizer on the training fold and return the document-term matrix\n",
"    # (CountVectorizer.fit_transform ignores a y argument, so it is not passed)\n",
"    # NOTE(review): skf.split yields positional indices; X and y must have a\n",
"    # 0..n-1 RangeIndex for X[train]/y[train] to select the right rows -- confirm\n",
"    training_data = cv.fit_transform(X[train]).toarray()\n",
"    # transform testing data and return the matrix\n",
"    testing_data = cv.transform(X[test]).toarray()\n",
"\n",
"    #fit classifier\n",
"    classifier.fit(training_data, y[train])\n",
"    #predict class (train-set predictions were previously computed but never used)\n",
"    predictions_test = classifier.predict(testing_data)\n",
"\n",
" #print and store metrics\n",
" rec = recall_score(y[test], predictions_test)\n",
" recall_scores.append(rec)\n",
" prec = precision_score(y[test], predictions_test)\n",
" precision_scores.append(prec)\n",
"\n",
" class_probs.append(classifier.class_prior_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Annotate label:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# series of indices of recently estimated articles \n",
"indices_estimated_0 = sampling_class0['Index'].tolist()\n",
"\n",
"# annotate probability\n",
"for n, row in enumerate(class_probs):\n",
"    index = indices_estimated_0[n]\n",
"    # save estimated value (row[1] is the second entry of this fold's class_prior_)\n",
"    df.loc[index, 'Estimated_0'] = row[1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
"# average over the actual number of folds instead of a hardcoded 10\n",
"print(\"Recall (Average): {}\".format(sum(recall_scores)/len(recall_scores)))\n",
"print()\n",
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
"print(\"Precision (Average): {}\".format(sum(precision_scores)/len(precision_scores)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of used samples:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"indices_all_samples = set(indices_estimated_0 + indices_estimated_1 + indices_estimated_2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(indices_all_samples)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check if there are samples where more than one class was marked with 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0)])"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Uuid 7f565b5746ba33be1784c7d3d6a7ede0c9cd81a2 \n",
"Title Toshiba to sell less than 20 pct of chip unit, but may opt for IPO later \n",
"Text Industrials 25am EST Toshiba to sell less than 20 pct of chip unit, but may opt for IPO later TOKYO Jan 27 Toshiba Corp is currently looking to sell less than 20 percent of its memory chip business as it looks to raise capital to offset an upcoming multi-billion dollar charge, but may eventually list it, executives said on Friday. Toshiba Chief Executive Officer Satoshi Tsunakawa said he will do all he can to ensure the company doesn't fall into negative net worth as a result of a writedown on its U.S. nuclear unit. The conglomerate will review its overseas nuclear business, Tsunakawa said, but added it has no plans to sell its infrastructure business. Toshiba's board on Friday approved plans to make its core memory chip business a separate company and seek outside investment in it. (Reporting by Makiko Yamazaki; Editing by Edwina Gibbs) Next In Industrials\n",
"Site reuters.com \n",
"SiteSection http://feeds.reuters.com/reuters/financialsNews \n",
"Url http://www.reuters.com/article/toshiba-accounting-chips-idUST9N1F103J \n",
"Timestamp 2017-01-27T15:25:00.000+02:00 \n",
"Index 0 \n",
"Round NaN \n",
"Label -1 \n",
"Probability 1 \n",
"Estimated 2 \n",
"Estimated_0 NaN \n",
"Estimated_1 NaN \n",
"Estimated_2 NaN \n",
"Name: 0, dtype: object"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[0]"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"704"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) <= 1.0)])"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
"# indices of 'clear' articles\n",
"clear_indices = df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) <= 1.0), 'Index'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
"# save 3 arrays for NaiveBayes:\n",
"list_0 = df.loc[(df['Label'] != -1), 'Estimated_0'].tolist()\n",
"list_1 = df.loc[(df['Label'] != -1), 'Estimated_1'].tolist() \n",
"list_2 = df.loc[(df['Label'] != -1), 'Estimated_2'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [],
"source": [
"#len(list_2)\n",
"df['NewLabel'] = -1"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
"# the previous '...'.format(m) was a no-op (no placeholders) and relied on an undefined m\n",
"with open('../obj/array_3model_svm_class0.pkl', 'wb') as f:\n",
"    pickle.dump(list_0, f, pickle.HIGHEST_PROTOCOL)\n",
"with open('../obj/array_3model_svm_class1.pkl', 'wb') as f:\n",
"    pickle.dump(list_1, f, pickle.HIGHEST_PROTOCOL)\n",
"with open('../obj/array_3model_svm_class2.pkl', 'wb') as f:\n",
"    pickle.dump(list_2, f, pickle.HIGHEST_PROTOCOL)"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"690"
]
},
"execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df.loc[(df['NewLabel'] == 0) | (df['NewLabel'] == 1) | (df['NewLabel'] == 2)])"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [],
"source": [
"# annotate estimated label (vectorized: the former per-index loop re-scanned df\n",
"# for every article, which is O(n^2); isin covers the same set of rows at once)\n",
"estimated_articles = df.loc[(df['Label'] != -1), 'Index'].tolist()\n",
"mask = df['Index'].isin(estimated_articles)\n",
"df.loc[mask, 'NewLabel'] = -1\n",
"df.loc[mask & (df['Estimated_0'] == 1.0) & (df['Estimated_1'] == 0.0) & (df['Estimated_2'] == 0.0), 'NewLabel'] = 0\n",
"df.loc[mask & (df['Estimated_0'] == 0.0) & (df['Estimated_1'] == 1.0) & (df['Estimated_2'] == 0.0), 'NewLabel'] = 1\n",
"df.loc[mask & (df['Estimated_0'] == 0.0) & (df['Estimated_1'] == 0.0) & (df['Estimated_2'] == 1.0), 'NewLabel'] = 2"
]
},
{
"cell_type": "code",
"execution_count": 174,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Estimated_0</th>\n",
" <th>Estimated_1</th>\n",
" <th>Estimated_2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>155</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>175</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>188</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>365</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>407</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>475</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>516</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>546</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>623</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>630</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>714</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>944</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1042</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1140</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1204</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1541</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2207</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2361</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2383</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2460</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2549</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2604</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2735</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2938</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3050</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3057</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3082</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3114</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6356</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6422</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6475</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6575</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6701</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6784</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6983</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7228</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7342</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7424</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7458</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7546</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7792</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8044</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8241</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8448</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8462</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8491</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8761</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9057</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9133</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9197</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9261</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9279</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9667</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9714</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9877</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9912</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9956</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9977</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>90 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Estimated_0 Estimated_1 Estimated_2\n",
"59 0.0 1.0 1.0 \n",
"155 0.0 0.0 0.0 \n",
"175 0.0 1.0 1.0 \n",
"188 0.0 1.0 1.0 \n",
"365 0.0 1.0 1.0 \n",
"407 0.0 1.0 1.0 \n",
"475 0.0 1.0 1.0 \n",
"516 0.0 1.0 1.0 \n",
"546 0.0 0.0 0.0 \n",
"623 0.0 1.0 1.0 \n",
"630 0.0 1.0 1.0 \n",
"714 0.0 1.0 1.0 \n",
"944 0.0 1.0 1.0 \n",
"1042 0.0 1.0 1.0 \n",
"1140 0.0 0.0 0.0 \n",
"1204 0.0 0.0 0.0 \n",
"1541 0.0 1.0 1.0 \n",
"2020 0.0 1.0 1.0 \n",
"2207 0.0 1.0 1.0 \n",
"2361 0.0 1.0 1.0 \n",
"2383 0.0 1.0 1.0 \n",
"2460 0.0 1.0 1.0 \n",
"2549 0.0 1.0 1.0 \n",
"2604 0.0 1.0 1.0 \n",
"2735 0.0 1.0 1.0 \n",
"2938 0.0 1.0 1.0 \n",
"3050 0.0 1.0 1.0 \n",
"3057 0.0 1.0 1.0 \n",
"3082 0.0 1.0 1.0 \n",
"3114 0.0 1.0 1.0 \n",
"... ... ... ... \n",
"6356 0.0 0.0 0.0 \n",
"6422 0.0 1.0 1.0 \n",
"6475 0.0 1.0 1.0 \n",
"6575 0.0 0.0 0.0 \n",
"6701 0.0 0.0 0.0 \n",
"6784 0.0 1.0 1.0 \n",
"6983 0.0 1.0 1.0 \n",
"7228 0.0 1.0 1.0 \n",
"7342 0.0 1.0 1.0 \n",
"7424 0.0 1.0 1.0 \n",
"7458 0.0 1.0 1.0 \n",
"7546 0.0 1.0 1.0 \n",
"7792 0.0 1.0 1.0 \n",
"8044 0.0 0.0 0.0 \n",
"8241 0.0 1.0 1.0 \n",
"8448 0.0 1.0 1.0 \n",
"8462 0.0 1.0 1.0 \n",
"8491 0.0 1.0 1.0 \n",
"8761 0.0 0.0 0.0 \n",
"9057 0.0 1.0 1.0 \n",
"9133 0.0 1.0 1.0 \n",
"9197 0.0 1.0 1.0 \n",
"9261 0.0 1.0 1.0 \n",
"9279 0.0 1.0 1.0 \n",
"9667 0.0 1.0 1.0 \n",
"9714 0.0 1.0 1.0 \n",
"9877 0.0 1.0 1.0 \n",
"9912 0.0 1.0 1.0 \n",
"9956 0.0 0.0 0.0 \n",
"9977 0.0 1.0 1.0 \n",
"\n",
"[90 rows x 3 columns]"
]
},
"execution_count": 174,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# note: the original (0 | 1) is bitwise OR and evaluates to 1, so this filters Estimated_0 != 1;\n",
"# use ~df['Estimated_0'].isin([0, 1]) if 'neither 0 nor 1' was intended\n",
"df.loc[(df['NewLabel'] == -1) & (df['Estimated_0'] != 1), ['Estimated_0', 'Estimated_1', 'Estimated_2' ]][:100]\n",
"#df['NewLabel'] = np.nan"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"# save tri-model to csv \n",
"df.to_csv('../data/interactive_labeling_triple_model_on_1082_NaiveBayes.csv',\n",
" sep='|',\n",
" mode='w',\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_triple_model_on_1082_NaiveBayes.csv',\n",
" sep='|',\n",
" usecols=range(1,16), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for index in indices:\n",
" show_next(index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}