2075 lines
75 KiB
Plaintext
2075 lines
75 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Model Evaluation"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import csv\n",
|
||
"import operator\n",
|
||
"import pickle\n",
|
||
"import random\n",
|
||
"\n",
|
||
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
|
||
"import ipywidgets as widgets\n",
|
||
"from IPython.core.interactiveshell import InteractiveShell\n",
|
||
"from IPython.display import display\n",
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
|
||
"from sklearn.model_selection import GridSearchCV\n",
|
||
"from sklearn.model_selection import StratifiedKFold\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.naive_bayes import GaussianNB\n",
|
||
"from sklearn.naive_bayes import MultinomialNB\n",
|
||
"from sklearn.svm import SVC\n",
|
||
"from sklearn.svm import LinearSVC\n",
|
||
"\n",
|
||
"from BagOfWords import BagOfWords\n",
|
||
"from MNBInteractive import MNBInteractive\n",
|
||
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
|
||
"from NaiveBayes import NaiveBayes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# initialize random => reproducible sequence\n",
|
||
"random.seed(5)\n",
|
||
"\n",
|
||
"# set up wider display area\n",
|
||
"pd.set_option('display.max_colwidth', -1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Last round number: 11\n",
|
||
"Number of manually labeled articles: 1082\n",
|
||
"Number of manually unlabeled articles: 8918\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# read current data set from csv\n",
|
||
"df = pd.read_csv('../data/interactive_labeling_round_11.csv',\n",
|
||
" sep='|',\n",
|
||
" usecols=range(1,13), # drop first column 'unnamed'\n",
|
||
" encoding='utf-8',\n",
|
||
" quoting=csv.QUOTE_NONNUMERIC,\n",
|
||
" quotechar='\\'')\n",
|
||
"\n",
|
||
"# find current iteration/round number\n",
|
||
"m = int(df['Round'].max())\n",
|
||
"print('Last round number: {}'.format(m))\n",
|
||
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
|
||
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def show_next(index):\n",
|
||
" ''' this method displays an article's text and an interactive slider to set its label manually\n",
|
||
" '''\n",
|
||
" print('News article no. {}:'.format(index))\n",
|
||
" print()\n",
|
||
" print('HEADLINE:')\n",
|
||
" print(df.loc[df['Index'] == index, 'Title'])\n",
|
||
" print()\n",
|
||
" print('TEXT:')\n",
|
||
" print(df.loc[df['Index'] == index, 'Text'])\n",
|
||
" print()\n",
|
||
" print('ESTIMATED_0:')\n",
|
||
" print(df.loc[df['Index'] == index, 'Estimated_0'])\n",
|
||
" print()\n",
|
||
" print('ESTIMATED_1:')\n",
|
||
" print(df.loc[df['Index'] == index, 'Estimated_1'])\n",
|
||
" print()\n",
|
||
" print('ESTIMATED_2:')\n",
|
||
" print(df.loc[df['Index'] == index, 'Estimated_2'])\n",
|
||
" \n",
|
||
" def f(x):\n",
|
||
" # save user input\n",
|
||
" df.loc[df['Index'] == index, 'Label'] = x\n",
|
||
"\n",
|
||
" # create slider widget for labels\n",
|
||
" interact(f, x = widgets.IntSlider(min=-1, max=2, step=1, value=df.loc[df['Index'] == index, 'Label']))\n",
|
||
" print('0: Other/Unrelated news, 1: Merger,')\n",
|
||
" print('2: Topics related to deals, investments and mergers')\n",
|
||
" print('___________________________________________________________________________________________________________')\n",
|
||
" print()\n",
|
||
" print()\n",
|
||
"\n",
|
||
"# list of article indices that will be shown next\n",
|
||
"label_next = []"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## How to find a better model:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"A) Multinomial Naive Bayes Algorithm"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"B) Multinomial Naive Bayes with bigram"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), bigram=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Use my own BOW implementation:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=False, bigram=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"min(recall_scores)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"max(recall_scores)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"min(precision_scores)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"max(precision_scores)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# toDo: läuft noch nicht\n",
|
||
"\n",
|
||
"# series of indices of recently estimated articles \n",
|
||
"indices_estimated = df.loc[df['Label'] != -1, 'Index'].tolist()\n",
|
||
"\n",
|
||
"# annotate probability\n",
|
||
"n = 0\n",
|
||
"for row in class_probs[0]:\n",
|
||
" index = indices_estimated[n]\n",
|
||
" # save estimated label\n",
|
||
" df.loc[index, 'Estimated_2'] = row[1]\n",
|
||
" n += 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
|
||
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
|
||
"print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n",
|
||
"print()\n",
|
||
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
|
||
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
|
||
"print(\"Precision (Average): {}\".format(sum(precision_scores)/10))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"print('confusion matrix:')\n",
|
||
"print('###############')\n",
|
||
"zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n",
|
||
"zero_0\n",
|
||
"zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])\n",
|
||
"zero_1\n",
|
||
"zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])\n",
|
||
"zero_2\n",
|
||
"print('/')\n",
|
||
"one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])\n",
|
||
"one_0\n",
|
||
"one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])\n",
|
||
"one_1\n",
|
||
"one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])\n",
|
||
"one_2\n",
|
||
"print('/')\n",
|
||
"\n",
|
||
"two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])\n",
|
||
"two_0\n",
|
||
"two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n",
|
||
"two_1\n",
|
||
"two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n",
|
||
"two_2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Building three separate models:\n",
|
||
"\n",
|
||
"B) One model per class: Funktioniert es besser wenn man 3 Modelle hat.\n",
|
||
"Begründung: wir sind interessiert an Klasse 1\n",
|
||
"Pro Klasse 1 Modell bauen (Hier ist das ziel das beste Modell zu finden. Dafür nehmen wir 1082 gelabelte Daten.)\n",
|
||
"3 Modelle => Ergebnis für 1 Sample: (70%, 40%, 80%) unklar => überprüfen\n",
|
||
"=> (90%, 90%, 90%) => überprüfen\n",
|
||
"liefert das bessere ambiguity Samples als oben\n",
|
||
"Stratified sample: (50 + 50 (1/2 von der anderen Klasse 1/2 der dritten Klasse))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
|
||
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
|
||
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"847"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(labeled_pos_0)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"50"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(labeled_pos_1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"185"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(labeled_pos_2)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# add three new columns for the three models, initialize with nans\n",
|
||
"df['Estimated_0'] = np.nan\n",
|
||
"df['Estimated_1'] = np.nan\n",
|
||
"df['Estimated_2'] = np.nan"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"sampling_class0_0 = labeled_pos_0.sample(n=100, random_state=5)\n",
|
||
"sampling_class0_1 = labeled_pos_1.sample(n=50, random_state=5)\n",
|
||
"sampling_class0_2 = labeled_pos_2.sample(n=50, random_state=5)\n",
|
||
"\n",
|
||
"sampling_class1_0 = labeled_pos_0.sample(n=25, random_state=5)\n",
|
||
"sampling_class1_1 = sampling_class0_1\n",
|
||
"sampling_class1_2 = labeled_pos_2.sample(n=25, random_state=5)\n",
|
||
"\n",
|
||
"sampling_class2_0 = labeled_pos_0.sample(n=50, random_state=5)\n",
|
||
"sampling_class2_1 = sampling_class0_1\n",
|
||
"sampling_class2_2 = labeled_pos_2.sample(n=100, random_state=5)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"sampling_class0_complement = pd.concat([sampling_class0_1, sampling_class0_2])\n",
|
||
"sampling_class1_complement = pd.concat([sampling_class1_0, sampling_class1_2])\n",
|
||
"sampling_class2_complement = pd.concat([sampling_class2_0, sampling_class2_1])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# prepare for binary classification:\n",
|
||
"# pos_label = 3\n",
|
||
"sampling_class0_0['Label'] = 3\n",
|
||
"sampling_class1_1['Label'] = 3\n",
|
||
"sampling_class2_2['Label'] = 3\n",
|
||
"# neg_label = 4\n",
|
||
"sampling_class0_complement['Label'] = 4\n",
|
||
"sampling_class1_complement['Label'] = 4\n",
|
||
"sampling_class2_complement['Label'] = 4"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"sampling_class0 = pd.concat([sampling_class0_0, sampling_class0_complement]).reset_index(drop=True)\n",
|
||
"sampling_class1 = pd.concat([sampling_class1_1, sampling_class1_complement]).reset_index(drop=True)\n",
|
||
"sampling_class2 = pd.concat([sampling_class2_2, sampling_class2_complement]).reset_index(drop=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Apply Algorithm to estimate all labeled articles (1082 samples):"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 75,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"200"
|
||
]
|
||
},
|
||
"execution_count": 75,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"train_data = sampling_class2\n",
|
||
"indices_train = train_data['Index'].tolist()\n",
|
||
"len(indices_train)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 76,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"1082"
|
||
]
|
||
},
|
||
"execution_count": 76,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"#test_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_train))].reset_index(drop=True)\n",
|
||
"test_data = df.loc[(df['Label'] != -1)].reset_index(drop=True)\n",
|
||
"len(test_data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 77,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"test_data.loc[(test_data['Label'] == 0), 'Label'] = 3\n",
|
||
"test_data.loc[(test_data['Label'] == 1) | (test_data['Label'] == 2), 'Label'] = 4"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 219,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# nur für Berechnung, ob 3er Modell EINDEUTIG richtiger, als normal\n",
|
||
"train_data = df.loc[(df['Label'] != -1)].reset_index(drop=True)\n",
|
||
"test_data = df.loc[(df['Label'] != -1)].reset_index(drop=True)\n",
|
||
"classifier = LinearSVC()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 185,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# nur für Berechnung, ob normales Modell 'richtiger'\n",
|
||
"train_data = df.loc[df['Index'].isin(subset_indices)].reset_index(drop=True)\n",
|
||
"test_data = df.loc[df['Index'].isin(subset_indices)].reset_index(drop=True)\n",
|
||
"classifier = LinearSVC()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Vorhersagen für gesamten Datensatz (10 000):\n",
|
||
"train_data = df.loc[df['Label'] != -1].reset_index(drop=True)\n",
|
||
"X = train_data['Title'] + '. ' + train_data['Text']\n",
|
||
"y = train_data['Label']\n",
|
||
"U = test_data['Title'] + '. ' + test_data['Text']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 220,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"classifier = LinearSVC()\n",
|
||
"cv = CountVectorizer()\n",
|
||
"# fit the training data and then return the matrix\n",
|
||
"training_data = cv.fit_transform(X, y).toarray()\n",
|
||
"testing_data = cv.transform(U).toarray()\n",
|
||
"#fit classifier\n",
|
||
"classifier.fit(training_data, y)\n",
|
||
"\n",
|
||
"#predict class\n",
|
||
"predictions_test = classifier.predict(testing_data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 234,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# make prediction for hole dataset\n",
|
||
"len(predictions_test)\n",
|
||
"indices_predicted = df.loc[df['Label'] != -1, 'Index'].tolist()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 237,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[16.0, 17.0, 29.0, 33.0, 50.0, 57.0, 58.0, 64.0, 65.0, 88.0, 91.0, 98.0, 151.0, 162.0, 163.0, 167.0, 198.0, 213.0, 220.0, 230.0, 232.0, 247.0, 254.0, 279.0, 300.0, 307.0, 325.0, 328.0, 331.0, 356.0, 364.0, 379.0, 383.0, 388.0, 411.0, 417.0, 459.0, 470.0, 507.0, 522.0, 530.0, 558.0, 566.0, 573.0, 580.0, 583.0, 586.0, 590.0, 599.0, 613.0, 620.0, 634.0, 637.0, 654.0, 663.0, 688.0, 763.0, 789.0, 821.0, 824.0, 842.0, 866.0, 904.0, 920.0, 922.0, 925.0, 940.0, 958.0, 970.0, 978.0, 987.0, 998.0, 1001.0, 1017.0, 1029.0, 1089.0, 1095.0, 1111.0, 1120.0, 1142.0, 1146.0, 1161.0, 1164.0, 1177.0, 1188.0, 1255.0, 1291.0, 1321.0, 1343.0, 1369.0, 1376.0, 1394.0, 1422.0, 1475.0, 1482.0, 1524.0, 1536.0, 1548.0, 1568.0, 1578.0, 1579.0, 1586.0, 1596.0, 1612.0, 1620.0, 1672.0, 1700.0, 1760.0, 1762.0, 1770.0, 1779.0, 1815.0, 1836.0, 1856.0, 1858.0, 1859.0, 1871.0, 1879.0, 1886.0, 1887.0, 1903.0, 1910.0, 1914.0, 1931.0, 1954.0, 1960.0, 1967.0, 1998.0, 1999.0, 2075.0, 2081.0, 2105.0, 2114.0, 2117.0, 2156.0, 2173.0, 2202.0, 2203.0, 2241.0, 2254.0, 2261.0, 2265.0, 2279.0, 2298.0, 2309.0, 2314.0, 2315.0, 2316.0, 2343.0, 2345.0, 2362.0, 2388.0, 2399.0, 2402.0, 2412.0, 2417.0, 2419.0, 2435.0, 2441.0, 2447.0, 2451.0, 2465.0, 2480.0, 2482.0, 2495.0, 2498.0, 2512.0, 2526.0, 2548.0, 2564.0, 2571.0, 2585.0, 2586.0, 2597.0, 2612.0, 2618.0, 2621.0, 2637.0, 2651.0, 2659.0, 2667.0, 2670.0, 2673.0, 2675.0, 2718.0, 2723.0, 2730.0, 2734.0, 2741.0, 2773.0, 2782.0, 2786.0, 2789.0, 2793.0, 2817.0, 2826.0, 2861.0, 2877.0, 2906.0, 2911.0, 2917.0, 2918.0, 2932.0, 2954.0, 2957.0, 2966.0, 2988.0, 2990.0, 2993.0, 3001.0, 3005.0, 3006.0, 3031.0, 3041.0, 3042.0, 3053.0, 3055.0, 3056.0, 3070.0, 3087.0, 3091.0, 3101.0, 3103.0, 3129.0, 3130.0, 3140.0, 3186.0, 3188.0, 3195.0, 3223.0, 3234.0, 3237.0, 3249.0, 3251.0, 3254.0, 3256.0, 3274.0, 3293.0, 3318.0, 3340.0, 3342.0, 3372.0, 3389.0, 3398.0, 3445.0, 3470.0, 3501.0, 3513.0, 3517.0, 3527.0, 3619.0, 3631.0, 3682.0, 3700.0, 3702.0, 3718.0, 3731.0, 3738.0, 3741.0, 3778.0, 3780.0, 3793.0, 3834.0, 3860.0, 3862.0, 3866.0, 3886.0, 3901.0, 3937.0, 3968.0, 4013.0, 4015.0, 4028.0, 4043.0, 4069.0, 4081.0, 4090.0, 4122.0, 4153.0, 4161.0, 4182.0, 4185.0, 4233.0, 4235.0, 4237.0, 4250.0, 4268.0, 4388.0, 4411.0, 4423.0, 4430.0, 4439.0, 4443.0, 4444.0, 4467.0, 4474.0, 4492.0, 4520.0, 4536.0, 4559.0, 4562.0, 4566.0, 4571.0, 4596.0, 4597.0, 4607.0, 4660.0, 4682.0, 4702.0, 4716.0, 4742.0, 4743.0, 4763.0, 4819.0, 4820.0, 4834.0, 4842.0, 4845.0, 4851.0, 4857.0, 4874.0, 4886.0, 4896.0, 4901.0, 4907.0, 4912.0, 4913.0, 4923.0, 4924.0, 4938.0, 4940.0, 4961.0, 4962.0, 4966.0, 4978.0, 4981.0, 4994.0, 4996.0, 4999.0, 5001.0, 5022.0, 5037.0, 5038.0, 5052.0, 5066.0, 5071.0, 5074.0, 5096.0, 5112.0, 5116.0, 5130.0, 5154.0, 5155.0, 5159.0, 5161.0, 5163.0, 5177.0, 5179.0, 5184.0, 5200.0, 5221.0, 5222.0, 5224.0, 5226.0, 5227.0, 5246.0, 5249.0, 5257.0, 5268.0, 5285.0, 5299.0, 5300.0, 5377.0, 5385.0, 5397.0, 5427.0, 5430.0, 5464.0, 5473.0, 5496.0, 5497.0, 5550.0, 5575.0, 5583.0, 5584.0, 5592.0, 5595.0, 5599.0, 5635.0, 5642.0, 5661.0, 5664.0, 5688.0, 5698.0, 5699.0, 5707.0, 5708.0, 5755.0, 5772.0, 5791.0, 5796.0, 5808.0, 5819.0, 5831.0, 5847.0, 5864.0, 5865.0, 5870.0, 5894.0, 5910.0, 5931.0, 5932.0, 5950.0, 5957.0, 5973.0, 6000.0, 6002.0, 6004.0, 6008.0, 6018.0, 6040.0, 6047.0, 6054.0, 6056.0, 6059.0, 6060.0, 6064.0, 6079.0, 6080.0, 6081.0, 6094.0, 6098.0, 6099.0, 6141.0, 6158.0, 6164.0, 6179.0, 6186.0, 6192.0, 6193.0, 6198.0, 6205.0, 6222.0, 6233.0, 6270.0, 6283.0, 6297.0, 6311.0, 6315.0, 6335.0, 6352.0, 6360.0, 6378.0, 6385.0, 6396.0, 6416.0, 6421.0, 6424.0, 6450.0, 6468.0, 6473.0, 6495.0, 6503.0, 6516.0, 6520.0, 6527.0, 6535.0, 6552.0, 6585.0, 6596.0, 6598.0, 6637.0, 6649.0, 6662.0, 6677.0, 6680.0, 6700.0, 6718.0, 6729.0, 6738.0, 6768.0, 6782.0, 6786.0, 6799.0, 6808.0, 6819.0, 6823.0, 6827.0, 6846.0, 6864.0, 6869.0, 6880.0, 6884.0, 6888.0, 6956.0, 6974.0, 6981.0, 6998.0, 7004.0, 7039.0, 7043.0, 7072.0, 7118.0, 7140.0, 7154.0, 7157.0, 7188.0, 7262.0, 7270.0, 7280.0, 7288.0, 7309.0, 7347.0, 7351.0, 7370.0, 7371.0, 7376.0, 7395.0, 7415.0, 7439.0, 7447.0, 7460.0, 7485.0, 7488.0, 7503.0, 7506.0, 7515.0, 7538.0, 7562.0, 7574.0, 7588.0, 7596.0, 7650.0, 7660.0, 7694.0, 7705.0, 7708.0, 7723.0, 7761.0, 7799.0, 7827.0, 7842.0, 7857.0, 7871.0, 7892.0, 7893.0, 7905.0, 7914.0, 7931.0, 7942.0, 7968.0, 7969.0, 7986.0, 8016.0, 8020.0, 8023.0, 8029.0, 8037.0, 8066.0, 8070.0, 8081.0, 8087.0, 8112.0, 8123.0, 8126.0, 8140.0, 8142.0, 8152.0, 8157.0, 8177.0, 8180.0, 8192.0, 8195.0, 8197.0, 8215.0, 8233.0, 8250.0, 8263.0, 8278.0, 8304.0, 8307.0, 8317.0, 8352.0, 8355.0, 8380.0, 8381.0, 8393.0, 8445.0, 8452.0, 8475.0, 8490.0, 8495.0, 8504.0, 8510.0, 8547.0, 8551.0, 8583.0, 8610.0, 8615.0, 8650.0, 8682.0, 8707.0, 8728.0, 8777.0, 8784.0, 8790.0, 8818.0, 8825.0, 8835.0, 8837.0, 8845.0, 8850.0, 8853.0, 8857.0, 8859.0, 8870.0, 8871.0, 8891.0, 8898.0, 8921.0, 8926.0, 8929.0, 8952.0, 9021.0, 9027.0, 9047.0, 9109.0, 9139.0, 9141.0, 9144.0, 9187.0, 9201.0, 9215.0, 9218.0, 9233.0, 9234.0, 9243.0, 9253.0, 9265.0, 9312.0, 9319.0, 9360.0, 9362.0, 9375.0, 9377.0, 9380.0, 9392.0, 9395.0, 9414.0, 9417.0, 9418.0, 9422.0, 9423.0, 9446.0, 9472.0, 9485.0, 9491.0, 9498.0, 9511.0, 9544.0, 9549.0, 9553.0, 9555.0, 9559.0, 9606.0, 9610.0, 9622.0, 9636.0, 9642.0, 9656.0, 9674.0, 9691.0, 9720.0, 9736.0, 9746.0, 9753.0, 9768.0, 9789.0, 9799.0, 9854.0, 9885.0, 9908.0, 9919.0, 9927.0, 9934.0, 9953.0, 9962.0, 9963.0, 9984.0, 9986.0]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(subset_indices)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 228,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[2. 0. 0. ... 0. 0. 0.]\n",
|
||
"dict_keys([2.0, 0.0, 1.0])\n",
|
||
"dict_values([1108, 7619, 191])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from collections import Counter\n",
|
||
"\n",
|
||
"print(Counter(predictions_test).keys()) # equals to list(set(words))\n",
|
||
"print(Counter(predictions_test).values()) # counts the elements' frequency"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 235,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df['Estimated'] = np.nan\n",
|
||
"for i, value in enumerate(indices_predicted):\n",
|
||
" df.loc[(df['Index'] == value), 'Estimated'] = predictions_test[i]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 209,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# split training data into text and label set\n",
|
||
"# join title and text\n",
|
||
"X = train_data['Title'] + '. ' + train_data['Text']\n",
|
||
"y = train_data['Label']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 156,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# split testing data into text and label set\n",
|
||
"U = test_data['Title'] + '. ' + test_data['Text']\n",
|
||
"v = test_data['Label']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"With CountVectorizer / own BOW:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"classifier = GaussianNB()\n",
|
||
"#classifier = SVC(probability = True,\n",
|
||
"# gamma = 'auto')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 158,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"cv = CountVectorizer()\n",
|
||
"\n",
|
||
"# probabilities of each class\n",
|
||
"class_probs = []\n",
|
||
"\n",
|
||
"# use sklearn CountVectorizer\n",
|
||
"# fit the training data and then return the matrix\n",
|
||
"training_data = cv.fit_transform(X, y).toarray()\n",
|
||
"# transform testing data and return the matrix\n",
|
||
"testing_data = cv.transform(U).toarray()\n",
|
||
"\n",
|
||
"# use my BOW\n",
|
||
"#extracted_words = BagOfWords.extract_all_words(X)\n",
|
||
"#vocab = BagOfWords.make_vocab(extracted_words)\n",
|
||
"\n",
|
||
"# fit the training data and then return the matrix\n",
|
||
"#training_data = BagOfWords.make_matrix(extracted_words,\n",
|
||
"# vocab)\n",
|
||
"\n",
|
||
"# transform testing data and return the matrix\n",
|
||
"#extracted_words = BagOfWords.extract_all_words(U)\n",
|
||
"#testing_data = BagOfWords.make_matrix(extracted_words,\n",
|
||
"# vocab)\n",
|
||
"\n",
|
||
"#fit classifier\n",
|
||
"classifier.fit(training_data, y)\n",
|
||
"\n",
|
||
"#predict class\n",
|
||
"predictions_test = classifier.predict(testing_data)\n",
|
||
"\n",
|
||
"class_probs = classifier.predict_proba(testing_data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 81,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([[0., 1.],\n",
|
||
" [0., 1.],\n",
|
||
" [0., 1.],\n",
|
||
" ...,\n",
|
||
" [1., 0.],\n",
|
||
" [0., 1.],\n",
|
||
" [0., 1.]])"
|
||
]
|
||
},
|
||
"execution_count": 81,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"classifier.predict_proba(testing_data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 163,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"confusion matrix:\n",
|
||
"###############\n",
|
||
"465\n",
|
||
"0\n",
|
||
"10\n",
|
||
"/\n",
|
||
"57\n",
|
||
"50\n",
|
||
"23\n",
|
||
"/\n",
|
||
"36\n",
|
||
"0\n",
|
||
"49\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print('confusion matrix:')\n",
|
||
"print('###############')\n",
|
||
"zero_0 = len(test_data.loc[(test_data['NewLabel'] == 0) & (test_data['Label'] == 0)])\n",
|
||
"print(zero_0)\n",
|
||
"zero_1 = len(test_data.loc[(test_data['NewLabel'] == 0) & (test_data['Label'] == 1)])\n",
|
||
"print(zero_1)\n",
|
||
"zero_2 = len(test_data.loc[(test_data['NewLabel'] == 0) & (test_data['Label'] == 2)])\n",
|
||
"print(zero_2)\n",
|
||
"print('/')\n",
|
||
"one_0 = len(test_data.loc[(test_data['NewLabel'] == 1) & (test_data['Label'] == 0)])\n",
|
||
"print(one_0)\n",
|
||
"one_1 = len(test_data.loc[(test_data['NewLabel'] == 1) & (test_data['Label'] == 1)])\n",
|
||
"print(one_1)\n",
|
||
"one_2 = len(test_data.loc[(test_data['NewLabel'] == 1) & (test_data['Label'] == 2)])\n",
|
||
"print(one_2)\n",
|
||
"print('/')\n",
|
||
"\n",
|
||
"two_0 = len(test_data.loc[(test_data['NewLabel'] == 2) & (test_data['Label'] == 0)])\n",
|
||
"print(two_0)\n",
|
||
"two_1 = len(test_data.loc[(test_data['NewLabel'] == 2) & (test_data['Label'] == 1)])\n",
|
||
"print(two_1)\n",
|
||
"two_2 = len(test_data.loc[(test_data['NewLabel'] == 2) & (test_data['Label'] == 2)])\n",
|
||
"print(two_2)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 236,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"###############\n",
|
||
"742\n",
|
||
"46\n",
|
||
"164\n",
|
||
"###############\n",
|
||
"14\n",
|
||
"1\n",
|
||
"4\n",
|
||
"###############\n",
|
||
"91\n",
|
||
"3\n",
|
||
"17\n",
|
||
"###############\n",
|
||
"metrics:\n",
|
||
"\n",
|
||
"742\n",
|
||
"25\n",
|
||
"210\n",
|
||
"105\n",
|
||
"###############\n",
|
||
"1\n",
|
||
"1014\n",
|
||
"18\n",
|
||
"49\n",
|
||
"###############\n",
|
||
"17\n",
|
||
"803\n",
|
||
"94\n",
|
||
"168\n",
|
||
"###############\n",
|
||
"77.94117647058823\n",
|
||
"87.60330578512396\n",
|
||
"70.88724584103512\n",
|
||
"###############\n",
|
||
"5.263157894736842\n",
|
||
"2.0\n",
|
||
"93.80776340110906\n",
|
||
"###############\n",
|
||
"15.315315315315313\n",
|
||
"9.18918918918919\n",
|
||
"75.78558225508318\n",
|
||
"###############\n",
|
||
"32.839883226880126\n",
|
||
"32.93083165810439\n",
|
||
"80.16019716574245\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print('###############')\n",
|
||
"zero_0 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 0)])\n",
|
||
"print(zero_0)\n",
|
||
"zero_1 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 1)])\n",
|
||
"print(zero_1)\n",
|
||
"zero_2 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 2)])\n",
|
||
"print(zero_2)\n",
|
||
"print('###############')\n",
|
||
"one_0 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 0)])\n",
|
||
"print(one_0)\n",
|
||
"one_1 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 1)])\n",
|
||
"print(one_1)\n",
|
||
"one_2 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 2)])\n",
|
||
"print(one_2)\n",
|
||
"print('###############')\n",
|
||
"\n",
|
||
"two_0 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 0)])\n",
|
||
"print(two_0)\n",
|
||
"two_1 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 1)])\n",
|
||
"print(two_1)\n",
|
||
"two_2 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 2)])\n",
|
||
"print(two_2)\n",
|
||
"print('###############')\n",
|
||
"print('metrics:')\n",
|
||
"print()\n",
|
||
"\n",
|
||
"total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n",
|
||
"\n",
|
||
"tp_0 = zero_0\n",
|
||
"print(tp_0)\n",
|
||
"tn_0 = one_1 + one_2 + two_1 + two_2\n",
|
||
"print(tn_0)\n",
|
||
"fp_0 = zero_1 + zero_2\n",
|
||
"print(fp_0)\n",
|
||
"fn_0 = one_0 + two_0\n",
|
||
"print(fn_0)\n",
|
||
"print('###############')\n",
|
||
"\n",
|
||
"tp_1 = one_1\n",
|
||
"print(tp_1)\n",
|
||
"tn_1 = zero_0 + zero_2 + two_0 + two_2\n",
|
||
"print(tn_1)\n",
|
||
"fp_1 = one_0 + one_2\n",
|
||
"print(fp_1)\n",
|
||
"fn_1 = zero_1 + two_1\n",
|
||
"print(fn_1)\n",
|
||
"print('###############')\n",
|
||
"\n",
|
||
"tp_2 = two_2\n",
|
||
"print(tp_2)\n",
|
||
"tn_2 = zero_0 + zero_1 + one_0 + one_1\n",
|
||
"print(tn_2)\n",
|
||
"fp_2 = two_0 + two_1\n",
|
||
"print(fp_2)\n",
|
||
"fn_2 = zero_2 + one_2\n",
|
||
"print(fn_2)\n",
|
||
"print('###############')\n",
|
||
"\n",
|
||
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
|
||
"print(prec_0)\n",
|
||
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
|
||
"print(rec_0)\n",
|
||
"acc_0 = (tp_0 + tn_0) / total * 100\n",
|
||
"print(acc_0)\n",
|
||
"print('###############')\n",
|
||
"\n",
|
||
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
|
||
"print(prec_1)\n",
|
||
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
|
||
"print(rec_1)\n",
|
||
"acc_1 = (tp_1 + tn_1) / total * 100\n",
|
||
"print(acc_1)\n",
|
||
"print('###############')\n",
|
||
"\n",
|
||
"prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
|
||
"print(prec_2)\n",
|
||
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
|
||
"print(rec_2)\n",
|
||
"acc_2 = (tp_2 + tn_2) / total * 100\n",
|
||
"print(acc_2)\n",
|
||
"print('###############')\n",
|
||
"\n",
|
||
"print((prec_1 + prec_2 + prec_0) / 3)\n",
|
||
"print((rec_1 + rec_2 + rec_0) / 3)\n",
|
||
"print((acc_1 + acc_2 + acc_0) / 3)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 82,
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"recall: 0.18772136953955135\n",
|
||
"precision: 0.5335570469798657\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"#print and store metrics\n",
|
||
"rec = recall_score(v, predictions_test, pos_label=3)\n",
|
||
"print('recall: ' + str(rec))\n",
|
||
"prec = precision_score(v, predictions_test, pos_label=3)\n",
|
||
"print('precision: ' + str(prec))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 181,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"len(df.loc[(df['NewLabel'] == 0) | (df['NewLabel'] == 1) | (df['NewLabel'] == 2)])\n",
|
||
"subset_indices = df.loc[((df['NewLabel'] == 0) | (df['NewLabel'] == 1) | (df['NewLabel'] == 2)), 'Index'].tolist()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 183,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"690"
|
||
]
|
||
},
|
||
"execution_count": 183,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(subset_indices)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 205,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#df.loc[(df['Label'] == 1), ['Index', 'Title', 'Text']][:10]\n",
|
||
"#df.loc[3860]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Annotate Labels:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 83,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# series of indices of recently estimated articles \n",
|
||
"indices_estimated_2 = test_data['Index'].tolist()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 63,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# annotate probability\n",
|
||
"n = 0\n",
|
||
"for row in class_probs:\n",
|
||
" index = indices_estimated_0[n]\n",
|
||
" # save estimated label\n",
|
||
" df.loc[index, 'Estimated_0'] = row[0]\n",
|
||
" n += 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 122,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df = df.rename(columns={'Estimated_0': 'Model_0', 'Estimated_1': 'Model_1', 'Estimated_2': 'Model_2'})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 115,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df = df.round(2)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 123,
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Model_0</th>\n",
|
||
" <th>Model_1</th>\n",
|
||
" <th>Model_2</th>\n",
|
||
" <th>Label</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Model_0 Model_1 Model_2 Label\n",
|
||
"0 1.0 1.0 0.0 0.0 \n",
|
||
"1 1.0 0.0 0.0 0.0 \n",
|
||
"2 0.0 1.0 0.0 2.0 \n",
|
||
"3 1.0 0.0 0.0 0.0 \n",
|
||
"4 1.0 1.0 0.0 0.0 \n",
|
||
"5 0.0 1.0 0.0 0.0 \n",
|
||
"6 1.0 0.0 0.0 0.0 \n",
|
||
"7 1.0 0.0 1.0 0.0 \n",
|
||
"8 1.0 0.0 0.0 0.0 \n",
|
||
"9 1.0 0.0 0.0 0.0 "
|
||
]
|
||
},
|
||
"execution_count": 123,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.loc[(df['Label'] != -1), ['Model_0', 'Model_1', 'Model_2', 'Label']][:10].reset_index(drop=True)#.to_latex()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 117,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'\\\\begin{tabular}{lrrr}\\n\\\\toprule\\n{} & Model\\\\_0 & Model\\\\_1 & Model\\\\_2 \\\\\\\\\\n\\\\midrule\\n0 & 0.83 & 0.51 & 0.53 \\\\\\\\\\n1 & 0.42 & 0.50 & 0.46 \\\\\\\\\\n2 & 0.35 & 0.50 & 0.42 \\\\\\\\\\n3 & 0.46 & 0.50 & 0.44 \\\\\\\\\\n4 & 0.51 & 0.50 & 0.46 \\\\\\\\\\n5 & 0.32 & 0.50 & 0.42 \\\\\\\\\\n6 & 0.94 & 0.51 & 0.62 \\\\\\\\\\n7 & 0.80 & 0.50 & 0.61 \\\\\\\\\\n8 & 0.32 & 0.50 & 0.42 \\\\\\\\\\n9 & 0.87 & 0.51 & 0.62 \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'"
|
||
]
|
||
},
|
||
"execution_count": 117,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.loc[(df['Label'] != -1), ['Model_0', 'Model_1', 'Model_2']][:10].reset_index(drop=True).to_latex()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 104,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"378"
|
||
]
|
||
},
|
||
"execution_count": 104,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"#len(df.loc[(df['Label'] != -1) & ((df['Model_0'] + df['Model_1'] + df['Model_2']) > 1.0), ['Estimated_0', 'Estimated_1', 'Estimated_2']])\n",
|
||
"indices_ambiguous = df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0), 'Index'].reset_index(drop=True)\n",
|
||
"len(indices_ambiguous)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 74,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"n = 0\n",
|
||
"for row in class_probs:\n",
|
||
" index = indices_estimated_1[n]\n",
|
||
" # save estimated label\n",
|
||
" df.loc[index, 'Estimated_1'] = row[0]\n",
|
||
" n += 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 84,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"n = 0\n",
|
||
"for row in class_probs:\n",
|
||
" index = indices_estimated_2[n]\n",
|
||
" # save estimated label\n",
|
||
" df.loc[index, 'Estimated_2'] = row[0]\n",
|
||
" n += 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df.loc[index, 'Estimated_2']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Apply Naive Bayes Model (10-fold-cross validation):"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"dataset = sampling_class0\n",
|
||
"\n",
|
||
"X = dataset['Title'] + '. ' + dataset['Text']\n",
|
||
"y = dataset['Label']\n",
|
||
"\n",
|
||
"cv = CountVectorizer()\n",
|
||
"\n",
|
||
"# use stratified k-fold cross-validation as split method\n",
|
||
"skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)\n",
|
||
"\n",
|
||
"classifier = GaussianNB()\n",
|
||
"\n",
|
||
"# metrics\n",
|
||
"recall_scores = []\n",
|
||
"precision_scores = []\n",
|
||
"\n",
|
||
"# probabilities of each class (of each fold)\n",
|
||
"class_probs = []\n",
|
||
"# counts number of training samples observed in each class \n",
|
||
"class_counts = []\n",
|
||
"\n",
|
||
"# for each fold\n",
|
||
"for train, test in skf.split(X,y):\n",
|
||
" \n",
|
||
" # fit the training data and then return the matrix\n",
|
||
" training_data = cv.fit_transform(X[train], y[train]).toarray()\n",
|
||
" # transform testing data and return the matrix\n",
|
||
" testing_data = cv.transform(X[test]).toarray()\n",
|
||
"\n",
|
||
" #fit classifier\n",
|
||
" classifier.fit(training_data, y[train])\n",
|
||
" #predict class\n",
|
||
" predictions_train = classifier.predict(training_data)\n",
|
||
" predictions_test = classifier.predict(testing_data)\n",
|
||
"\n",
|
||
" #print and store metrics\n",
|
||
" rec = recall_score(y[test], predictions_test)\n",
|
||
" recall_scores.append(rec)\n",
|
||
" prec = precision_score(y[test], predictions_test)\n",
|
||
" precision_scores.append(prec)\n",
|
||
"\n",
|
||
" class_probs.append(classifier.class_prior_)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Annotate label:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# series of indices of recently estimated articles \n",
|
||
"indices_estimated_0 = sampling_class0['Index'].tolist()\n",
|
||
"\n",
|
||
"# annotate probability\n",
|
||
"n = 0\n",
|
||
"for row in class_probs:\n",
|
||
" index = indices_estimated_0[n]\n",
|
||
" # save estimated label\n",
|
||
" df.loc[index, 'Estimated_0'] = row[1]\n",
|
||
" n += 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
|
||
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
|
||
"print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n",
|
||
"print()\n",
|
||
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
|
||
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
|
||
"print(\"Precision (Average): {}\".format(sum(precision_scores)/10))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Number of used samples:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"indices_all_samples = set((indices_estimated_0 + indices_estimated_1) + indices_estimated_2)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"len(indices_all_samples)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Check if there are samples where more than one class was marked with 1."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"len(df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0)])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 149,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Uuid 7f565b5746ba33be1784c7d3d6a7ede0c9cd81a2 \n",
|
||
"Title Toshiba to sell less than 20 pct of chip unit, but may opt for IPO later \n",
|
||
"Text Industrials 25am EST Toshiba to sell less than 20 pct of chip unit, but may opt for IPO later TOKYO Jan 27 Toshiba Corp is currently looking to sell less than 20 percent of its memory chip business as it looks to raise capital to offset an upcoming multi-billion dollar charge, but may eventually list it, executives said on Friday. Toshiba Chief Executive Officer Satoshi Tsunakawa said he will do all he can to ensure the company doesn't fall into negative net worth as a result of a writedown on its U.S. nuclear unit. The conglomerate will review its overseas nuclear business, Tsunakawa said, but added it has no plans to sell its infrastructure business. Toshiba's board on Friday approved plans to make its core memory chip business a separate company and seek outside investment in it. (Reporting by Makiko Yamazaki; Editing by Edwina Gibbs) Next In Industrials\n",
|
||
"Site reuters.com \n",
|
||
"SiteSection http://feeds.reuters.com/reuters/financialsNews \n",
|
||
"Url http://www.reuters.com/article/toshiba-accounting-chips-idUST9N1F103J \n",
|
||
"Timestamp 2017-01-27T15:25:00.000+02:00 \n",
|
||
"Index 0 \n",
|
||
"Round NaN \n",
|
||
"Label -1 \n",
|
||
"Probability 1 \n",
|
||
"Estimated 2 \n",
|
||
"Estimated_0 NaN \n",
|
||
"Estimated_1 NaN \n",
|
||
"Estimated_2 NaN \n",
|
||
"Name: 0, dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 149,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.loc[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 126,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"704"
|
||
]
|
||
},
|
||
"execution_count": 126,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) <= 1.0)])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 127,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# indices of 'clear' articles\n",
|
||
"clear_indices = df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) <= 1.0), 'Index'].tolist()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 144,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# save 3 arrays for NaiveBayes:\n",
|
||
"list_0 = df.loc[(df['Label'] != -1), 'Estimated_0'].tolist()\n",
|
||
"list_1 = df.loc[(df['Label'] != -1), 'Estimated_1'].tolist() \n",
|
||
"list_2 = df.loc[(df['Label'] != -1), 'Estimated_2'].tolist()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 150,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#len(list_2)\n",
|
||
"df['NewLabel'] = -1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 146,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"with open('../obj/'+ 'array_3model_svm_class0'.format(m) + '.pkl', 'wb') as f:\n",
|
||
" pickle.dump(list_0, f, pickle.HIGHEST_PROTOCOL)\n",
|
||
"with open('../obj/'+ 'array_3model_svm_class1'.format(m) + '.pkl', 'wb') as f:\n",
|
||
" pickle.dump(list_1, f, pickle.HIGHEST_PROTOCOL)\n",
|
||
"with open('../obj/'+ 'array_3model_svm_class2'.format(m) + '.pkl', 'wb') as f:\n",
|
||
" pickle.dump(list_2, f, pickle.HIGHEST_PROTOCOL)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 133,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"690"
|
||
]
|
||
},
|
||
"execution_count": 133,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(df.loc[(df['NewLabel'] == 0) | (df['NewLabel'] == 1) | (df['NewLabel'] == 2)])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 168,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# annotate estimated label\n",
|
||
"estimated_articles = df.loc[(df['Label'] != -1), 'Index'].tolist()\n",
|
||
"for index in estimated_articles:\n",
|
||
" # save estimated label\n",
|
||
" df.loc[(df['Index'] == index), 'NewLabel'] = -1\n",
|
||
" df.loc[(df['Index'] == index) & (df['Estimated_0'] == 1.0) & (df['Estimated_1'] == 0.0) & (df['Estimated_2'] == 0.0), 'NewLabel'] = 0\n",
|
||
" df.loc[(df['Index'] == index) & (df['Estimated_0'] == 0.0) & (df['Estimated_1'] == 1.0) & (df['Estimated_2'] == 0.0), 'NewLabel'] = 1\n",
|
||
" df.loc[(df['Index'] == index) & (df['Estimated_0'] == 0.0) & (df['Estimated_1'] == 0.0) & (df['Estimated_2'] == 1.0), 'NewLabel'] = 2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 174,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Estimated_0</th>\n",
|
||
" <th>Estimated_1</th>\n",
|
||
" <th>Estimated_2</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>59</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>155</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>175</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>188</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>365</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>407</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>475</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>516</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>546</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>623</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>630</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>714</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>944</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1042</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1140</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1204</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1541</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2020</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2207</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2361</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2383</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2460</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2549</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2604</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2735</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2938</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3050</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3057</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3082</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3114</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6356</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6422</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6475</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6575</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6701</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6784</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6983</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7228</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7342</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7424</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7458</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7546</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7792</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8044</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8241</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8448</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8462</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8491</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8761</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9057</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9133</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9197</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9261</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9279</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9667</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9714</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9877</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9912</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9956</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9977</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>90 rows × 3 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Estimated_0 Estimated_1 Estimated_2\n",
|
||
"59 0.0 1.0 1.0 \n",
|
||
"155 0.0 0.0 0.0 \n",
|
||
"175 0.0 1.0 1.0 \n",
|
||
"188 0.0 1.0 1.0 \n",
|
||
"365 0.0 1.0 1.0 \n",
|
||
"407 0.0 1.0 1.0 \n",
|
||
"475 0.0 1.0 1.0 \n",
|
||
"516 0.0 1.0 1.0 \n",
|
||
"546 0.0 0.0 0.0 \n",
|
||
"623 0.0 1.0 1.0 \n",
|
||
"630 0.0 1.0 1.0 \n",
|
||
"714 0.0 1.0 1.0 \n",
|
||
"944 0.0 1.0 1.0 \n",
|
||
"1042 0.0 1.0 1.0 \n",
|
||
"1140 0.0 0.0 0.0 \n",
|
||
"1204 0.0 0.0 0.0 \n",
|
||
"1541 0.0 1.0 1.0 \n",
|
||
"2020 0.0 1.0 1.0 \n",
|
||
"2207 0.0 1.0 1.0 \n",
|
||
"2361 0.0 1.0 1.0 \n",
|
||
"2383 0.0 1.0 1.0 \n",
|
||
"2460 0.0 1.0 1.0 \n",
|
||
"2549 0.0 1.0 1.0 \n",
|
||
"2604 0.0 1.0 1.0 \n",
|
||
"2735 0.0 1.0 1.0 \n",
|
||
"2938 0.0 1.0 1.0 \n",
|
||
"3050 0.0 1.0 1.0 \n",
|
||
"3057 0.0 1.0 1.0 \n",
|
||
"3082 0.0 1.0 1.0 \n",
|
||
"3114 0.0 1.0 1.0 \n",
|
||
"... ... ... ... \n",
|
||
"6356 0.0 0.0 0.0 \n",
|
||
"6422 0.0 1.0 1.0 \n",
|
||
"6475 0.0 1.0 1.0 \n",
|
||
"6575 0.0 0.0 0.0 \n",
|
||
"6701 0.0 0.0 0.0 \n",
|
||
"6784 0.0 1.0 1.0 \n",
|
||
"6983 0.0 1.0 1.0 \n",
|
||
"7228 0.0 1.0 1.0 \n",
|
||
"7342 0.0 1.0 1.0 \n",
|
||
"7424 0.0 1.0 1.0 \n",
|
||
"7458 0.0 1.0 1.0 \n",
|
||
"7546 0.0 1.0 1.0 \n",
|
||
"7792 0.0 1.0 1.0 \n",
|
||
"8044 0.0 0.0 0.0 \n",
|
||
"8241 0.0 1.0 1.0 \n",
|
||
"8448 0.0 1.0 1.0 \n",
|
||
"8462 0.0 1.0 1.0 \n",
|
||
"8491 0.0 1.0 1.0 \n",
|
||
"8761 0.0 0.0 0.0 \n",
|
||
"9057 0.0 1.0 1.0 \n",
|
||
"9133 0.0 1.0 1.0 \n",
|
||
"9197 0.0 1.0 1.0 \n",
|
||
"9261 0.0 1.0 1.0 \n",
|
||
"9279 0.0 1.0 1.0 \n",
|
||
"9667 0.0 1.0 1.0 \n",
|
||
"9714 0.0 1.0 1.0 \n",
|
||
"9877 0.0 1.0 1.0 \n",
|
||
"9912 0.0 1.0 1.0 \n",
|
||
"9956 0.0 0.0 0.0 \n",
|
||
"9977 0.0 1.0 1.0 \n",
|
||
"\n",
|
||
"[90 rows x 3 columns]"
|
||
]
|
||
},
|
||
"execution_count": 174,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.loc[(df['NewLabel'] == -1) & (df['Estimated_0'] != (0 | 1)), ['Estimated_0', 'Estimated_1', 'Estimated_2' ]][:100]\n",
|
||
"#df['NewLabel'] = np.nan"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 86,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# save tri-model to csv \n",
|
||
"df.to_csv('../data/interactive_labeling_triple_model_on_1082_NaiveBayes.csv',\n",
|
||
" sep='|',\n",
|
||
" mode='w',\n",
|
||
" encoding='utf-8',\n",
|
||
" quoting=csv.QUOTE_NONNUMERIC,\n",
|
||
" quotechar='\\'')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 147,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# read current data set from csv\n",
|
||
"df = pd.read_csv('../data/interactive_labeling_triple_model_on_1082_NaiveBayes.csv',\n",
|
||
" sep='|',\n",
|
||
" usecols=range(1,16), # drop first column 'unnamed'\n",
|
||
" encoding='utf-8',\n",
|
||
" quoting=csv.QUOTE_NONNUMERIC,\n",
|
||
" quotechar='\\'')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"for index in indices:\n",
|
||
" show_next(index)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.7.1"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|