interactive labeling: documentation round 0-9

annealias 2019-03-01 12:28:29 +01:00
parent 213bb148de
commit a2c7a7279e
27 changed files with 8412 additions and 31932 deletions

@ -0,0 +1,689 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Notebook for calculation of the resubstitution error...\n",
"\n",
"Note:\n",
"\n",
"class 0: unrelated news\n",
"\n",
"class 1: mergers\n",
"\n",
"class 2: other deals, non-mergers, etc."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import operator\n",
"import pickle\n",
"import random\n",
"\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"from IPython.core.interactiveshell import InteractiveShell\n",
"from IPython.display import display\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_selection import SelectPercentile\n",
"from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.semi_supervised import label_propagation\n",
"\n",
"from BagOfWords import BagOfWords\n",
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
"from MNBInteractive import MNBInteractive"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# initialize random => reproducible sequence\n",
"random_state = 5\n",
"\n",
"# set up wider display area\n",
"pd.set_option('display.max_colwidth', -1)\n",
"\n",
"# show full text for print statement\n",
"InteractiveShell.ast_node_interactivity = \"all\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Last iteration number: 11\n",
"Number of manually labeled articles: 1082\n",
"Number of manually unlabeled articles: 8918\n"
]
}
],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_round_11.csv',\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')\n",
"\n",
"# find current iteration/round number\n",
"m = int(df['Round'].max())\n",
"print('Last iteration number: {}'.format(m))\n",
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Calculate the resubstitution error for iteration 0-9 with stratified sampling.\n",
"Start with iteration number 0."
]
},
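{
"cell_type": "markdown",
"metadata": {},
"source": [
"The resubstitution error is measured on the very samples the classifier was trained on, so it is an optimistic estimate of the true error. The cell below is only an illustrative sketch of the stratified sampling used in the following cells: all articles labeled in round `m` are grouped by class, and every class is downsampled to the size of the smallest one. The helper `stratified_round_sample` is not part of the existing code; the later cells build the same selection step by step."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative sketch only (assumes df and random_state from the cells above)\n",
"def stratified_round_sample(df, m, rs=random_state):\n",
"    sets = [df.loc[(df['Round'] == m) & (df['Label'] == c)] for c in (0, 1, 2)]\n",
"    strat_len = min(len(s) for s in sets)\n",
"    # one stratified sample per class, concatenated into the per-round training set\n",
"    return pd.concat([s.sample(n=strat_len, random_state=rs) for s in sets])"
]
},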
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"m = 0"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m = 3\n",
"m"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
"# select all samples that were labeled with 0/1/2\n",
"set_0 = df.loc[(df['Round'] == m) & (df['Label'] == 0)]\n",
"set_1 = df.loc[(df['Round'] == m) & (df['Label'] == 1)]\n",
"set_2 = df.loc[(df['Round'] == m) & (df['Label'] == 2)]"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of labeled samples by class (0/1/2): 82/4/14\n",
"minimum of new labeled samples: 4\n",
"length of current data set for resubstitution error: 12\n"
]
}
],
"source": [
"# find minimum\n",
"print('number of labeled samples by class (0/1/2): {}/{}/{}'.format(len(set_0), len(set_1), len(set_2)))\n",
"strat_len = min(len(set_0), len(set_1), len(set_2))\n",
"print('minimum of new labeled samples: {}'.format(strat_len))\n",
"# length of current data set for resubstitution error\n",
"print('length of current data set for resubstitution error: {}'.format(strat_len * 3))"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"# random sampling for selection\n",
"selec_0 = set_0.sample(n=strat_len, random_state=random_state)\n",
"selec_1 = set_1.sample(n=strat_len, random_state=random_state)\n",
"selec_2 = set_2.sample(n=strat_len, random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
"# newly added training data of the current round\n",
"# training_data_0 = pd.concat([selec_0, selec_1, selec_2])\n",
"# training_data_1 = pd.concat([selec_0, selec_1, selec_2])\n",
"# training_data_2 = pd.concat([selec_0, selec_1, selec_2])\n",
"# training_data_3 = pd.concat([selec_0, selec_1, selec_2])\n",
"training_data_4 = pd.concat([selec_0, selec_1, selec_2])"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[5789.0,\n",
" 4237.0,\n",
" 2202.0,\n",
" 4913.0,\n",
" 821.0,\n",
" 5973.0,\n",
" 6198.0,\n",
" 8490.0,\n",
" 4815.0,\n",
" 2386.0,\n",
" 5177.0,\n",
" 2482.0]"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# indices of training samples\n",
"# idx_0 = training_data_0['Index'].tolist()\n",
"# idx_1 = training_data_1['Index'].tolist()\n",
"# idx_2 = training_data_2['Index'].tolist()\n",
"# idx_3 = training_data_3['Index'].tolist()\n",
"idx_4 = training_data_4['Index'].tolist()\n",
"\n",
"train_all = train_all.append(training_data_4)\n",
"idx_all = train_all['Index'].tolist()\n",
"idx_4"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"36"
]
},
"execution_count": 140,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_0_2 = train_0_1.append(training_data_2)\n",
"len(train_0_2)"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"stratified number in round 3: 12\n",
"stratified number in total: 48\n"
]
}
],
"source": [
"print('stratified number in round {}: {}'.format(m, len(idx_3)))\n",
"print('stratified number in total: {}'.format(len(idx_all)))"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"# STEP 1:\n",
"# resubstitution error round\n",
"training_data = training_data_3\n",
"testing_data = training_data_3"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"400"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# STEP 2: \n",
"# resubstitution error all labeled articles in round\n",
"training_data = training_data_3\n",
"testing_data = df.loc[(df['Round'] <= m)]\n",
"len(testing_data)"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"# STEP 3:\n",
"training_data = train_all\n",
"testing_data = df.loc[(df['Round'] <= m)]"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
"# STEP 4:\n",
"training_data = train_0_2\n",
"testing_data = training_data_3"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# MNB: starting interactive multinomial naives bayes...\n",
"\n",
"# MNB: ending multinomial naive bayes\n"
]
}
],
"source": [
"# call script\n",
"classes, class_count, class_probs = MNBInteractive.estimate_mnb(training_data, testing_data, True)\n",
"\n",
"# series of indices of recently estimated articles \n",
"indices_estimated = testing_data['Index'].tolist()"
]
},
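{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell assigns each article the class with the highest estimated probability, builds the 3x3 confusion matrix and derives the per-class metrics from it:\n",
"\n",
"$$\\mathrm{precision}_c = \\frac{TP_c}{TP_c + FP_c}, \\qquad \\mathrm{recall}_c = \\frac{TP_c}{TP_c + FN_c}, \\qquad \\mathrm{accuracy}_c = \\frac{TP_c + TN_c}{\\mathrm{total}}$$\n",
"\n",
"The average metrics at the end are the unweighted means over the three classes."
]
},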
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"confusion matrix:\n",
"###############\n"
]
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"/\n"
]
},
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"/\n"
]
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n",
"\n",
"class 0:\n",
"\n",
"TP: 1\n",
"TN: 7\n",
"FP: 1\n",
"FN: 3\n",
"\n",
"class 1:\n",
"\n",
"TP: 4\n",
"TN: 3\n",
"FP: 5\n",
"FN: 0\n",
"\n",
"class 2:\n",
"\n",
"TP: 0\n",
"TN: 7\n",
"FP: 1\n",
"FN: 4\n",
"###############\n",
"\n",
"METRICS:\n",
"\n",
"class 0:\n",
"\n",
"precision: 50.0\n",
"recall: 25.0\n",
"accuracy: 66.667\n",
"\n",
"class 1:\n",
"\n",
"precision: 44.444\n",
"recall: 100.0\n",
"accuracy: 58.333\n",
"\n",
"class 2:\n",
"\n",
"precision: 0.0\n",
"recall: 0.0\n",
"accuracy: 58.333\n",
"\n",
"Average Metrics:\n",
"\n",
"precision: 31\n",
"recall: 42\n",
"accuracy: 61\n"
]
}
],
"source": [
"n = 0 \n",
"for row in class_probs:\n",
" for i in range(0, len(classes)):\n",
" index = indices_estimated[n]\n",
" # save estimated label\n",
" if np.amax(row) == row[i]:\n",
" testing_data.loc[index, 'Estimated'] = classes[i]\n",
" # annotate probability\n",
" testing_data.loc[index, 'Probability'] = row[i]\n",
" n += 1\n",
"\n",
"print('confusion matrix:')\n",
"print('###############')\n",
"zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n",
"zero_0\n",
"zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])\n",
"zero_1\n",
"zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])\n",
"zero_2\n",
"print('/')\n",
"one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])\n",
"one_0\n",
"one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])\n",
"one_1\n",
"one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])\n",
"one_2\n",
"print('/')\n",
"\n",
"two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])\n",
"two_0\n",
"two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n",
"two_1\n",
"two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n",
"two_2\n",
"print('###############')\n",
"print()\n",
"total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n",
"print('class 0:')\n",
"print()\n",
"tp_0 = zero_0\n",
"print('TP: {}'.format(tp_0))\n",
"tn_0 = one_1 + one_2 + two_1 + two_2\n",
"print('TN: {}'.format(tn_0))\n",
"fp_0 = zero_1 + zero_2\n",
"print('FP: {}'.format(fp_0))\n",
"fn_0 = one_0 + two_0\n",
"print('FN: {}'.format(fn_0))\n",
"print()\n",
"print('class 1:')\n",
"print()\n",
"tp_1 = one_1\n",
"print('TP: {}'.format(tp_1))\n",
"tn_1 = zero_0 + zero_2 + two_0 + two_2\n",
"print('TN: {}'.format(tn_1))\n",
"fp_1 = one_0 + one_2\n",
"print('FP: {}'.format(fp_1))\n",
"fn_1 = zero_1 + two_1\n",
"print('FN: {}'.format(fn_1))\n",
"print()\n",
"print('class 2:')\n",
"print()\n",
"tp_2 = two_2\n",
"print('TP: {}'.format(tp_2))\n",
"tn_2 = zero_0 + zero_1 + one_0 + one_1\n",
"print('TN: {}'.format(tn_2))\n",
"fp_2 = two_0 + two_1\n",
"print('FP: {}'.format(fp_2))\n",
"fn_2 = zero_2 + one_2\n",
"print('FN: {}'.format(fn_2))\n",
"print('###############')\n",
"print()\n",
"print('METRICS:')\n",
"print()\n",
"print('class 0:')\n",
"print()\n",
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
"print('precision: {}'.format(round(prec_0, 3)))\n",
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
"print('recall: {}'.format(round(rec_0, 3)))\n",
"acc_0 = (tp_0 + tn_0) / total * 100\n",
"print('accuracy: {}'.format(round(acc_0, 3)))\n",
"print()\n",
"print('class 1:')\n",
"print()\n",
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
"print('precision: {}'.format(round(prec_1, 3)))\n",
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
"print('recall: {}'.format(round(rec_1, 3)))\n",
"acc_1 = (tp_1 + tn_1) / total * 100\n",
"print('accuracy: {}'.format(round(acc_1, 3)))\n",
"print()\n",
"print('class 2:')\n",
"print()\n",
"prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
"print('precision: {}'.format(round(prec_2, 3)))\n",
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
"print('recall: {}'.format(round(rec_2, 3)))\n",
"acc_2 = (tp_2 + tn_2) / total * 100\n",
"print('accuracy: {}'.format(round(acc_2, 3)))\n",
"print()\n",
"print('Average Metrics:')\n",
"print()\n",
"print('precision: {}'.format(round((prec_1 + prec_2 + prec_0) / 3), 3))\n",
"print('recall: {}'.format(round((rec_1 + rec_2 + rec_0) / 3), 3))\n",
"print('accuracy: {}'.format(round((acc_1 + acc_2 + acc_0) / 3), 3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -1,6 +1,8 @@
import csv
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
class LabelingPlotter():
@ -54,7 +56,7 @@ class LabelingPlotter():
def plot_cumulative():
# load pickle object
with open('../obj/array_class_probs_round_9.pkl', 'rb') as input:
with open('../obj/array_class_probs_round_11.pkl', 'rb') as input:
list = pickle.load(input)
# sort list in descending order
@ -79,14 +81,43 @@ class LabelingPlotter():
ax.grid(True)
ax.legend(loc='right')
#ax.legend(loc='right')
#ax.set_title('Cumulative distribution of highest estimated probability')
ax.set_xlabel('Highest estimated probability')
ax.set_ylabel('Fraction of articles with this highest estimated probability')
#plt.axis([0.5, 0.99, 0, 0.006])
#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
#plt.axis([0.65, 1, 0, 0.003]) # round 10
plt.axis([0.7, 1, 0, 0.002]) # round 11
#ax.set_xbound(lower=0.5, upper=0.99)
plt.savefig('..\\visualization\\proba_round_11.png')
plt.savefig('..\\visualization\\proba_round_11.eps')
plt.show()
def plot_correlation():
m = 10
df = pd.read_csv('../data/interactive_labeling_round_{}_temp.csv'.format(m),
sep='|',
usecols=range(1,13), # drop first column 'unnamed'
encoding='utf-8',
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
# add boolean, if estimation was true
df['EstCorrect'] = np.nan
df.loc[(df['Label'] != -1) & (df['Label'] == df['Estimated']), 'EstCorrect'] = 1
df.loc[(df['Label'] != -1) & (df['Label'] != df['Estimated']), 'EstCorrect'] = 0
print('estimation was correct: {}'.format(len(df.loc[df['EstCorrect'] == 1])))
print('estimation was wrong: {}'.format(len(df.loc[df['EstCorrect'] == 0])))
x = df.loc[df['Label'] != -1, 'Probability'].tolist()
y = df.loc[df['Label'] != -1, 'EstCorrect'].tolist()
plt.plot(x, y, 'bo')
plt.axis([0.4, 1, -0.1, 1.1])
plt.show()
if __name__ == '__main__':
#LabelingPlotter.plot_correlation()
LabelingPlotter.plot_cumulative()

@ -24,7 +24,7 @@ class MNBInteractive:
'''fits naive bayes model
'''
print('# MNB: starting multinomial naives bayes...')
print('# MNB: starting interactive multinomial naive bayes...')
print()
# split labeled data into text and label set

@ -0,0 +1,175 @@
'''
Multinomial Naive Bayes Classifier
======================
'''
from BagOfWords import BagOfWords
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
class MultinomialNaiveBayes:

    def make_mnb(dataset, sklearn_cv=True, percentile=100):
        '''fits naive bayes model with StratifiedKFold
        '''
        print('# starting classical multinomial naive bayes')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            cv = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        #class_prob = []
        # counts number of training samples observed in each class
        #class_counts = []

        # for each fold
        n = 0
        for train, test in skf.split(X, y):
            n += 1
            print('# split no. ' + str(n))

            if sklearn_cv:
                # use sklearn CountVectorizer
                # fit the training data and then return the matrix
                training_data = cv.fit_transform(X[train], y[train]).toarray()
                # transform testing data and return the matrix
                testing_data = cv.transform(X[test]).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X[train])
                vocab = BagOfWords.make_vocab(extracted_words)

                # fit the training data and then return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq, stemming)
                # transform testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq, stemming)

            # apply select percentile
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])
            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # print('train:')
            # print(y[train])
            # print('test:')
            # print(y[test])
            # print()
            # print('pred')
            # print(predictions_test)

            # print and store metrics
            rec = recall_score(y[test], predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test, average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec) / (prec + rec))

            #class_prob.append(classifier.class_prior_)
            #class_counts.append(classifier.class_count_)

        ##########################
        # probability estimates for the test vector (testing_data of the last fold)
        class_probs = classifier.predict_proba(testing_data)

        # number of samples encountered for each class during fitting
        # this value is weighted by the sample weight when provided
        class_count = classifier.class_count_

        # classes in order used
        classes = classifier.classes_

        # return the scores collected over all folds
        return recall_scores, precision_scores, f1_scores
    ######## only needed for the resubstitution error ########
    def analyze_errors(training, testing):
        '''calculates resubstitution error
        shows indices of falsely classified articles
        uses multinomial naive Bayes with train/test split
        '''
        X_train = training['Title'] + ' ' + training['Text']
        y_train = training['Label']
        X_test = testing['Title'] + ' ' + testing['Text']
        y_test = testing['Label']

        count_vector = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train).toarray()
        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_test).toarray()

        # Naive Bayes
        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)
        # fit classifier
        classifier.fit(training_data, y_train)
        # predict class
        predictions = classifier.predict(testing_data)

        print(type(y_test))
        print(len(y_test))
        print(type(predictions))
        print(len(predictions))

        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_test)):
            # positional access, since y_test keeps the original DataFrame index
            if y_test.iloc[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_test.iloc[i]))
                print(X_test.iloc[i])
                print(y_test.iloc[i])
                print()

        # print metrics
        print('F1 score: {}'.format(f1_score(y_test, predictions, average='weighted')))
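    # Example call (illustrative only; the CSV path and read options are the ones used in the
    # notebooks, and make_mnb expects a DataFrame with 'Title', 'Text' and 'Label' columns):
    # df = pd.read_csv('../data/interactive_labeling_round_11.csv', sep='|',
    #                  quoting=csv.QUOTE_NONNUMERIC, quotechar='\'')
    # labeled = df.loc[df['Label'] != -1].reset_index(drop=True)
    # recall_scores, precision_scores, f1_scores = MultinomialNaiveBayes.make_mnb(labeled)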

src/Untitled.ipynb Normal file

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}
