update labeling / documentation

This commit is contained in:
annealias 2019-05-06 11:18:38 +02:00
parent 8ddf23d801
commit 7c3353edab
46 changed files with 18153 additions and 51041 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -146,46 +146,25 @@
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'm' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-4-9a40b379906c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mm\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;31mNameError\u001b[0m: name 'm' is not defined"
]
}
],
"source": [
"m"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"m=15"
"m=16"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"This round number: 15\n",
"Number of manually labeled articles: 1122\n",
"Number of manually unlabeled articles: 8878\n"
"This round number: 16\n",
"Number of manually labeled articles: 1132\n",
"Number of manually unlabeled articles: 8868\n"
]
}
],
@ -205,6 +184,24 @@
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1082"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
@ -214,14 +211,14 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"52\n"
"50\n"
]
}
],
@ -242,16 +239,16 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8878"
"0"
]
},
"execution_count": 9,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}

File diff suppressed because one or more lines are too long

View File

@ -1,80 +0,0 @@
'''
Cosine Similarity
=================
CosineSimilarity measures the similarity between two articles.
It calculates c: the cosine of the angle between the articles'
vectors text_1 and text_2.
c = (text_1 * text_2) / (|text_1| * |text_2|).
c = 1, if articles are equal => identicalness is 100%
0 <= c < 1, else => identicalness is (c*100)%
(The greater c, the more similar two articles are.)
'''
from BagOfWords import BagOfWords
import csv
import math
import pandas as pd
class CosineSimilarity:
    ''' Cosine similarity of two articles based on bag-of-words vectors.
    '''

    @staticmethod
    def calc_similarity(text_1, text_2, rel_freq=True, stemming=True):
        ''' calculates cosine similarity of two input articles

        Both texts are split into words with BagOfWords.extract_words,
        a common vocabulary is built from both word lists and the two
        rows of the resulting matrix are compared:
        c = (v_1 * v_2) / (|v_1| * |v_2|)

        text_1, text_2 -- the two article texts to compare
        rel_freq       -- passed through to BagOfWords.make_matrix
        stemming       -- passed through to the BagOfWords helpers

        Returns c. If one of the two vectors has length zero, the
        cosine is undefined; 0.0 is returned in that case instead of
        raising ZeroDivisionError.
        '''
        print('# calculating cosine similarity...')
        print()

        # extract words from articles
        extracted_words_1 = BagOfWords.extract_words(text_1, stemming)
        extracted_words_2 = BagOfWords.extract_words(text_2, stemming)
        print(extracted_words_1)
        print(extracted_words_2)

        # insert words into vocab
        both_extracted = [extracted_words_1, extracted_words_2]
        vocab = BagOfWords.make_vocab(both_extracted, stemming)

        # create vectors
        # NOTE(review): assumes make_matrix returns a pandas DataFrame
        # with one row per article -- confirm against BagOfWords
        matrix = BagOfWords.make_matrix(both_extracted, vocab,
                                        rel_freq, stemming)

        # read both rows as plain arrays so that the entries are paired
        # by position and not looked up by column label
        vector_1 = matrix.iloc[0].values
        vector_2 = matrix.iloc[1].values

        # numerator of formula: dot product of both vectors
        numerator = sum(a * b for a, b in zip(vector_1, vector_2))

        # denominator of formula: product of both vector lengths
        length_1 = math.sqrt(sum(a ** 2 for a in vector_1))
        length_2 = math.sqrt(sum(b ** 2 for b in vector_2))
        denominator = length_1 * length_2

        if denominator == 0:
            # at least one vector is all zeros
            return 0.0

        return numerator / denominator
if __name__ == '__main__':

    # demo run: compare the first two articles of the cleaned data set
    csv_path = '..\\data\\cleaned_data_set_without_header.csv'

    # only columns 1 and 2 of the first 100 rows are needed
    # (presumably title and text -- verify against the data set)
    read_options = dict(delimiter='|',
                        header=None,
                        index_col=None,
                        engine='python',
                        usecols=[1,2],
                        nrows=100,
                        quoting=csv.QUOTE_NONNUMERIC,
                        quotechar='\'')
    frame = pd.read_csv(csv_path, **read_options)

    # one string per article: column 1, a separator, column 2
    articles = frame[1] + '. ' + frame[2]

    first_article = articles.iloc[0]
    second_article = articles.iloc[1]
    similarity = CosineSimilarity.calc_similarity(first_article,
                                                  second_article,
                                                  rel_freq=True,
                                                  stemming=True)
    print(similarity)

View File

@ -15,7 +15,7 @@ from BagOfWords import BagOfWords
import csv
import operator
import graphviz
#import graphviz
import numpy as np
import pandas as pd
from sklearn import tree
@ -26,7 +26,7 @@ from sklearn.model_selection import StratifiedKFold
class DecisionTree:
def make_tree(dataset, sklearn_cv=False, stemming=False, percentile=100):
def make_tree(dataset, sklearn_cv=True, stemming=False, percentile=100):
print('# fitting model')
print('# ...')
@ -131,18 +131,18 @@ class DecisionTree:
print('# starting decision tree')
print('# ...')
file = '..\\data\\classification_labelled_corrected.csv'
file = '..\\data\\interactive_labeling_round_17_20190502.csv'
# read csv file
print('# reading dataset')
print('# ...')
data = pd.read_csv(file,
sep='|',
engine='python',
decimal='.',
quotechar='\'',
quoting=csv.QUOTE_NONE)
sep='|',
usecols=range(1,13), # drop first column 'unnamed'
encoding='utf-8',
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
make_tree(data)

View File

@ -1,86 +0,0 @@
'''
Label Propagation Algorithm for Interactive Labeling
====================================================
Uses scikit learn's implementation of label propagation:
Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled
data with label propagation.
(Technical Report CMU-CALD-02-107, Carnegie Mellon University, 2002.)
Prints out probabilities for classes needed for interactive labeling.
'''
from BagOfWords import BagOfWords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import recall_score, precision_score
from sklearn.semi_supervised import label_propagation
class LabelPropagation:
    ''' Class estimates for unlabeled articles via sklearn's LabelSpreading.
    '''

    @staticmethod
    def propagate_labels(labeled_data, unlabeled_data, sklearn_cv=False):
        ''' fits LabelSpreading on the labeled articles and estimates
        the classes of the unlabeled articles

        labeled_data   -- data frame; columns 'Title', 'Text' and
                          'Label' are read
        unlabeled_data -- data frame; columns 'Title' and 'Text'
                          are read
        sklearn_cv     -- True: build the matrices with sklearn's
                          CountVectorizer,
                          False: use the BagOfWords implementation

        Returns (class_probs, predictions): the results of
        predict_proba and predict for the unlabeled articles.
        '''
        # NOTE(review): the 'MNB' prefix of the log lines looks copied
        # from the naive bayes module; the strings are left unchanged
        print('# MNB: starting label propagation')

        # assign algorithm
        classifier = label_propagation.LabelSpreading()

        # split labeled data into text and label set
        # join title and text
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        # join title and text of the unlabeled data
        U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']

        if sklearn_cv:
            cv = CountVectorizer()
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X, y).toarray()
            # transform testing data and return the matrix
            testing_data = cv.transform(U).toarray()
        else:
            # use my own BagOfWords python implementation
            stemming = True
            rel_freq = False
            extracted_words = BagOfWords.extract_all_words(X)
            vocab = BagOfWords.make_vocab(extracted_words)

            # fit the training data and then return the matrix
            print('# MNB: fit training data and calculate matrix...')
            print()
            training_data = BagOfWords.make_matrix(extracted_words,
                                                   vocab, rel_freq, stemming)

            # transform testing data and return the matrix
            # (same vocab as for the training data)
            print('# MNB: transform testing data to matrix...')
            print()
            extracted_words = BagOfWords.extract_all_words(U)
            testing_data = BagOfWords.make_matrix(extracted_words,
                                                  vocab, rel_freq, stemming)

        # fit classifier
        # NOTE(review): the classifier only sees the labeled articles
        # here; the unlabeled ones are used for prediction only
        classifier.fit(training_data, y)

        # probability estimates for the test vector (testing_data)
        class_probs = classifier.predict_proba(testing_data)
        predictions = classifier.predict(testing_data)

        print('# MNB: ending label propagation')

        # return vector of class estimates
        return class_probs, predictions

View File

@ -139,7 +139,7 @@ class LabelingPlotter():
def plot_cumulative():
# load pickle object
with open('../obj/array_3model_svm_class2.pkl', 'rb') as input:
with open('../obj/array_class_probs_round_15_svm_190502.pkl', 'rb') as input:
list = pickle.load(input)
# sort list in descending order
@ -165,12 +165,12 @@ class LabelingPlotter():
#ax.grid(True)
#ax.legend(loc='right')
ax.set_title('Predictions class 2 (SVM)')
#ax.set_title('Predictions class 2 (SVM)')
# for iterations
#ax.set_xlabel('Highest estimated probability')
#ax.set_ylabel('Fraction of articles with this highest estimated probability')
# for 3-models
ax.set_xlabel('Estimated probability for class 2')
ax.set_xlabel('Estimated probabilities after iteration 14')
ax.set_ylabel('Fraction of articles with this probability')
#plt.axis([0.97, 1, 0.95, 1.01])
#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
@ -180,8 +180,8 @@ class LabelingPlotter():
#ax.set_xbound(lower=0.5, upper=0.99)
#plt.savefig('..\\visualization\\proba_stratified_round_9.png')
#plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
plt.savefig('..\\visualization\\3model_svm_class2.png')
plt.savefig('..\\visualization\\3model_svm_class2.eps')
#plt.savefig('..\\visualization\\3model_svm_class2.png')
#plt.savefig('..\\visualization\\3model_svm_class2.eps')
plt.show()
@ -211,5 +211,5 @@ class LabelingPlotter():
if __name__ == '__main__':
#LabelingPlotter.plot_correlation()
#LabelingPlotter.plot_cumulative()
LabelingPlotter.plot_labeling_rounds_naive()
LabelingPlotter.plot_cumulative()
#LabelingPlotter.plot_labeling_rounds_naive()

View File

@ -19,7 +19,7 @@ import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import f1_score, make_scorer, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
@ -56,12 +56,12 @@ class SVM:
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75],
grid = GridSearchCV(pipeline, {'perc__percentile': [100],
'SVC__kernel': ['linear'],
'SVC__gamma': [0.00001, 0.0001],
'SVC__C': [0.1, 1]},
cv=skf,
scoring=make_scorer(f1_score))
scoring=make_scorer(recall_score))
print('# fit classifier')
print('# ...')

70
src/ThreeModelApproach.py Normal file
View File

@ -0,0 +1,70 @@
'''
Comparing Three Model Approach to MNB
'''
from BagOfWords import BagOfWords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
class ThreeModelApproach:
    ''' Train/test evaluation of a Gaussian naive bayes classifier.
    '''

    @staticmethod
    def calc_model_1(labeled_data, random_state=None, average='binary'):
        ''' fits GaussianNB on a stratified 75/25 split of the labeled
        articles and prints recall and precision on the test part

        labeled_data -- data frame; columns 'Title', 'Text' and
                        'Label' are read
        random_state -- passed to train_test_split; set a value to get
                        a reproducible split (default None: the split
                        differs from run to run)
        average      -- passed to recall_score and precision_score
                        NOTE(review): the default 'binary' only works
                        for two classes; with more than two labels
                        sklearn raises ValueError -- pass e.g. 'macro'

        Returns (rec, prec) as computed on the test part.
        '''
        # NOTE(review): the log line mentions multinomial naive bayes,
        # but the classifier used below is GaussianNB
        print('# MNB: starting interactive multinomial naives bayes...')
        print()

        # split labeled data into text and label set
        # join title and text
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        cv = CountVectorizer()
        classifier = GaussianNB()

        # stratify=y keeps the class proportions in both parts
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            stratify=y,
            test_size=0.25,
            random_state=random_state)

        # use sklearn CountVectorizer
        # fit the training data and then return the matrix
        training_data = cv.fit_transform(X_train, y_train).toarray()
        # transform testing data and return the matrix
        testing_data = cv.transform(X_test).toarray()

        # fit classifier
        classifier.fit(training_data, y_train)
        predictions_test = classifier.predict(testing_data)

        # print metrics
        rec = recall_score(y_test, predictions_test, average=average)
        print('rec: ' + str(rec))
        prec = precision_score(y_test, predictions_test, average=average)
        print('prec: ' + str(prec))
        print('#')

        return rec, prec
if __name__ == '__main__':

    # csv is needed for the quoting constant only and is not imported
    # at the top of this file
    import csv

    # read current data set from csv
    # NOTE(review): this reads the round 11 file -- confirm that this
    # is the labeling round that should be evaluated
    data = pd.read_csv('../data/interactive_labeling_round_11.csv',
                       sep='|',
                       usecols=range(1,13), # drop first column 'unnamed'
                       encoding='utf-8',
                       quoting=csv.QUOTE_NONNUMERIC,
                       quotechar='\'')

    # keep only the rows whose label is set (Label != -1)
    labeled = data.loc[data['Label'] != -1].reset_index(drop=True)

    ThreeModelApproach.calc_model_1(labeled)

View File

@ -618,27 +618,27 @@
},
{
"cell_type": "code",
"execution_count": 158,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Nachberechnung fürs Latex:\n",
"zero_0 = 80\n",
"zero_1 = 2\n",
"zero_2 = 14\n",
"zero_0 = 0\n",
"zero_1 = 0\n",
"zero_2 = 0\n",
"\n",
"one_0 = 0\n",
"one_1 = 0\n",
"one_2 = 1\n",
"one_0 = 58\n",
"one_1 = 22\n",
"one_2 = 20\n",
"\n",
"two_0 = 0\n",
"two_1 = 0\n",
"two_2 = 3"
"two_2 = 0"
]
},
{
"cell_type": "code",
"execution_count": 129,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@ -650,108 +650,15 @@
]
},
{
"data": {
"text/plain": [
"68"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"6"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"/\n"
"ename": "NameError",
"evalue": "name 'testing_data' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-2-2e477f7d128e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'confusion matrix:'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'###############'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mzero_0\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Estimated'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m&\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Label'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[0mzero_0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mzero_1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Estimated'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m&\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Label'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m 
\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'testing_data' is not defined"
]
},
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"11"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"/\n"
]
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
@ -782,7 +689,7 @@
},
{
"cell_type": "code",
"execution_count": 159,
"execution_count": 11,
"metadata": {
"scrolled": false
},
@ -795,51 +702,51 @@
"\n",
"class 0:\n",
"\n",
"TP: 80\n",
"TN: 4\n",
"FP: 16\n",
"FN: 0\n",
"TP: 0\n",
"TN: 42\n",
"FP: 0\n",
"FN: 58\n",
"\n",
"class 1:\n",
"\n",
"TP: 0\n",
"TN: 97\n",
"FP: 1\n",
"FN: 2\n",
"TP: 22\n",
"TN: 0\n",
"FP: 78\n",
"FN: 0\n",
"\n",
"class 2:\n",
"\n",
"TP: 3\n",
"TN: 82\n",
"TP: 0\n",
"TN: 80\n",
"FP: 0\n",
"FN: 15\n",
"FN: 20\n",
"###############\n",
"\n",
"METRICS:\n",
"\n",
"class 0:\n",
"\n",
"precision: 83.33\n",
"recall: 100.0\n",
"accuracy: 84.0\n",
"precision: 0\n",
"recall: 0.0\n",
"accuracy: 42.0\n",
"\n",
"class 1:\n",
"\n",
"precision: 0.0\n",
"recall: 0.0\n",
"accuracy: 97.0\n",
"precision: 22.0\n",
"recall: 100.0\n",
"accuracy: 22.0\n",
"\n",
"class 2:\n",
"\n",
"precision: 100.0\n",
"recall: 16.67\n",
"accuracy: 85.0\n",
"precision: 0\n",
"recall: 0.0\n",
"accuracy: 80.0\n",
"\n",
"Average Metrics:\n",
"\n",
"precision: 61.111111111111114\n",
"recall: 38.888888888888886\n",
"accuracy: 88.66666666666667\n"
"precision: 7.333333333333333\n",
"recall: 33.333333333333336\n",
"accuracy: 48.0\n"
]
}
],
@ -885,7 +792,7 @@
"print()\n",
"print('class 0:')\n",
"print()\n",
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
"prec_0 = tp_0 #/ (tp_0 + fp_0) * 100\n",
"print('precision: {}'.format(round(prec_0, 2)))\n",
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
"print('recall: {}'.format(round(rec_0, 2)))\n",
@ -903,7 +810,7 @@
"print()\n",
"print('class 2:')\n",
"print()\n",
"prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
"prec_2 = tp_2 #/ (tp_2 + fp_2) * 100\n",
"print('precision: {}'.format(round(prec_2, 2)))\n",
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
"print('recall: {}'.format(round(rec_2, 2)))\n",

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,713 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Last round number: 17\n",
"Number of manually labeled articles: 1412\n",
"Number of manually unlabeled articles: 8588\n"
]
}
],
"source": [
"import csv\n",
"import operator\n",
"import pickle\n",
"import random\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
"# initialize random => reproducible sequence\n",
"random.seed(5)\n",
"random_state=5\n",
"\n",
"# set up wider display area\n",
"pd.set_option('display.max_colwidth', -1)\n",
"\n",
"# read current data set from csv\n",
"df = pd.read_csv('../../data/interactive_labeling_round_17_20190502.csv',\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')\n",
"\n",
"# find current iteration/round number\n",
"m = int(df['Round'].max())\n",
"print('Last round number: {}'.format(m))\n",
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"m = 10\n",
"df.loc[(df['Round'] >= m), 'Label'] = -1\n",
"df.loc[(df['Round'] >= m), 'Round'] = np.nan\n",
"\n",
"len(df.loc[df['Label'] != -1])\n",
"\n",
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"737\n",
"35\n",
"128\n",
"655\n",
"31\n",
"114\n",
"573\n",
"27\n",
"100\n",
"491\n",
"23\n",
"86\n",
"409\n",
"19\n",
"72\n",
"327\n",
"15\n",
"58\n",
"245\n",
"11\n",
"44\n",
"163\n",
"7\n",
"30\n",
"81\n",
"3\n",
"16\n",
"0\n",
"0\n",
"0\n"
]
}
],
"source": [
"sampling_0_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
"sampling_0_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
"sampling_0_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_0_class0['Index'].tolist())]\n",
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_0_class1['Index'].tolist())]\n",
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_0_class2['Index'].tolist())]\n",
"print(len(labeled_pos_0)) # 33\n",
"print(len(labeled_pos_1)) # 33\n",
"print(len(labeled_pos_2)) \n",
"sampling_1_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
"sampling_1_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
"sampling_1_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_1_class0['Index'].tolist())]\n",
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_1_class1['Index'].tolist())]\n",
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_1_class2['Index'].tolist())]\n",
"print(len(labeled_pos_0)) # 33\n",
"print(len(labeled_pos_1)) # 33\n",
"print(len(labeled_pos_2)) \n",
"sampling_2_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
"sampling_2_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
"sampling_2_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_2_class0['Index'].tolist())]\n",
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_2_class1['Index'].tolist())]\n",
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_2_class2['Index'].tolist())]\n",
"print(len(labeled_pos_0)) # 33\n",
"print(len(labeled_pos_1)) # 33\n",
"print(len(labeled_pos_2)) \n",
"sampling_3_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
"sampling_3_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
"sampling_3_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_3_class0['Index'].tolist())]\n",
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_3_class1['Index'].tolist())]\n",
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_3_class2['Index'].tolist())]\n",
"print(len(labeled_pos_0)) # 33\n",
"print(len(labeled_pos_1)) # 33\n",
"print(len(labeled_pos_2)) \n",
"sampling_4_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
"sampling_4_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
"sampling_4_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_4_class0['Index'].tolist())]\n",
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_4_class1['Index'].tolist())]\n",
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_4_class2['Index'].tolist())]\n",
"print(len(labeled_pos_0)) # 33\n",
"print(len(labeled_pos_1)) # 33\n",
"print(len(labeled_pos_2)) \n",
"sampling_5_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
"sampling_5_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
"sampling_5_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_5_class0['Index'].tolist())]\n",
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_5_class1['Index'].tolist())]\n",
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_5_class2['Index'].tolist())]\n",
"print(len(labeled_pos_0)) # 33\n",
"print(len(labeled_pos_1)) # 33\n",
"print(len(labeled_pos_2)) \n",
"sampling_6_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
"sampling_6_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
"sampling_6_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_6_class0['Index'].tolist())]\n",
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_6_class1['Index'].tolist())]\n",
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_6_class2['Index'].tolist())]\n",
"print(len(labeled_pos_0)) # 33\n",
"print(len(labeled_pos_1)) # 33\n",
"print(len(labeled_pos_2)) \n",
"sampling_7_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
"sampling_7_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
"sampling_7_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_7_class0['Index'].tolist())]\n",
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_7_class1['Index'].tolist())]\n",
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_7_class2['Index'].tolist())]\n",
"print(len(labeled_pos_0)) # 33\n",
"print(len(labeled_pos_1)) # 33\n",
"print(len(labeled_pos_2)) \n",
"sampling_8_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
"sampling_8_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
"sampling_8_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_8_class0['Index'].tolist())]\n",
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_8_class1['Index'].tolist())]\n",
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_8_class2['Index'].tolist())]\n",
"print(len(labeled_pos_0)) # 33\n",
"print(len(labeled_pos_1)) # 33\n",
"print(len(labeled_pos_2)) \n",
"sampling_9_class0 = labeled_pos_0.sample(n=81, replace=False, random_state=random_state) # 737\n",
"sampling_9_class1 = labeled_pos_1.sample(n=3, replace=False, random_state=random_state) # 35\n",
"sampling_9_class2 = labeled_pos_2.sample(n=16, replace=False, random_state=random_state) # 128\n",
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_9_class0['Index'].tolist())]\n",
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_9_class1['Index'].tolist())]\n",
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_9_class2['Index'].tolist())]\n",
"print(len(labeled_pos_0)) # 33\n",
"print(len(labeled_pos_1)) # 33\n",
"print(len(labeled_pos_2)) "
]
},
{
"cell_type": "code",
"execution_count": 238,
"metadata": {},
"outputs": [],
"source": [
"# TESTING DATA\n",
"#testing_data = pd.concat([sampling_0_class0, sampling_0_class1, sampling_0_class2])\n",
"#testing_data = pd.concat([sampling_1_class0, sampling_1_class1, sampling_1_class2])\n",
"#testing_data = pd.concat([sampling_2_class0, sampling_2_class1, sampling_2_class2])\n",
"#testing_data = pd.concat([sampling_3_class0, sampling_3_class1, sampling_3_class2])\n",
"#testing_data = pd.concat([sampling_4_class0, sampling_4_class1, sampling_4_class2])\n",
"#testing_data = pd.concat([sampling_5_class0, sampling_5_class1, sampling_5_class2])\n",
"#testing_data = pd.concat([sampling_6_class0, sampling_6_class1, sampling_6_class2])\n",
"#testing_data = pd.concat([sampling_7_class0, sampling_7_class1, sampling_7_class2])\n",
"#testing_data = pd.concat([sampling_8_class0, sampling_8_class1, sampling_8_class2])\n",
"testing_data = pd.concat([sampling_9_class0, sampling_9_class1, sampling_9_class2])"
]
},
{
"cell_type": "code",
"execution_count": 239,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 239,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indices_testing_data = testing_data['Index'].tolist()\n",
"len(testing_data)"
]
},
{
"cell_type": "code",
"execution_count": 240,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"900"
]
},
"execution_count": 240,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# TRAINING DATA\n",
"training_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_testing_data))].reset_index(drop=True)\n",
"indices_training_data = training_data['Index'].tolist()\n",
"len(training_data)"
]
},
{
"cell_type": "code",
"execution_count": 241,
"metadata": {},
"outputs": [],
"source": [
"# Model 2:\n",
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n",
"sampling_class0 = labeled_pos_0.sample(n=18, random_state=random_state) # 737\n",
"sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 0\n",
"sampling_class1 = labeled_pos_1.sample(n=18, random_state=random_state) # 35\n",
"sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 0\n",
"sampling_class2 = labeled_pos_2.sample(n=35, random_state=random_state) # 128\n",
"sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 1\n",
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"\n",
"testing_data.loc[testing_data['Label'] == 0, 'Label'] = 0\n",
"testing_data.loc[testing_data['Label'] == 1, 'Label'] = 0\n",
"testing_data.loc[testing_data['Label'] == 2, 'Label'] = 1\n",
"classifier = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [],
"source": [
"# Model 1: binary task. Original class 1 becomes the positive label (1);\n",
"# original classes 0 and 2 are merged into the negative label (0).\n",
"labeled_pos_0 = training_data[training_data['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = training_data[training_data['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = training_data[training_data['Label'] == 2].reset_index(drop=True)\n",
"\n",
"# Fixed-size random sample per original class: 35 positives, 18 + 18 negatives.\n",
"sampling_class0 = labeled_pos_0.sample(n=18, random_state=random_state)\n",
"sampling_class1 = labeled_pos_1.sample(n=35, random_state=random_state)\n",
"sampling_class2 = labeled_pos_2.sample(n=18, random_state=random_state)\n",
"\n",
"# Relabel each sample in place (original label -> binary label).\n",
"for sample_df, old_label, new_label in ((sampling_class0, 0, 0),\n",
"                                        (sampling_class1, 1, 1),\n",
"                                        (sampling_class2, 2, 0)):\n",
"    sample_df.loc[sample_df['Label'] == old_label, 'Label'] = new_label\n",
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"\n",
"# Same relabelling for the test set, applied in the order 0, 1, 2.\n",
"for old_label, new_label in ((0, 0), (1, 1), (2, 0)):\n",
"    testing_data.loc[testing_data['Label'] == old_label, 'Label'] = new_label\n",
"classifier = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"# Model 0: binary task. Original class 0 becomes the positive label (1);\n",
"# original classes 1 and 2 are merged into the negative label (0).\n",
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n",
"\n",
"# Fixed-size random sample per original class: 35 positives, 18 + 18 negatives.\n",
"# Each sample is a separate frame holding a single original label, so the\n",
"# per-sample relabelling below cannot interfere across classes.\n",
"sampling_class0 = labeled_pos_0.sample(n=35, random_state=random_state)\n",
"sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 1\n",
"sampling_class1 = labeled_pos_1.sample(n=18, random_state=random_state)\n",
"sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 0\n",
"sampling_class2 = labeled_pos_2.sample(n=18, random_state=random_state)\n",
"sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 0\n",
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"\n",
"# Relabel the test set in ONE simultaneous step. Doing it sequentially\n",
"# (0 -> 1, then 1 -> 0) turns the freshly created 1s back into 0s and leaves\n",
"# the test labels without any positive row.\n",
"testing_data['Label'] = testing_data['Label'].replace({0: 1, 1: 0, 2: 0})\n",
"classifier = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"737\n",
"36\n",
"126\n"
]
}
],
"source": [
"# MNB: three-class setup trained on a balanced sample (24 rows per class).\n",
"labeled_pos_0 = training_data[training_data['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = training_data[training_data['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = training_data[training_data['Label'] == 2].reset_index(drop=True)\n",
"\n",
"# Report how many labeled rows each class has before sampling.\n",
"for class_frame in (labeled_pos_0, labeled_pos_1, labeled_pos_2):\n",
"    print(len(class_frame))\n",
"\n",
"# Draw the same number of rows from every class.\n",
"sampling_class0 = labeled_pos_0.sample(n=24, random_state=random_state)\n",
"sampling_class1 = labeled_pos_1.sample(n=24, random_state=random_state)\n",
"sampling_class2 = labeled_pos_2.sample(n=24, random_state=random_state)\n",
"\n",
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"# Remember which articles ended up in the training set.\n",
"indices_training_data = training_data['Index'].tolist()\n",
"classifier = MultinomialNB()"
]
},
{
"cell_type": "code",
"execution_count": 242,
"metadata": {},
"outputs": [],
"source": [
"# Build the model inputs: title and text joined into one string per article.\n",
"X = training_data['Title'] + '. ' + training_data['Text']\n",
"y = training_data['Label']\n",
"\n",
"U = testing_data['Title'] + '. ' + testing_data['Text']\n",
"v = testing_data['Label']\n",
"\n",
"# NOTE: from here on training_data / testing_data are rebound from the\n",
"# DataFrames to dense count matrices, so this cell cannot be re-run without\n",
"# first re-creating the DataFrames.\n",
"cv = CountVectorizer()\n",
"# learn the vocabulary on the training texts only, then reuse it for the test texts\n",
"training_data = cv.fit_transform(X, y).toarray()\n",
"testing_data = cv.transform(U).toarray()\n",
"\n",
"classifier.fit(training_data, y)\n",
"predictions_test = classifier.predict(testing_data)\n",
"\n",
"# Annotate estimated labels: prediction i is written to the df row(s) whose\n",
"# Index equals indices_testing_data[i]; all other rows stay NaN.\n",
"# A single map replaces the former per-index scan over the whole frame.\n",
"if len(indices_testing_data) != len(predictions_test):\n",
"    raise ValueError('indices_testing_data and predictions_test differ in length')\n",
"estimated_by_index = dict(zip(indices_testing_data, predictions_test))\n",
"df['Estimated'] = df['Index'].map(estimated_by_index).astype(float)"
]
},
{
"cell_type": "code",
"execution_count": 243,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n",
"69\n",
"1\n",
"###############\n",
"12\n",
"2\n",
"###############\n",
"metrics:\n",
"\n",
"69\n",
"2\n",
"1\n",
"12\n",
"###############\n",
"2\n",
"69\n",
"12\n",
"1\n",
"###############\n",
"98.57142857142858\n",
"85.18518518518519\n",
"84.52380952380952\n",
"###############\n",
"14.285714285714285\n",
"66.66666666666666\n",
"84.52380952380952\n",
"###############\n",
"56.42857142857143\n",
"75.92592592592592\n",
"84.52380952380952\n"
]
}
],
"source": [
"# Model 0-2: confusion counts and per-class metrics for the binary models.\n",
"# NOTE(review): counts are taken against df['Label']; the model cells relabel\n",
"# training_data / testing_data only -- confirm that df['Label'] holds the\n",
"# binary labels expected here.\n",
"\n",
"def _count(estimated, label):\n",
"    # rows of df predicted as `estimated` whose df['Label'] equals `label`\n",
"    return len(df.loc[(df['Estimated'] == estimated) & (df['Label'] == label)])\n",
"\n",
"def _percent(numerator, denominator):\n",
"    # ratio in percent; 0.0 (instead of ZeroDivisionError) for an empty denominator\n",
"    return numerator / denominator * 100 if denominator else 0.0\n",
"\n",
"# first digit of a name = estimated class, second digit = class in df['Label']\n",
"print('###############')\n",
"zero_0 = _count(0, 0)\n",
"print(zero_0)\n",
"zero_1 = _count(0, 1)\n",
"print(zero_1)\n",
"print('###############')\n",
"one_0 = _count(1, 0)\n",
"print(one_0)\n",
"one_1 = _count(1, 1)\n",
"print(one_1)\n",
"print('###############')\n",
"\n",
"print('metrics:')\n",
"print()\n",
"\n",
"total = zero_0 + zero_1 + one_0 + one_1\n",
"\n",
"# class 0 treated as the positive class\n",
"tp_0 = zero_0\n",
"print(tp_0)\n",
"tn_0 = one_1\n",
"print(tn_0)\n",
"fp_0 = zero_1\n",
"print(fp_0)\n",
"fn_0 = one_0\n",
"print(fn_0)\n",
"print('###############')\n",
"\n",
"# class 1 treated as the positive class\n",
"tp_1 = one_1\n",
"print(tp_1)\n",
"tn_1 = zero_0\n",
"print(tn_1)\n",
"fp_1 = one_0\n",
"print(fp_1)\n",
"fn_1 = zero_1\n",
"print(fn_1)\n",
"print('###############')\n",
"\n",
"# precision / recall / accuracy in percent, class 0\n",
"prec_0 = _percent(tp_0, tp_0 + fp_0)\n",
"print(prec_0)\n",
"rec_0 = _percent(tp_0, tp_0 + fn_0)\n",
"print(rec_0)\n",
"acc_0 = _percent(tp_0 + tn_0, total)\n",
"print(acc_0)\n",
"print('###############')\n",
"\n",
"# precision / recall / accuracy in percent, class 1\n",
"prec_1 = _percent(tp_1, tp_1 + fp_1)\n",
"print(prec_1)\n",
"rec_1 = _percent(tp_1, tp_1 + fn_1)\n",
"print(rec_1)\n",
"acc_1 = _percent(tp_1 + tn_1, total)\n",
"print(acc_1)\n",
"print('###############')\n",
"\n",
"# unweighted mean over both classes\n",
"print((prec_1 + prec_0) / 2)\n",
"print((rec_1 + rec_0) / 2)\n",
"print((acc_1 + acc_0) / 2)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"confusion matrix:\n",
"###############\n",
"62\n",
"0\n",
"0\n",
"/\n",
"12\n",
"3\n",
"11\n",
"/\n",
"8\n",
"0\n",
"5\n",
"###############\n",
"\n",
"class 0:\n",
"\n",
"TP: 62\n",
"TN: 19\n",
"FP: 0\n",
"FN: 20\n",
"\n",
"class 1:\n",
"\n",
"TP: 3\n",
"TN: 75\n",
"FP: 23\n",
"FN: 0\n",
"\n",
"class 2:\n",
"\n",
"TP: 5\n",
"TN: 77\n",
"FP: 8\n",
"FN: 11\n",
"###############\n",
"\n",
"METRICS:\n",
"\n",
"class 0:\n",
"\n",
"precision: 100.0\n",
"recall: 75.61\n",
"accuracy: 80.2\n",
"\n",
"class 1:\n",
"\n",
"precision: 11.54\n",
"recall: 100.0\n",
"accuracy: 77.23\n",
"\n",
"class 2:\n",
"\n",
"precision: 38.46\n",
"recall: 31.25\n",
"accuracy: 81.19\n",
"\n",
"Average Metrics:\n",
"\n",
"precision: 50.0\n",
"recall: 68.95325203252033\n",
"accuracy: 79.53795379537955\n"
]
}
],
"source": [
"# MNB: 3x3 confusion matrix and one-vs-rest metrics for the three-class model.\n",
"\n",
"def _count(estimated, label):\n",
"    # rows of df predicted as `estimated` whose df['Label'] equals `label`\n",
"    return len(df.loc[(df['Estimated'] == estimated) & (df['Label'] == label)])\n",
"\n",
"def _percent(numerator, denominator):\n",
"    # ratio in percent; 0.0 (instead of ZeroDivisionError) for an empty denominator\n",
"    return numerator / denominator * 100 if denominator else 0.0\n",
"\n",
"print('confusion matrix:')\n",
"print('###############')\n",
"# first digit of a name = estimated class, second digit = class in df['Label']\n",
"zero_0 = _count(0, 0)\n",
"print(zero_0)\n",
"zero_1 = _count(0, 1)\n",
"print(zero_1)\n",
"zero_2 = _count(0, 2)\n",
"print(zero_2)\n",
"print('/')\n",
"one_0 = _count(1, 0)\n",
"print(one_0)\n",
"one_1 = _count(1, 1)\n",
"print(one_1)\n",
"one_2 = _count(1, 2)\n",
"print(one_2)\n",
"print('/')\n",
"\n",
"two_0 = _count(2, 0)\n",
"print(two_0)\n",
"two_1 = _count(2, 1)\n",
"print(two_1)\n",
"two_2 = _count(2, 2)\n",
"print(two_2)\n",
"\n",
"print('###############')\n",
"print()\n",
"total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n",
"\n",
"# one-vs-rest counts: each class in turn is the positive class\n",
"print('class 0:')\n",
"print()\n",
"tp_0 = zero_0\n",
"print('TP: {}'.format(tp_0))\n",
"tn_0 = one_1 + one_2 + two_1 + two_2\n",
"print('TN: {}'.format(tn_0))\n",
"fp_0 = zero_1 + zero_2\n",
"print('FP: {}'.format(fp_0))\n",
"fn_0 = one_0 + two_0\n",
"print('FN: {}'.format(fn_0))\n",
"print()\n",
"print('class 1:')\n",
"print()\n",
"tp_1 = one_1\n",
"print('TP: {}'.format(tp_1))\n",
"tn_1 = zero_0 + zero_2 + two_0 + two_2\n",
"print('TN: {}'.format(tn_1))\n",
"fp_1 = one_0 + one_2\n",
"print('FP: {}'.format(fp_1))\n",
"fn_1 = zero_1 + two_1\n",
"print('FN: {}'.format(fn_1))\n",
"print()\n",
"print('class 2:')\n",
"print()\n",
"tp_2 = two_2\n",
"print('TP: {}'.format(tp_2))\n",
"tn_2 = zero_0 + zero_1 + one_0 + one_1\n",
"print('TN: {}'.format(tn_2))\n",
"fp_2 = two_0 + two_1\n",
"print('FP: {}'.format(fp_2))\n",
"fn_2 = zero_2 + one_2\n",
"print('FN: {}'.format(fn_2))\n",
"print('###############')\n",
"print()\n",
"\n",
"# per-class precision / recall / accuracy in percent (printed rounded to 2 digits)\n",
"print('METRICS:')\n",
"print()\n",
"print('class 0:')\n",
"print()\n",
"prec_0 = _percent(tp_0, tp_0 + fp_0)\n",
"print('precision: {}'.format(round(prec_0, 2)))\n",
"rec_0 = _percent(tp_0, tp_0 + fn_0)\n",
"print('recall: {}'.format(round(rec_0, 2)))\n",
"acc_0 = _percent(tp_0 + tn_0, total)\n",
"print('accuracy: {}'.format(round(acc_0, 2)))\n",
"print()\n",
"print('class 1:')\n",
"print()\n",
"prec_1 = _percent(tp_1, tp_1 + fp_1)\n",
"print('precision: {}'.format(round(prec_1, 2)))\n",
"rec_1 = _percent(tp_1, tp_1 + fn_1)\n",
"print('recall: {}'.format(round(rec_1, 2)))\n",
"acc_1 = _percent(tp_1 + tn_1, total)\n",
"print('accuracy: {}'.format(round(acc_1, 2)))\n",
"print()\n",
"print('class 2:')\n",
"print()\n",
"prec_2 = _percent(tp_2, tp_2 + fp_2)\n",
"print('precision: {}'.format(round(prec_2, 2)))\n",
"rec_2 = _percent(tp_2, tp_2 + fn_2)\n",
"print('recall: {}'.format(round(rec_2, 2)))\n",
"acc_2 = _percent(tp_2 + tn_2, total)\n",
"print('accuracy: {}'.format(round(acc_2, 2)))\n",
"print()\n",
"\n",
"# unweighted mean over the three classes, computed from the unrounded values\n",
"print('Average Metrics:')\n",
"print()\n",
"print('precision: {}'.format((prec_1 + prec_2 + prec_0) / 3))\n",
"print('recall: {}'.format((rec_1 + rec_2 + rec_0) / 3))\n",
"print('accuracy: {}'.format((acc_1 + acc_2 + acc_0) / 3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 61 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB