update labeling / documentation
|
@ -146,46 +146,25 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 9,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"ename": "NameError",
|
|
||||||
"evalue": "name 'm' is not defined",
|
|
||||||
"output_type": "error",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|
||||||
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
||||||
"\u001b[1;32m<ipython-input-4-9a40b379906c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mm\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
|
||||||
"\u001b[1;31mNameError\u001b[0m: name 'm' is not defined"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"m"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"m=15"
|
"m=16"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 10,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"This round number: 15\n",
|
"This round number: 16\n",
|
||||||
"Number of manually labeled articles: 1122\n",
|
"Number of manually labeled articles: 1132\n",
|
||||||
"Number of manually unlabeled articles: 8878\n"
|
"Number of manually unlabeled articles: 8868\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@ -205,6 +184,24 @@
|
||||||
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
|
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1082"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
@ -214,14 +211,14 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 13,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"52\n"
|
"50\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@ -242,16 +239,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 14,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"8878"
|
"0"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 9,
|
"execution_count": 14,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,80 +0,0 @@
|
||||||
'''
|
|
||||||
Cosine Similarity
|
|
||||||
=================
|
|
||||||
|
|
||||||
CosineSimilarity measures the similarity between two articles.
|
|
||||||
It calculates c: the cosine of the angle between the articles
|
|
||||||
vectors text_1 and text_2.
|
|
||||||
c = (text_1 * text_2) / (|text_1| * |text_2|).
|
|
||||||
c = 1, if articles are equal => identicalness is 100%
|
|
||||||
0 <= c < 1, else => identicalness is (c*100)%
|
|
||||||
(The greater c, the more similar two articles are.)
|
|
||||||
'''
|
|
||||||
from BagOfWords import BagOfWords
|
|
||||||
|
|
||||||
import csv
|
|
||||||
import math
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
class CosineSimilarity:
    '''Cosine similarity between the word vectors of two articles.'''

    @staticmethod
    def calc_similarity(text_1, text_2, rel_freq=True, stemming=True):
        ''' calculates cosine similarity of two input articles

        Both texts are converted into word vectors over one shared
        vocabulary using the project's BagOfWords helpers. The result is
        (v1 . v2) / (|v1| * |v2|).

        text_1, text_2 -- the two article texts to compare
        rel_freq       -- forwarded unchanged to BagOfWords.make_matrix
        stemming       -- forwarded unchanged to the BagOfWords helpers

        Returns the cosine value, or 0.0 if one of the two vectors has
        length zero (the quotient is undefined in that case).
        '''
        print('# calculating cosine similarity...')
        print()

        # extract words from articles
        extracted_words_1 = BagOfWords.extract_words(text_1, stemming)
        extracted_words_2 = BagOfWords.extract_words(text_2, stemming)
        print(extracted_words_1)
        print(extracted_words_2)

        # insert words into vocab
        both_extracted = [extracted_words_1, extracted_words_2]
        vocab = BagOfWords.make_vocab(both_extracted, stemming)

        # create vectors: row 0 belongs to text_1, row 1 to text_2
        # (rows are read with .iloc, so make_matrix presumably returns a
        # pandas DataFrame -- TODO confirm)
        matrix = BagOfWords.make_matrix(both_extracted, vocab,
                                        rel_freq, stemming)
        row_1 = matrix.iloc[0]
        row_2 = matrix.iloc[1]

        # numerator of formula: dot product of both vectors; iterating the
        # rows in lockstep avoids integer-key lookups on the row objects
        dot_product = sum(a * b for a, b in zip(row_1, row_2))

        # denominator of formula: product of both vector lengths
        norm_1 = math.sqrt(sum(entry ** 2 for entry in row_1))
        norm_2 = math.sqrt(sum(entry ** 2 for entry in row_2))
        denominator = norm_1 * norm_2

        # a vector without any counted word has length 0; report
        # "no similarity" instead of dividing by zero
        if denominator == 0:
            return 0.0

        return dot_product / denominator
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # demo run: compare the first two articles of the data set
    file = '..\\data\\cleaned_data_set_without_header.csv'

    # parser settings for the pipe-separated, single-quoted csv file;
    # only columns 1 and 2 of the first 100 rows are loaded
    read_options = dict(delimiter='|',
                        header=None,
                        index_col=None,
                        engine='python',
                        usecols=[1,2],
                        nrows=100,
                        quoting=csv.QUOTE_NONNUMERIC,
                        quotechar='\'')
    df = pd.read_csv(file, **read_options)

    # join column 1 and column 2 of every row into one text
    texts = df[1] + '. ' + df[2]

    # compare first and second article in data set
    first_article = texts.iloc[0]
    second_article = texts.iloc[1]
    similarity = CosineSimilarity.calc_similarity(first_article,
                                                  second_article,
                                                  rel_freq=True,
                                                  stemming=True)
    print(similarity)
|
|
|
@ -15,7 +15,7 @@ from BagOfWords import BagOfWords
|
||||||
import csv
|
import csv
|
||||||
import operator
|
import operator
|
||||||
|
|
||||||
import graphviz
|
#import graphviz
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn import tree
|
from sklearn import tree
|
||||||
|
@ -26,7 +26,7 @@ from sklearn.model_selection import StratifiedKFold
|
||||||
|
|
||||||
class DecisionTree:
|
class DecisionTree:
|
||||||
|
|
||||||
def make_tree(dataset, sklearn_cv=False, stemming=False, percentile=100):
|
def make_tree(dataset, sklearn_cv=True, stemming=False, percentile=100):
|
||||||
print('# fitting model')
|
print('# fitting model')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
|
@ -131,18 +131,18 @@ class DecisionTree:
|
||||||
print('# starting decision tree')
|
print('# starting decision tree')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
file = '..\\data\\classification_labelled_corrected.csv'
|
file = '..\\data\\interactive_labeling_round_17_20190502.csv'
|
||||||
|
|
||||||
# read csv file
|
# read csv file
|
||||||
print('# reading dataset')
|
print('# reading dataset')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
data = pd.read_csv(file,
|
data = pd.read_csv(file,
|
||||||
sep='|',
|
sep='|',
|
||||||
engine='python',
|
usecols=range(1,13), # drop first column 'unnamed'
|
||||||
decimal='.',
|
encoding='utf-8',
|
||||||
quotechar='\'',
|
quoting=csv.QUOTE_NONNUMERIC,
|
||||||
quoting=csv.QUOTE_NONE)
|
quotechar='\'')
|
||||||
|
|
||||||
make_tree(data)
|
make_tree(data)
|
||||||
|
|
||||||
|
|
|
@ -1,86 +0,0 @@
|
||||||
'''
|
|
||||||
Label Propagation Algorithm for Interactive Labeling
|
|
||||||
====================================================
|
|
||||||
|
|
||||||
Uses scikit learn's implementation of label propagation:
|
|
||||||
Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled
|
|
||||||
data with label propagation.
|
|
||||||
(Technical Report CMU-CALD-02-107, Carnegie Mellon University, 2002.)
|
|
||||||
|
|
||||||
Prints out probabilities for classes needed for interactive labeling.
|
|
||||||
'''
|
|
||||||
|
|
||||||
from BagOfWords import BagOfWords
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
|
||||||
|
|
||||||
from sklearn.metrics import recall_score, precision_score
|
|
||||||
|
|
||||||
from sklearn.semi_supervised import label_propagation
|
|
||||||
|
|
||||||
class LabelPropagation:
    '''Class estimates for unlabeled articles via scikit-learn's
    LabelSpreading, fitted on the manually labeled articles.'''

    @staticmethod
    def propagate_labels(labeled_data, unlabeled_data, sklearn_cv=False):
        ''' fits LabelSpreading on the labeled articles and estimates
        classes for the unlabeled ones

        labeled_data   -- table with the columns 'Title', 'Text', 'Label'
        unlabeled_data -- table with the columns 'Title' and 'Text'
        sklearn_cv     -- True: vectorize with sklearn's CountVectorizer,
                          False: vectorize with the project's BagOfWords

        Returns the tuple (class_probs, predictions) holding the results of
        predict_proba and predict for the unlabeled articles.
        '''
        # NOTE(review): the 'MNB' prefix of the status messages looks like
        # a leftover from the naive bayes module; the texts are kept as is.
        print('# MNB: starting label propagation')

        # assign algorithm
        classifier = label_propagation.LabelSpreading()

        # split labeled data into text and label set
        # join title and text
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        # join title and text of the unlabeled data
        U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']

        if sklearn_cv:
            cv = CountVectorizer()
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X, y).toarray()
            # transform testing data and return the matrix
            testing_data = cv.transform(U).toarray()
        else:
            # use my own BagOfWords python implementation
            stemming = True
            rel_freq = False
            extracted_words = BagOfWords.extract_all_words(X)
            # the vocabulary is built from the labeled texts only and is
            # reused for the unlabeled texts below
            vocab = BagOfWords.make_vocab(extracted_words)

            # fit the training data and then return the matrix
            print('# MNB: fit training data and calculate matrix...')
            print()
            training_data = BagOfWords.make_matrix(extracted_words,
                                                   vocab, rel_freq, stemming)

            # transform testing data and return the matrix
            print('# MNB: transform testing data to matrix...')
            print()
            extracted_words = BagOfWords.extract_all_words(U)
            testing_data = BagOfWords.make_matrix(extracted_words,
                                                  vocab, rel_freq, stemming)

        # fit classifier
        classifier.fit(training_data, y)

        # probability estimates for the test vector (testing_data)
        class_probs = classifier.predict_proba(testing_data)

        predictions = classifier.predict(testing_data)

        print('# MNB: ending label propagation')

        # return vector of class estimates
        return class_probs, predictions
|
|
|
@ -139,7 +139,7 @@ class LabelingPlotter():
|
||||||
|
|
||||||
def plot_cumulative():
|
def plot_cumulative():
|
||||||
# load pickle object
|
# load pickle object
|
||||||
with open('../obj/array_3model_svm_class2.pkl', 'rb') as input:
|
with open('../obj/array_class_probs_round_15_svm_190502.pkl', 'rb') as input:
|
||||||
list = pickle.load(input)
|
list = pickle.load(input)
|
||||||
|
|
||||||
# sort list in descending order
|
# sort list in descending order
|
||||||
|
@ -165,12 +165,12 @@ class LabelingPlotter():
|
||||||
|
|
||||||
#ax.grid(True)
|
#ax.grid(True)
|
||||||
#ax.legend(loc='right')
|
#ax.legend(loc='right')
|
||||||
ax.set_title('Predictions class 2 (SVM)')
|
#ax.set_title('Predictions class 2 (SVM)')
|
||||||
# for iterations
|
# for iterations
|
||||||
#ax.set_xlabel('Highest estimated probability')
|
#ax.set_xlabel('Highest estimated probability')
|
||||||
#ax.set_ylabel('Fraction of articles with this highest estimated probability')
|
#ax.set_ylabel('Fraction of articles with this highest estimated probability')
|
||||||
# for 3-models
|
# for 3-models
|
||||||
ax.set_xlabel('Estimated probability for class 2')
|
ax.set_xlabel('Estimated probabilities after iteration 14')
|
||||||
ax.set_ylabel('Fraction of articles with this probability')
|
ax.set_ylabel('Fraction of articles with this probability')
|
||||||
#plt.axis([0.97, 1, 0.95, 1.01])
|
#plt.axis([0.97, 1, 0.95, 1.01])
|
||||||
#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
|
#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
|
||||||
|
@ -180,8 +180,8 @@ class LabelingPlotter():
|
||||||
#ax.set_xbound(lower=0.5, upper=0.99)
|
#ax.set_xbound(lower=0.5, upper=0.99)
|
||||||
#plt.savefig('..\\visualization\\proba_stratified_round_9.png')
|
#plt.savefig('..\\visualization\\proba_stratified_round_9.png')
|
||||||
#plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
|
#plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
|
||||||
plt.savefig('..\\visualization\\3model_svm_class2.png')
|
#plt.savefig('..\\visualization\\3model_svm_class2.png')
|
||||||
plt.savefig('..\\visualization\\3model_svm_class2.eps')
|
#plt.savefig('..\\visualization\\3model_svm_class2.eps')
|
||||||
|
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
@ -211,5 +211,5 @@ class LabelingPlotter():
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
#LabelingPlotter.plot_correlation()
|
#LabelingPlotter.plot_correlation()
|
||||||
#LabelingPlotter.plot_cumulative()
|
LabelingPlotter.plot_cumulative()
|
||||||
LabelingPlotter.plot_labeling_rounds_naive()
|
#LabelingPlotter.plot_labeling_rounds_naive()
|
|
@ -19,7 +19,7 @@ import csv
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
from sklearn.feature_selection import SelectPercentile
|
from sklearn.feature_selection import SelectPercentile
|
||||||
from sklearn.metrics import f1_score, make_scorer
|
from sklearn.metrics import f1_score, make_scorer, recall_score
|
||||||
from sklearn.model_selection import StratifiedKFold
|
from sklearn.model_selection import StratifiedKFold
|
||||||
from sklearn.model_selection import GridSearchCV
|
from sklearn.model_selection import GridSearchCV
|
||||||
from sklearn.pipeline import Pipeline
|
from sklearn.pipeline import Pipeline
|
||||||
|
@ -56,12 +56,12 @@ class SVM:
|
||||||
|
|
||||||
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
|
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
|
||||||
|
|
||||||
grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75],
|
grid = GridSearchCV(pipeline, {'perc__percentile': [100],
|
||||||
'SVC__kernel': ['linear'],
|
'SVC__kernel': ['linear'],
|
||||||
'SVC__gamma': [0.00001, 0.0001],
|
'SVC__gamma': [0.00001, 0.0001],
|
||||||
'SVC__C': [0.1, 1]},
|
'SVC__C': [0.1, 1]},
|
||||||
cv=skf,
|
cv=skf,
|
||||||
scoring=make_scorer(f1_score))
|
scoring=make_scorer(recall_score))
|
||||||
|
|
||||||
print('# fit classifier')
|
print('# fit classifier')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
'''
|
||||||
|
Comparing Three Model Approach to MNB
|
||||||
|
'''
|
||||||
|
from BagOfWords import BagOfWords
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_selection import SelectPercentile
|
||||||
|
from sklearn.metrics import recall_score, precision_score
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.naive_bayes import GaussianNB
|
||||||
|
|
||||||
|
class ThreeModelApproach:
    '''Evaluation of a Gaussian naive bayes model on labeled articles.'''

    @staticmethod
    def calc_model_1(labeled_data):
        ''' trains GaussianNB on a stratified 75/25 split of the labeled
        articles and prints recall and precision for the held-out part

        labeled_data -- table with the columns 'Title', 'Text', 'Label'

        Nothing is returned; the metrics are only printed.
        '''
        print('# MNB: starting interactive multinomial naives bayes...')
        print()

        # split labeled data into text and label set
        # join title and text
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        cv = CountVectorizer()
        classifier = GaussianNB()

        # hold out 25% of the articles, keeping the label proportions
        # NOTE(review): no random_state is passed, so the split (and the
        # printed metrics) may differ between runs
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            stratify=y,
                                                            test_size=0.25)

        # use sklearn CountVectorizer
        # fit the training data and then return the matrix
        training_data = cv.fit_transform(X_train, y_train).toarray()
        # transform testing data and return the matrix
        testing_data = cv.transform(X_test).toarray()

        # fit classifier
        classifier.fit(training_data, y_train)

        predictions_test = classifier.predict(testing_data)

        # print metrics
        # NOTE(review): both scores are computed with scikit-learn's
        # default averaging -- confirm 'Label' holds only two classes here
        rec = recall_score(y_test, predictions_test)
        print('rec: ' + str(rec))

        prec = precision_score(y_test, predictions_test)
        print('prec: ' + str(prec))
        print('#')
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # csv supplies the quoting constant used below; it is not among the
    # imports at the top of this module, so it is imported here
    import csv

    # NOTE(review): `file` is assigned but the read below uses a different,
    # hard-coded round-11 path -- confirm which data set is intended.
    file = '..\\data\\interactive_labeling_round_17_20190502.csv'

    # read the pipe-separated, single-quoted data set into `df`
    # (the name the call below expects)
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1,13), # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # evaluate on all rows whose 'Label' is not -1
    ThreeModelApproach.calc_model_1(df.loc[df['Label'] != -1].reset_index(drop=True))
|
|
@ -618,27 +618,27 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 158,
|
"execution_count": 10,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Nachberechnung fürs Latex:\n",
|
"# Nachberechnung fürs Latex:\n",
|
||||||
"zero_0 = 80\n",
|
"zero_0 = 0\n",
|
||||||
"zero_1 = 2\n",
|
"zero_1 = 0\n",
|
||||||
"zero_2 = 14\n",
|
"zero_2 = 0\n",
|
||||||
"\n",
|
"\n",
|
||||||
"one_0 = 0\n",
|
"one_0 = 58\n",
|
||||||
"one_1 = 0\n",
|
"one_1 = 22\n",
|
||||||
"one_2 = 1\n",
|
"one_2 = 20\n",
|
||||||
"\n",
|
"\n",
|
||||||
"two_0 = 0\n",
|
"two_0 = 0\n",
|
||||||
"two_1 = 0\n",
|
"two_1 = 0\n",
|
||||||
"two_2 = 3"
|
"two_2 = 0"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 129,
|
"execution_count": 2,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -650,108 +650,15 @@
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"data": {
|
"ename": "NameError",
|
||||||
"text/plain": [
|
"evalue": "name 'testing_data' is not defined",
|
||||||
"68"
|
"output_type": "error",
|
||||||
]
|
"traceback": [
|
||||||
},
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
"execution_count": 129,
|
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||||
"metadata": {},
|
"\u001b[1;32m<ipython-input-2-2e477f7d128e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'confusion matrix:'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'###############'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mzero_0\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Estimated'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m&\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Label'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[0mzero_0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mzero_1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Estimated'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m&\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtesting_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Label'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m 
\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
"output_type": "execute_result"
|
"\u001b[1;31mNameError\u001b[0m: name 'testing_data' is not defined"
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"0"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 129,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"6"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 129,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/\n"
|
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"8"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 129,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 129,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"11"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 129,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"4"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 129,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 129,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 129,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -782,7 +689,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 159,
|
"execution_count": 11,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": false
|
"scrolled": false
|
||||||
},
|
},
|
||||||
|
@ -795,51 +702,51 @@
|
||||||
"\n",
|
"\n",
|
||||||
"class 0:\n",
|
"class 0:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"TP: 80\n",
|
"TP: 0\n",
|
||||||
"TN: 4\n",
|
"TN: 42\n",
|
||||||
"FP: 16\n",
|
"FP: 0\n",
|
||||||
"FN: 0\n",
|
"FN: 58\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class 1:\n",
|
"class 1:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"TP: 0\n",
|
"TP: 22\n",
|
||||||
"TN: 97\n",
|
"TN: 0\n",
|
||||||
"FP: 1\n",
|
"FP: 78\n",
|
||||||
"FN: 2\n",
|
"FN: 0\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class 2:\n",
|
"class 2:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"TP: 3\n",
|
"TP: 0\n",
|
||||||
"TN: 82\n",
|
"TN: 80\n",
|
||||||
"FP: 0\n",
|
"FP: 0\n",
|
||||||
"FN: 15\n",
|
"FN: 20\n",
|
||||||
"###############\n",
|
"###############\n",
|
||||||
"\n",
|
"\n",
|
||||||
"METRICS:\n",
|
"METRICS:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class 0:\n",
|
"class 0:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"precision: 83.33\n",
|
"precision: 0\n",
|
||||||
"recall: 100.0\n",
|
"recall: 0.0\n",
|
||||||
"accuracy: 84.0\n",
|
"accuracy: 42.0\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class 1:\n",
|
"class 1:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"precision: 0.0\n",
|
"precision: 22.0\n",
|
||||||
"recall: 0.0\n",
|
"recall: 100.0\n",
|
||||||
"accuracy: 97.0\n",
|
"accuracy: 22.0\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class 2:\n",
|
"class 2:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"precision: 100.0\n",
|
"precision: 0\n",
|
||||||
"recall: 16.67\n",
|
"recall: 0.0\n",
|
||||||
"accuracy: 85.0\n",
|
"accuracy: 80.0\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Average Metrics:\n",
|
"Average Metrics:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"precision: 61.111111111111114\n",
|
"precision: 7.333333333333333\n",
|
||||||
"recall: 38.888888888888886\n",
|
"recall: 33.333333333333336\n",
|
||||||
"accuracy: 88.66666666666667\n"
|
"accuracy: 48.0\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@ -885,7 +792,7 @@
|
||||||
"print()\n",
|
"print()\n",
|
||||||
"print('class 0:')\n",
|
"print('class 0:')\n",
|
||||||
"print()\n",
|
"print()\n",
|
||||||
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
|
"prec_0 = tp_0 #/ (tp_0 + fp_0) * 100\n",
|
||||||
"print('precision: {}'.format(round(prec_0, 2)))\n",
|
"print('precision: {}'.format(round(prec_0, 2)))\n",
|
||||||
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
|
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
|
||||||
"print('recall: {}'.format(round(rec_0, 2)))\n",
|
"print('recall: {}'.format(round(rec_0, 2)))\n",
|
||||||
|
@ -903,7 +810,7 @@
|
||||||
"print()\n",
|
"print()\n",
|
||||||
"print('class 2:')\n",
|
"print('class 2:')\n",
|
||||||
"print()\n",
|
"print()\n",
|
||||||
"prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
|
"prec_2 = tp_2 #/ (tp_2 + fp_2) * 100\n",
|
||||||
"print('precision: {}'.format(round(prec_2, 2)))\n",
|
"print('precision: {}'.format(round(prec_2, 2)))\n",
|
||||||
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
|
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
|
||||||
"print('recall: {}'.format(round(rec_2, 2)))\n",
|
"print('recall: {}'.format(round(rec_2, 2)))\n",
|
|
@ -0,0 +1,713 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Last round number: 17\n",
|
||||||
|
"Number of manually labeled articles: 1412\n",
|
||||||
|
"Number of manually unlabeled articles: 8588\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import csv\n",
|
||||||
|
"import operator\n",
|
||||||
|
"import pickle\n",
|
||||||
|
"import random\n",
|
||||||
|
"\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||||
|
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
|
||||||
|
"from sklearn.model_selection import StratifiedKFold\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from sklearn.naive_bayes import GaussianNB\n",
|
||||||
|
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||||
|
"\n",
|
||||||
|
"# initialize random => reproducible sequence\n",
|
||||||
|
"random.seed(5)\n",
|
||||||
|
"random_state=5\n",
|
||||||
|
"\n",
|
||||||
|
"# set up wider display area\n",
|
||||||
|
"pd.set_option('display.max_colwidth', -1)\n",
|
||||||
|
"\n",
|
||||||
|
"# read current data set from csv\n",
|
||||||
|
"df = pd.read_csv('../../data/interactive_labeling_round_17_20190502.csv',\n",
|
||||||
|
" sep='|',\n",
|
||||||
|
" usecols=range(1,13), # drop first column 'unnamed'\n",
|
||||||
|
" encoding='utf-8',\n",
|
||||||
|
" quoting=csv.QUOTE_NONNUMERIC,\n",
|
||||||
|
" quotechar='\\'')\n",
|
||||||
|
"\n",
|
||||||
|
"# find current iteration/round number\n",
|
||||||
|
"m = int(df['Round'].max())\n",
|
||||||
|
"print('Last round number: {}'.format(m))\n",
|
||||||
|
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
|
||||||
|
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"m = 10\n",
|
||||||
|
"df.loc[(df['Round'] >= m), 'Label'] = -1\n",
|
||||||
|
"df.loc[(df['Round'] >= m), 'Round'] = np.nan\n",
|
||||||
|
"\n",
|
||||||
|
"len(df.loc[df['Label'] != -1])\n",
|
||||||
|
"\n",
|
||||||
|
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"737\n",
|
||||||
|
"35\n",
|
||||||
|
"128\n",
|
||||||
|
"655\n",
|
||||||
|
"31\n",
|
||||||
|
"114\n",
|
||||||
|
"573\n",
|
||||||
|
"27\n",
|
||||||
|
"100\n",
|
||||||
|
"491\n",
|
||||||
|
"23\n",
|
||||||
|
"86\n",
|
||||||
|
"409\n",
|
||||||
|
"19\n",
|
||||||
|
"72\n",
|
||||||
|
"327\n",
|
||||||
|
"15\n",
|
||||||
|
"58\n",
|
||||||
|
"245\n",
|
||||||
|
"11\n",
|
||||||
|
"44\n",
|
||||||
|
"163\n",
|
||||||
|
"7\n",
|
||||||
|
"30\n",
|
||||||
|
"81\n",
|
||||||
|
"3\n",
|
||||||
|
"16\n",
|
||||||
|
"0\n",
|
||||||
|
"0\n",
|
||||||
|
"0\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"sampling_0_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
|
||||||
|
"sampling_0_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
|
||||||
|
"sampling_0_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
|
||||||
|
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_0_class0['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_0_class1['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_0_class2['Index'].tolist())]\n",
|
||||||
|
"print(len(labeled_pos_0)) # 33\n",
|
||||||
|
"print(len(labeled_pos_1)) # 33\n",
|
||||||
|
"print(len(labeled_pos_2)) \n",
|
||||||
|
"sampling_1_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
|
||||||
|
"sampling_1_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
|
||||||
|
"sampling_1_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
|
||||||
|
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_1_class0['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_1_class1['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_1_class2['Index'].tolist())]\n",
|
||||||
|
"print(len(labeled_pos_0)) # 33\n",
|
||||||
|
"print(len(labeled_pos_1)) # 33\n",
|
||||||
|
"print(len(labeled_pos_2)) \n",
|
||||||
|
"sampling_2_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
|
||||||
|
"sampling_2_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
|
||||||
|
"sampling_2_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
|
||||||
|
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_2_class0['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_2_class1['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_2_class2['Index'].tolist())]\n",
|
||||||
|
"print(len(labeled_pos_0)) # 33\n",
|
||||||
|
"print(len(labeled_pos_1)) # 33\n",
|
||||||
|
"print(len(labeled_pos_2)) \n",
|
||||||
|
"sampling_3_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
|
||||||
|
"sampling_3_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
|
||||||
|
"sampling_3_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
|
||||||
|
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_3_class0['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_3_class1['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_3_class2['Index'].tolist())]\n",
|
||||||
|
"print(len(labeled_pos_0)) # 33\n",
|
||||||
|
"print(len(labeled_pos_1)) # 33\n",
|
||||||
|
"print(len(labeled_pos_2)) \n",
|
||||||
|
"sampling_4_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
|
||||||
|
"sampling_4_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
|
||||||
|
"sampling_4_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
|
||||||
|
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_4_class0['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_4_class1['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_4_class2['Index'].tolist())]\n",
|
||||||
|
"print(len(labeled_pos_0)) # 33\n",
|
||||||
|
"print(len(labeled_pos_1)) # 33\n",
|
||||||
|
"print(len(labeled_pos_2)) \n",
|
||||||
|
"sampling_5_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
|
||||||
|
"sampling_5_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
|
||||||
|
"sampling_5_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
|
||||||
|
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_5_class0['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_5_class1['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_5_class2['Index'].tolist())]\n",
|
||||||
|
"print(len(labeled_pos_0)) # 33\n",
|
||||||
|
"print(len(labeled_pos_1)) # 33\n",
|
||||||
|
"print(len(labeled_pos_2)) \n",
|
||||||
|
"sampling_6_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
|
||||||
|
"sampling_6_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
|
||||||
|
"sampling_6_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
|
||||||
|
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_6_class0['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_6_class1['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_6_class2['Index'].tolist())]\n",
|
||||||
|
"print(len(labeled_pos_0)) # 33\n",
|
||||||
|
"print(len(labeled_pos_1)) # 33\n",
|
||||||
|
"print(len(labeled_pos_2)) \n",
|
||||||
|
"sampling_7_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
|
||||||
|
"sampling_7_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
|
||||||
|
"sampling_7_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
|
||||||
|
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_7_class0['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_7_class1['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_7_class2['Index'].tolist())]\n",
|
||||||
|
"print(len(labeled_pos_0)) # 33\n",
|
||||||
|
"print(len(labeled_pos_1)) # 33\n",
|
||||||
|
"print(len(labeled_pos_2)) \n",
|
||||||
|
"sampling_8_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n",
|
||||||
|
"sampling_8_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n",
|
||||||
|
"sampling_8_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n",
|
||||||
|
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_8_class0['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_8_class1['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_8_class2['Index'].tolist())]\n",
|
||||||
|
"print(len(labeled_pos_0)) # 33\n",
|
||||||
|
"print(len(labeled_pos_1)) # 33\n",
|
||||||
|
"print(len(labeled_pos_2)) \n",
|
||||||
|
"sampling_9_class0 = labeled_pos_0.sample(n=81, replace=False, random_state=random_state) # 737\n",
|
||||||
|
"sampling_9_class1 = labeled_pos_1.sample(n=3, replace=False, random_state=random_state) # 35\n",
|
||||||
|
"sampling_9_class2 = labeled_pos_2.sample(n=16, replace=False, random_state=random_state) # 128\n",
|
||||||
|
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_9_class0['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_9_class1['Index'].tolist())]\n",
|
||||||
|
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_9_class2['Index'].tolist())]\n",
|
||||||
|
"print(len(labeled_pos_0)) # 33\n",
|
||||||
|
"print(len(labeled_pos_1)) # 33\n",
|
||||||
|
"print(len(labeled_pos_2)) "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 238,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# TESTING DATA\n",
|
||||||
|
"#testing_data = pd.concat([sampling_0_class0, sampling_0_class1, sampling_0_class2])\n",
|
||||||
|
"#testing_data = pd.concat([sampling_1_class0, sampling_1_class1, sampling_1_class2])\n",
|
||||||
|
"#testing_data = pd.concat([sampling_2_class0, sampling_2_class1, sampling_2_class2])\n",
|
||||||
|
"#testing_data = pd.concat([sampling_3_class0, sampling_3_class1, sampling_3_class2])\n",
|
||||||
|
"#testing_data = pd.concat([sampling_4_class0, sampling_4_class1, sampling_4_class2])\n",
|
||||||
|
"#testing_data = pd.concat([sampling_5_class0, sampling_5_class1, sampling_5_class2])\n",
|
||||||
|
"#testing_data = pd.concat([sampling_6_class0, sampling_6_class1, sampling_6_class2])\n",
|
||||||
|
"#testing_data = pd.concat([sampling_7_class0, sampling_7_class1, sampling_7_class2])\n",
|
||||||
|
"#testing_data = pd.concat([sampling_8_class0, sampling_8_class1, sampling_8_class2])\n",
|
||||||
|
"testing_data = pd.concat([sampling_9_class0, sampling_9_class1, sampling_9_class2])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 239,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"100"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 239,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"indices_testing_data = testing_data['Index'].tolist()\n",
|
||||||
|
"len(testing_data)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 240,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"900"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 240,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# TRAINING DATA\n",
|
||||||
|
"training_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_testing_data))].reset_index(drop=True)\n",
|
||||||
|
"indices_training_data = training_data['Index'].tolist()\n",
|
||||||
|
"len(training_data)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 241,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Model 2:\n",
|
||||||
|
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n",
|
||||||
|
"sampling_class0 = labeled_pos_0.sample(n=18, random_state=random_state) # 737\n",
|
||||||
|
"sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 0\n",
|
||||||
|
"sampling_class1 = labeled_pos_1.sample(n=18, random_state=random_state) # 35\n",
|
||||||
|
"sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 0\n",
|
||||||
|
"sampling_class2 = labeled_pos_2.sample(n=35, random_state=random_state) # 128\n",
|
||||||
|
"sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 1\n",
|
||||||
|
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
|
||||||
|
"\n",
|
||||||
|
"testing_data.loc[testing_data['Label'] == 0, 'Label'] = 0\n",
|
||||||
|
"testing_data.loc[testing_data['Label'] == 1, 'Label'] = 0\n",
|
||||||
|
"testing_data.loc[testing_data['Label'] == 2, 'Label'] = 1\n",
|
||||||
|
"classifier = GaussianNB()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 181,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Model 1:\n",
|
||||||
|
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n",
|
||||||
|
"sampling_class0 = labeled_pos_0.sample(n=18, random_state=random_state) # 737\n",
|
||||||
|
"sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 0\n",
|
||||||
|
"sampling_class1 = labeled_pos_1.sample(n=35, random_state=random_state) # 35\n",
|
||||||
|
"sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 1\n",
|
||||||
|
"sampling_class2 = labeled_pos_2.sample(n=18, random_state=random_state) # 128\n",
|
||||||
|
"sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 0\n",
|
||||||
|
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
|
||||||
|
"\n",
|
||||||
|
"testing_data.loc[testing_data['Label'] == 0, 'Label'] = 0\n",
|
||||||
|
"testing_data.loc[testing_data['Label'] == 1, 'Label'] = 1\n",
|
||||||
|
"testing_data.loc[testing_data['Label'] == 2, 'Label'] = 0\n",
|
||||||
|
"classifier = GaussianNB()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 121,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Model 0:\n",
|
||||||
|
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n",
|
||||||
|
"sampling_class0 = labeled_pos_0.sample(n=35, random_state=random_state) # 737\n",
|
||||||
|
"sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 1\n",
|
||||||
|
"sampling_class1 = labeled_pos_1.sample(n=18, random_state=random_state) # 35\n",
|
||||||
|
"sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 0\n",
|
||||||
|
"sampling_class2 = labeled_pos_2.sample(n=18, random_state=random_state) # 128\n",
|
||||||
|
"sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 0\n",
|
||||||
|
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
|
||||||
|
"\n",
|
||||||
|
"testing_data.loc[testing_data['Label'] == 0, 'Label'] = 1\n",
|
||||||
|
"testing_data.loc[testing_data['Label'] == 1, 'Label'] = 0\n",
|
||||||
|
"testing_data.loc[testing_data['Label'] == 2, 'Label'] = 0\n",
|
||||||
|
"classifier = GaussianNB()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 61,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"737\n",
|
||||||
|
"36\n",
|
||||||
|
"126\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# MNB:\n",
|
||||||
|
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n",
|
||||||
|
"print(len(labeled_pos_0)) # 33\n",
|
||||||
|
"print(len(labeled_pos_1)) # 33\n",
|
||||||
|
"print(len(labeled_pos_2)) \n",
|
||||||
|
"sampling_class0 = labeled_pos_0.sample(n=24, random_state=random_state) # 737\n",
|
||||||
|
"sampling_class1 = labeled_pos_1.sample(n=24, random_state=random_state) # 35\n",
|
||||||
|
"sampling_class2 = labeled_pos_2.sample(n=24, random_state=random_state) # 128\n",
|
||||||
|
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
|
||||||
|
"indices_training_data = training_data['Index'].tolist()\n",
|
||||||
|
"len(training_data)\n",
|
||||||
|
"classifier = MultinomialNB()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 242,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# split training data into text and label set\n",
|
||||||
|
"# join title and text\n",
|
||||||
|
"X = training_data['Title'] + '. ' + training_data['Text']\n",
|
||||||
|
"y = training_data['Label']\n",
|
||||||
|
"\n",
|
||||||
|
"# split testing data into text and label set\n",
|
||||||
|
"U = testing_data['Title'] + '. ' + testing_data['Text']\n",
|
||||||
|
"v = testing_data['Label']\n",
|
||||||
|
"\n",
|
||||||
|
"cv = CountVectorizer()\n",
|
||||||
|
"# fit the training data and then return the matrix\n",
|
||||||
|
"training_data = cv.fit_transform(X, y).toarray()\n",
|
||||||
|
"# transform testing data and return the matrix\n",
|
||||||
|
"testing_data = cv.transform(U).toarray()\n",
|
||||||
|
"#fit classifier\n",
|
||||||
|
"classifier.fit(training_data, y)\n",
|
||||||
|
"#predict class\n",
|
||||||
|
"predictions_test = classifier.predict(testing_data)\n",
|
||||||
|
"\n",
|
||||||
|
"# annotate estimated labels\n",
|
||||||
|
"df['Estimated'] = np.nan\n",
|
||||||
|
"for i, value in enumerate(indices_testing_data):\n",
|
||||||
|
" df.loc[(df['Index'] == value), 'Estimated'] = predictions_test[i]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 243,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n",
|
||||||
|
"69\n",
|
||||||
|
"1\n",
|
||||||
|
"###############\n",
|
||||||
|
"12\n",
|
||||||
|
"2\n",
|
||||||
|
"###############\n",
|
||||||
|
"metrics:\n",
|
||||||
|
"\n",
|
||||||
|
"69\n",
|
||||||
|
"2\n",
|
||||||
|
"1\n",
|
||||||
|
"12\n",
|
||||||
|
"###############\n",
|
||||||
|
"2\n",
|
||||||
|
"69\n",
|
||||||
|
"12\n",
|
||||||
|
"1\n",
|
||||||
|
"###############\n",
|
||||||
|
"98.57142857142858\n",
|
||||||
|
"85.18518518518519\n",
|
||||||
|
"84.52380952380952\n",
|
||||||
|
"###############\n",
|
||||||
|
"14.285714285714285\n",
|
||||||
|
"66.66666666666666\n",
|
||||||
|
"84.52380952380952\n",
|
||||||
|
"###############\n",
|
||||||
|
"56.42857142857143\n",
|
||||||
|
"75.92592592592592\n",
|
||||||
|
"84.52380952380952\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Model 0-2:\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"zero_0 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 0)])\n",
|
||||||
|
"print(zero_0)\n",
|
||||||
|
"zero_1 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 1)])\n",
|
||||||
|
"print(zero_1)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"one_0 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 0)])\n",
|
||||||
|
"print(one_0)\n",
|
||||||
|
"one_1 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 1)])\n",
|
||||||
|
"print(one_1)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"print('metrics:')\n",
|
||||||
|
"print()\n",
|
||||||
|
"\n",
|
||||||
|
"total = zero_0 + zero_1 + one_0 + one_1\n",
|
||||||
|
"\n",
|
||||||
|
"tp_0 = zero_0\n",
|
||||||
|
"print(tp_0)\n",
|
||||||
|
"tn_0 = one_1\n",
|
||||||
|
"print(tn_0)\n",
|
||||||
|
"fp_0 = zero_1\n",
|
||||||
|
"print(fp_0)\n",
|
||||||
|
"fn_0 = one_0\n",
|
||||||
|
"print(fn_0)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"tp_1 = one_1\n",
|
||||||
|
"print(tp_1)\n",
|
||||||
|
"tn_1 = zero_0\n",
|
||||||
|
"print(tn_1)\n",
|
||||||
|
"fp_1 = one_0\n",
|
||||||
|
"print(fp_1)\n",
|
||||||
|
"fn_1 = zero_1\n",
|
||||||
|
"print(fn_1)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
|
||||||
|
"print(prec_0)\n",
|
||||||
|
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
|
||||||
|
"print(rec_0)\n",
|
||||||
|
"acc_0 = (tp_0 + tn_0) / total * 100\n",
|
||||||
|
"print(acc_0)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
|
||||||
|
"print(prec_1)\n",
|
||||||
|
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
|
||||||
|
"print(rec_1)\n",
|
||||||
|
"acc_1 = (tp_1 + tn_1) / total * 100\n",
|
||||||
|
"print(acc_1)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"print((prec_1 + prec_0) / 2)\n",
|
||||||
|
"print((rec_1 + rec_0) / 2)\n",
|
||||||
|
"print((acc_1 + acc_0) / 2)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 63,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"confusion matrix:\n",
|
||||||
|
"###############\n",
|
||||||
|
"62\n",
|
||||||
|
"0\n",
|
||||||
|
"0\n",
|
||||||
|
"/\n",
|
||||||
|
"12\n",
|
||||||
|
"3\n",
|
||||||
|
"11\n",
|
||||||
|
"/\n",
|
||||||
|
"8\n",
|
||||||
|
"0\n",
|
||||||
|
"5\n",
|
||||||
|
"###############\n",
|
||||||
|
"\n",
|
||||||
|
"class 0:\n",
|
||||||
|
"\n",
|
||||||
|
"TP: 62\n",
|
||||||
|
"TN: 19\n",
|
||||||
|
"FP: 0\n",
|
||||||
|
"FN: 20\n",
|
||||||
|
"\n",
|
||||||
|
"class 1:\n",
|
||||||
|
"\n",
|
||||||
|
"TP: 3\n",
|
||||||
|
"TN: 75\n",
|
||||||
|
"FP: 23\n",
|
||||||
|
"FN: 0\n",
|
||||||
|
"\n",
|
||||||
|
"class 2:\n",
|
||||||
|
"\n",
|
||||||
|
"TP: 5\n",
|
||||||
|
"TN: 77\n",
|
||||||
|
"FP: 8\n",
|
||||||
|
"FN: 11\n",
|
||||||
|
"###############\n",
|
||||||
|
"\n",
|
||||||
|
"METRICS:\n",
|
||||||
|
"\n",
|
||||||
|
"class 0:\n",
|
||||||
|
"\n",
|
||||||
|
"precision: 100.0\n",
|
||||||
|
"recall: 75.61\n",
|
||||||
|
"accuracy: 80.2\n",
|
||||||
|
"\n",
|
||||||
|
"class 1:\n",
|
||||||
|
"\n",
|
||||||
|
"precision: 11.54\n",
|
||||||
|
"recall: 100.0\n",
|
||||||
|
"accuracy: 77.23\n",
|
||||||
|
"\n",
|
||||||
|
"class 2:\n",
|
||||||
|
"\n",
|
||||||
|
"precision: 38.46\n",
|
||||||
|
"recall: 31.25\n",
|
||||||
|
"accuracy: 81.19\n",
|
||||||
|
"\n",
|
||||||
|
"Average Metrics:\n",
|
||||||
|
"\n",
|
||||||
|
"precision: 50.0\n",
|
||||||
|
"recall: 68.95325203252033\n",
|
||||||
|
"accuracy: 79.53795379537955\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# MNB:\n",
|
||||||
|
"print('confusion matrix:')\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"zero_0 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 0)])\n",
|
||||||
|
"print(zero_0)\n",
|
||||||
|
"zero_1 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 1)])\n",
|
||||||
|
"print(zero_1)\n",
|
||||||
|
"zero_2 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 2)])\n",
|
||||||
|
"print(zero_2)\n",
|
||||||
|
"print('/')\n",
|
||||||
|
"one_0 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 0)])\n",
|
||||||
|
"print(one_0)\n",
|
||||||
|
"one_1 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 1)])\n",
|
||||||
|
"print(one_1)\n",
|
||||||
|
"one_2 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 2)])\n",
|
||||||
|
"print(one_2)\n",
|
||||||
|
"print('/')\n",
|
||||||
|
"\n",
|
||||||
|
"two_0 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 0)])\n",
|
||||||
|
"print(two_0)\n",
|
||||||
|
"two_1 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 1)])\n",
|
||||||
|
"print(two_1)\n",
|
||||||
|
"two_2 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 2)])\n",
|
||||||
|
"print(two_2)\n",
|
||||||
|
"\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"print()\n",
|
||||||
|
"total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n",
|
||||||
|
"print('class 0:')\n",
|
||||||
|
"print()\n",
|
||||||
|
"tp_0 = zero_0\n",
|
||||||
|
"print('TP: {}'.format(tp_0))\n",
|
||||||
|
"tn_0 = one_1 + one_2 + two_1 + two_2\n",
|
||||||
|
"print('TN: {}'.format(tn_0))\n",
|
||||||
|
"fp_0 = zero_1 + zero_2\n",
|
||||||
|
"print('FP: {}'.format(fp_0))\n",
|
||||||
|
"fn_0 = one_0 + two_0\n",
|
||||||
|
"print('FN: {}'.format(fn_0))\n",
|
||||||
|
"print()\n",
|
||||||
|
"print('class 1:')\n",
|
||||||
|
"print()\n",
|
||||||
|
"tp_1 = one_1\n",
|
||||||
|
"print('TP: {}'.format(tp_1))\n",
|
||||||
|
"tn_1 = zero_0 + zero_2 + two_0 + two_2\n",
|
||||||
|
"print('TN: {}'.format(tn_1))\n",
|
||||||
|
"fp_1 = one_0 + one_2\n",
|
||||||
|
"print('FP: {}'.format(fp_1))\n",
|
||||||
|
"fn_1 = zero_1 + two_1\n",
|
||||||
|
"print('FN: {}'.format(fn_1))\n",
|
||||||
|
"print()\n",
|
||||||
|
"print('class 2:')\n",
|
||||||
|
"print()\n",
|
||||||
|
"tp_2 = two_2\n",
|
||||||
|
"print('TP: {}'.format(tp_2))\n",
|
||||||
|
"tn_2 = zero_0 + zero_1 + one_0 + one_1\n",
|
||||||
|
"print('TN: {}'.format(tn_2))\n",
|
||||||
|
"fp_2 = two_0 + two_1\n",
|
||||||
|
"print('FP: {}'.format(fp_2))\n",
|
||||||
|
"fn_2 = zero_2 + one_2\n",
|
||||||
|
"print('FN: {}'.format(fn_2))\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"print()\n",
|
||||||
|
"print('METRICS:')\n",
|
||||||
|
"print()\n",
|
||||||
|
"print('class 0:')\n",
|
||||||
|
"print()\n",
|
||||||
|
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
|
||||||
|
"print('precision: {}'.format(round(prec_0, 2)))\n",
|
||||||
|
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
|
||||||
|
"print('recall: {}'.format(round(rec_0, 2)))\n",
|
||||||
|
"acc_0 = (tp_0 + tn_0) / total * 100\n",
|
||||||
|
"print('accuracy: {}'.format(round(acc_0, 2)))\n",
|
||||||
|
"print()\n",
|
||||||
|
"print('class 1:')\n",
|
||||||
|
"print()\n",
|
||||||
|
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
|
||||||
|
"print('precision: {}'.format(round(prec_1, 2)))\n",
|
||||||
|
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
|
||||||
|
"print('recall: {}'.format(round(rec_1, 2)))\n",
|
||||||
|
"acc_1 = (tp_1 + tn_1) / total * 100\n",
|
||||||
|
"print('accuracy: {}'.format(round(acc_1, 2)))\n",
|
||||||
|
"print()\n",
|
||||||
|
"print('class 2:')\n",
|
||||||
|
"print()\n",
|
||||||
|
"prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
|
||||||
|
"print('precision: {}'.format(round(prec_2, 2)))\n",
|
||||||
|
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
|
||||||
|
"print('recall: {}'.format(round(rec_2, 2)))\n",
|
||||||
|
"acc_2 = (tp_2 + tn_2) / total * 100\n",
|
||||||
|
"print('accuracy: {}'.format(round(acc_2, 2)))\n",
|
||||||
|
"print()\n",
|
||||||
|
"print('Average Metrics:')\n",
|
||||||
|
"print()\n",
|
||||||
|
"print('precision: {}'.format((prec_1 + prec_2 + prec_0) / 3))\n",
|
||||||
|
"print('recall: {}'.format((rec_1 + rec_2 + rec_0) / 3))\n",
|
||||||
|
"print('accuracy: {}'.format((acc_1 + acc_2 + acc_0) / 3))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
Before Width: | Height: | Size: 61 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
After Width: | Height: | Size: 20 KiB |