@ -1,80 +0,0 @@ |
|||
''' |
|||
Cosine Similarity |
|||
================= |
|||
|
|||
CosineSimilarity measures the similarity between two articles. |
|||
It calculates c: the cosine of the angle between the articles |
|||
vectors text_1 and text_2. |
|||
c = (text_1 * text_2) / (|text_1| * |text_2|). |
|||
c = 1, if articles are equal => identicalness is 100% |
|||
0 <= c < 1, else => identicalness is (c*100)% |
|||
(The greater c, the more similar two articles are.) |
|||
''' |
|||
from BagOfWords import BagOfWords |
|||
|
|||
import csv |
|||
import math |
|||
|
|||
import pandas as pd |
|||
|
|||
class CosineSimilarity:
    '''Cosine similarity of two articles based on BagOfWords vectors.'''

    @staticmethod
    def calc_similarity(text_1, text_2, rel_freq=True, stemming=True):
        ''' calculates cosine similarity of two input articles

        Both texts are turned into word lists, a shared vocabulary and a
        two-row matrix via BagOfWords; the cosine of the angle between
        the two rows is returned.

        text_1, text_2 -- the two articles to compare
        rel_freq       -- passed through to BagOfWords.make_matrix
        stemming       -- passed through to the BagOfWords helpers

        Returns the cosine similarity as a float. If one of the vectors
        has zero length (e.g. no words were extracted from a text),
        0.0 is returned instead of dividing by zero.
        '''
        print('# calculating cosine similarity...')
        print()

        # extract words from articles
        extracted_words_1 = BagOfWords.extract_words(text_1, stemming)
        extracted_words_2 = BagOfWords.extract_words(text_2, stemming)
        print(extracted_words_1)
        print(extracted_words_2)

        # insert words into vocab
        both_extracted = [extracted_words_1, extracted_words_2]
        vocab = BagOfWords.make_vocab(both_extracted, stemming)

        # create vectors
        # (matrix is accessed via .iloc below: row 0 belongs to text_1,
        # row 1 to text_2)
        matrix = BagOfWords.make_matrix(both_extracted, vocab,
                                        rel_freq, stemming)

        # plain lists avoid integer look-ups on a label-indexed Series
        vector_1 = matrix.iloc[0].tolist()
        vector_2 = matrix.iloc[1].tolist()

        # numerator of formula: dot product of both vectors
        dot_product = sum(a * b for a, b in zip(vector_1, vector_2))

        # denominator of formula: product of the euclidean norms
        norm_1 = math.sqrt(sum(a ** 2 for a in vector_1))
        norm_2 = math.sqrt(sum(b ** 2 for b in vector_2))

        # a zero vector has no direction; treat it as "not similar"
        if norm_1 == 0 or norm_2 == 0:
            return 0.0

        return dot_product / (norm_1 * norm_2)
|||
|
|||
if __name__ == '__main__':
    # data set to read (file without header row)
    file = '..\\data\\cleaned_data_set_without_header.csv'

    # only columns 1 and 2 of the first 100 rows are loaded
    read_options = {
        'delimiter': '|',
        'header': None,
        'index_col': None,
        'engine': 'python',
        'usecols': [1, 2],
        'nrows': 100,
        'quoting': csv.QUOTE_NONNUMERIC,
        'quotechar': '\'',
    }
    df = pd.read_csv(file, **read_options)

    # join both columns into one text per row
    texts = df[1] + '. ' + df[2]

    # compare first and second article in data set
    first_article = texts.iloc[0]
    second_article = texts.iloc[1]
    similarity = CosineSimilarity.calc_similarity(first_article,
                                                  second_article,
                                                  rel_freq=True,
                                                  stemming=True)
    print(similarity)
@ -1,86 +0,0 @@ |
|||
''' |
|||
Label Propagation Algorithm for Interactive Labeling |
|||
==================================================== |
|||
|
|||
Uses scikit learn's implementation of label propagation: |
|||
Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled |
|||
data with label propagation. |
|||
(Technical Report CMU-CALD-02-107, Carnegie Mellon University, 2002.) |
|||
|
|||
Prints out probabilities for classes needed for interactive labeling. |
|||
''' |
|||
|
|||
from BagOfWords import BagOfWords |
|||
|
|||
import pandas as pd |
|||
from sklearn.feature_extraction.text import CountVectorizer |
|||
|
|||
from sklearn.metrics import recall_score, precision_score |
|||
|
|||
from sklearn.semi_supervised import label_propagation |
|||
|
|||
class LabelPropagation:
    '''Label estimation for unlabeled articles with LabelSpreading.'''

    @staticmethod
    def propagate_labels(labeled_data, unlabeled_data, sklearn_cv=False):
        '''Fit on the labeled articles and estimate the unlabeled ones.

        labeled_data   -- indexable by 'Title', 'Text' and 'Label'
        unlabeled_data -- indexable by 'Title' and 'Text'
        sklearn_cv     -- True: vectorize with sklearn's CountVectorizer,
                          False: vectorize with the BagOfWords helpers

        Returns a tuple (class_probs, predictions): the results of the
        classifier's predict_proba() and predict() on the unlabeled
        articles.
        '''
        print('# MNB: starting label propagation')

        # assign algorithm
        classifier = label_propagation.LabelSpreading()

        # split labeled data into text and label set
        # join title and text
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        # join title and text of the unlabeled data
        U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']

        if sklearn_cv:
            cv = CountVectorizer()
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X, y).toarray()
            # transform testing data and return the matrix
            testing_data = cv.transform(U).toarray()
        else:
            # use my own BagOfWords python implementation
            stemming = True
            rel_freq = False
            extracted_words = BagOfWords.extract_all_words(X)
            vocab = BagOfWords.make_vocab(extracted_words)

            # fit the training data and then return the matrix
            print('# MNB: fit training data and calculate matrix...')
            print()
            training_data = BagOfWords.make_matrix(extracted_words,
                                                   vocab, rel_freq, stemming)

            # transform testing data and return the matrix
            # (the vocabulary built from the labeled texts is reused)
            print('# MNB: transform testing data to matrix...')
            print()
            extracted_words = BagOfWords.extract_all_words(U)
            testing_data = BagOfWords.make_matrix(extracted_words,
                                                  vocab, rel_freq, stemming)

        # fit classifier
        classifier.fit(training_data, y)

        # probability estimates for the test vector (testing_data)
        class_probs = classifier.predict_proba(testing_data)

        predictions = classifier.predict(testing_data)

        print('# MNB: ending label propagation')

        # return class probability estimates and predicted classes
        return class_probs, predictions
@ -0,0 +1,70 @@ |
|||
''' |
|||
Comparing Three Model Approach to MNB |
|||
''' |
|||
from BagOfWords import BagOfWords |
|||
|
|||
import pandas as pd |
|||
from sklearn.feature_extraction.text import CountVectorizer |
|||
from sklearn.feature_selection import SelectPercentile |
|||
from sklearn.metrics import recall_score, precision_score |
|||
from sklearn.model_selection import train_test_split |
|||
from sklearn.naive_bayes import GaussianNB |
|||
|
|||
class ThreeModelApproach:
    '''Evaluation of a Gaussian naive Bayes model on a train/test split.'''

    @staticmethod
    def calc_model_1(labeled_data):
        '''Train GaussianNB on 75% of the data and score it on the rest.

        labeled_data -- indexable by 'Title', 'Text' and 'Label'

        Prints recall and precision on the held-out 25% and returns them
        as a tuple (rec, prec).
        '''
        print('# MNB: starting interactive multinomial naives bayes...')
        print()

        # split labeled data into text and label set
        # join title and text
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        cv = CountVectorizer()

        # Gaussian naive Bayes with default parameters
        classifier = GaussianNB()

        # stratified split keeps the label proportions in both parts;
        # no random_state is set, so the split differs between runs
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            stratify=y,
                                                            test_size=0.25)

        # use sklearn CountVectorizer
        # fit the training data and then return the matrix
        training_data = cv.fit_transform(X_train, y_train).toarray()
        # transform testing data and return the matrix
        testing_data = cv.transform(X_test).toarray()

        # fit classifier
        classifier.fit(training_data, y_train)

        predictions_test = classifier.predict(testing_data)

        # print metrics
        # NOTE(review): the scores are computed with sklearn's default
        # averaging -- confirm the labels passed in here are binary
        rec = recall_score(y_test, predictions_test)
        print('rec: ' + str(rec))

        prec = precision_score(y_test, predictions_test)
        print('prec: ' + str(prec))
        print('#')

        return rec, prec
|||
|
|||
if __name__ == '__main__':
    # csv is needed for QUOTE_NONNUMERIC below and is not among the
    # imports at the top of this file
    import csv

    # NOTE(review): an unused variable previously pointed at
    # '..\\data\\interactive_labeling_round_17_20190502.csv' while
    # round 11 was the file passed to read_csv -- confirm which round
    # is intended
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1,13), # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # evaluate on every row whose label is not -1
    ThreeModelApproach.calc_model_1(df.loc[df['Label'] != -1].reset_index(drop=True))
@ -0,0 +1,713 @@ |
|||
{ |
|||
"cells": [ |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 1, |
|||
"metadata": {}, |
|||
"outputs": [ |
|||
{ |
|||
"name": "stdout", |
|||
"output_type": "stream", |
|||
"text": [ |
|||
"Last round number: 17\n", |
|||
"Number of manually labeled articles: 1412\n", |
|||
"Number of manually unlabeled articles: 8588\n" |
|||
] |
|||
} |
|||
], |
|||
"source": [ |
|||
"import csv\n", |
|||
"import operator\n", |
|||
"import pickle\n", |
|||
"import random\n", |
|||
"\n", |
|||
"import numpy as np\n", |
|||
"import pandas as pd\n", |
|||
"from sklearn.feature_extraction.text import CountVectorizer\n", |
|||
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n", |
|||
"from sklearn.model_selection import StratifiedKFold\n", |
|||
"from sklearn.model_selection import train_test_split\n", |
|||
"from sklearn.naive_bayes import GaussianNB\n", |
|||
"from sklearn.naive_bayes import MultinomialNB\n", |
|||
"\n", |
|||
"# initialize random => reproducible sequence\n", |
|||
"random.seed(5)\n", |
|||
"random_state=5\n", |
|||
"\n", |
|||
"# set up wider display area\n", |
|||
"pd.set_option('display.max_colwidth', -1)\n", |
|||
"\n", |
|||
"# read current data set from csv\n", |
|||
"df = pd.read_csv('../../data/interactive_labeling_round_17_20190502.csv',\n", |
|||
" sep='|',\n", |
|||
" usecols=range(1,13), # drop first column 'unnamed'\n", |
|||
" encoding='utf-8',\n", |
|||
" quoting=csv.QUOTE_NONNUMERIC,\n", |
|||
" quotechar='\\'')\n", |
|||
"\n", |
|||
"# find current iteration/round number\n", |
|||
"m = int(df['Round'].max())\n", |
|||
"print('Last round number: {}'.format(m))\n", |
|||
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n", |
|||
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 2, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"m = 10\n", |
|||
"df.loc[(df['Round'] >= m), 'Label'] = -1\n", |
|||
"df.loc[(df['Round'] >= m), 'Round'] = np.nan\n", |
|||
"\n", |
|||
"len(df.loc[df['Label'] != -1])\n", |
|||
"\n", |
|||
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n", |
|||
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n", |
|||
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 3, |
|||
"metadata": {}, |
|||
"outputs": [ |
|||
{ |
|||
"name": "stdout", |
|||
"output_type": "stream", |
|||
"text": [ |
|||
"737\n", |
|||
"35\n", |
|||
"128\n", |
|||
"655\n", |
|||
"31\n", |
|||
"114\n", |
|||
"573\n", |
|||
"27\n", |
|||
"100\n", |
|||
"491\n", |
|||
"23\n", |
|||
"86\n", |
|||
"409\n", |
|||
"19\n", |
|||
"72\n", |
|||
"327\n", |
|||
"15\n", |
|||
"58\n", |
|||
"245\n", |
|||
"11\n", |
|||
"44\n", |
|||
"163\n", |
|||
"7\n", |
|||
"30\n", |
|||
"81\n", |
|||
"3\n", |
|||
"16\n", |
|||
"0\n", |
|||
"0\n", |
|||
"0\n" |
|||
] |
|||
} |
|||
], |
|||
"source": [ |
|||
"sampling_0_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n", |
|||
"sampling_0_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n", |
|||
"sampling_0_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n", |
|||
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_0_class0['Index'].tolist())]\n", |
|||
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_0_class1['Index'].tolist())]\n", |
|||
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_0_class2['Index'].tolist())]\n", |
|||
"print(len(labeled_pos_0)) # 33\n", |
|||
"print(len(labeled_pos_1)) # 33\n", |
|||
"print(len(labeled_pos_2)) \n", |
|||
"sampling_1_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n", |
|||
"sampling_1_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n", |
|||
"sampling_1_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n", |
|||
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_1_class0['Index'].tolist())]\n", |
|||
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_1_class1['Index'].tolist())]\n", |
|||
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_1_class2['Index'].tolist())]\n", |
|||
"print(len(labeled_pos_0)) # 33\n", |
|||
"print(len(labeled_pos_1)) # 33\n", |
|||
"print(len(labeled_pos_2)) \n", |
|||
"sampling_2_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n", |
|||
"sampling_2_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n", |
|||
"sampling_2_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n", |
|||
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_2_class0['Index'].tolist())]\n", |
|||
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_2_class1['Index'].tolist())]\n", |
|||
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_2_class2['Index'].tolist())]\n", |
|||
"print(len(labeled_pos_0)) # 33\n", |
|||
"print(len(labeled_pos_1)) # 33\n", |
|||
"print(len(labeled_pos_2)) \n", |
|||
"sampling_3_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n", |
|||
"sampling_3_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n", |
|||
"sampling_3_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n", |
|||
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_3_class0['Index'].tolist())]\n", |
|||
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_3_class1['Index'].tolist())]\n", |
|||
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_3_class2['Index'].tolist())]\n", |
|||
"print(len(labeled_pos_0)) # 33\n", |
|||
"print(len(labeled_pos_1)) # 33\n", |
|||
"print(len(labeled_pos_2)) \n", |
|||
"sampling_4_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n", |
|||
"sampling_4_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n", |
|||
"sampling_4_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n", |
|||
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_4_class0['Index'].tolist())]\n", |
|||
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_4_class1['Index'].tolist())]\n", |
|||
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_4_class2['Index'].tolist())]\n", |
|||
"print(len(labeled_pos_0)) # 33\n", |
|||
"print(len(labeled_pos_1)) # 33\n", |
|||
"print(len(labeled_pos_2)) \n", |
|||
"sampling_5_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n", |
|||
"sampling_5_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n", |
|||
"sampling_5_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n", |
|||
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_5_class0['Index'].tolist())]\n", |
|||
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_5_class1['Index'].tolist())]\n", |
|||
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_5_class2['Index'].tolist())]\n", |
|||
"print(len(labeled_pos_0)) # 33\n", |
|||
"print(len(labeled_pos_1)) # 33\n", |
|||
"print(len(labeled_pos_2)) \n", |
|||
"sampling_6_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n", |
|||
"sampling_6_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n", |
|||
"sampling_6_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n", |
|||
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_6_class0['Index'].tolist())]\n", |
|||
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_6_class1['Index'].tolist())]\n", |
|||
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_6_class2['Index'].tolist())]\n", |
|||
"print(len(labeled_pos_0)) # 33\n", |
|||
"print(len(labeled_pos_1)) # 33\n", |
|||
"print(len(labeled_pos_2)) \n", |
|||
"sampling_7_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n", |
|||
"sampling_7_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n", |
|||
"sampling_7_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n", |
|||
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_7_class0['Index'].tolist())]\n", |
|||
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_7_class1['Index'].tolist())]\n", |
|||
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_7_class2['Index'].tolist())]\n", |
|||
"print(len(labeled_pos_0)) # 33\n", |
|||
"print(len(labeled_pos_1)) # 33\n", |
|||
"print(len(labeled_pos_2)) \n", |
|||
"sampling_8_class0 = labeled_pos_0.sample(n=82, replace=False, random_state=random_state) # 737\n", |
|||
"sampling_8_class1 = labeled_pos_1.sample(n=4, replace=False, random_state=random_state) # 35\n", |
|||
"sampling_8_class2 = labeled_pos_2.sample(n=14, replace=False, random_state=random_state) # 128\n", |
|||
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_8_class0['Index'].tolist())]\n", |
|||
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_8_class1['Index'].tolist())]\n", |
|||
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_8_class2['Index'].tolist())]\n", |
|||
"print(len(labeled_pos_0)) # 33\n", |
|||
"print(len(labeled_pos_1)) # 33\n", |
|||
"print(len(labeled_pos_2)) \n", |
|||
"sampling_9_class0 = labeled_pos_0.sample(n=81, replace=False, random_state=random_state) # 737\n", |
|||
"sampling_9_class1 = labeled_pos_1.sample(n=3, replace=False, random_state=random_state) # 35\n", |
|||
"sampling_9_class2 = labeled_pos_2.sample(n=16, replace=False, random_state=random_state) # 128\n", |
|||
"labeled_pos_0 = labeled_pos_0.loc[~labeled_pos_0['Index'].isin(sampling_9_class0['Index'].tolist())]\n", |
|||
"labeled_pos_1 = labeled_pos_1.loc[~labeled_pos_1['Index'].isin(sampling_9_class1['Index'].tolist())]\n", |
|||
"labeled_pos_2 = labeled_pos_2.loc[~labeled_pos_2['Index'].isin(sampling_9_class2['Index'].tolist())]\n", |
|||
"print(len(labeled_pos_0)) # 33\n", |
|||
"print(len(labeled_pos_1)) # 33\n", |
|||
"print(len(labeled_pos_2)) " |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 238, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"# TESTING DATA\n", |
|||
"#testing_data = pd.concat([sampling_0_class0, sampling_0_class1, sampling_0_class2])\n", |
|||
"#testing_data = pd.concat([sampling_1_class0, sampling_1_class1, sampling_1_class2])\n", |
|||
"#testing_data = pd.concat([sampling_2_class0, sampling_2_class1, sampling_2_class2])\n", |
|||
"#testing_data = pd.concat([sampling_3_class0, sampling_3_class1, sampling_3_class2])\n", |
|||
"#testing_data = pd.concat([sampling_4_class0, sampling_4_class1, sampling_4_class2])\n", |
|||
"#testing_data = pd.concat([sampling_5_class0, sampling_5_class1, sampling_5_class2])\n", |
|||
"#testing_data = pd.concat([sampling_6_class0, sampling_6_class1, sampling_6_class2])\n", |
|||
"#testing_data = pd.concat([sampling_7_class0, sampling_7_class1, sampling_7_class2])\n", |
|||
"#testing_data = pd.concat([sampling_8_class0, sampling_8_class1, sampling_8_class2])\n", |
|||
"testing_data = pd.concat([sampling_9_class0, sampling_9_class1, sampling_9_class2])" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 239, |
|||
"metadata": {}, |
|||
"outputs": [ |
|||
{ |
|||
"data": { |
|||
"text/plain": [ |
|||
"100" |
|||
] |
|||
}, |
|||
"execution_count": 239, |
|||
"metadata": {}, |
|||
"output_type": "execute_result" |
|||
} |
|||
], |
|||
"source": [ |
|||
"indices_testing_data = testing_data['Index'].tolist()\n", |
|||
"len(testing_data)" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 240, |
|||
"metadata": {}, |
|||
"outputs": [ |
|||
{ |
|||
"data": { |
|||
"text/plain": [ |
|||
"900" |
|||
] |
|||
}, |
|||
"execution_count": 240, |
|||
"metadata": {}, |
|||
"output_type": "execute_result" |
|||
} |
|||
], |
|||
"source": [ |
|||
"# TRAINING DATA\n", |
|||
"training_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_testing_data))].reset_index(drop=True)\n", |
|||
"indices_training_data = training_data['Index'].tolist()\n", |
|||
"len(training_data)" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 241, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"# Model 2:\n", |
|||
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n", |
|||
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n", |
|||
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n", |
|||
"sampling_class0 = labeled_pos_0.sample(n=18, random_state=random_state) # 737\n", |
|||
"sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 0\n", |
|||
"sampling_class1 = labeled_pos_1.sample(n=18, random_state=random_state) # 35\n", |
|||
"sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 0\n", |
|||
"sampling_class2 = labeled_pos_2.sample(n=35, random_state=random_state) # 128\n", |
|||
"sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 1\n", |
|||
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n", |
|||
"\n", |
|||
"testing_data.loc[testing_data['Label'] == 0, 'Label'] = 0\n", |
|||
"testing_data.loc[testing_data['Label'] == 1, 'Label'] = 0\n", |
|||
"testing_data.loc[testing_data['Label'] == 2, 'Label'] = 1\n", |
|||
"classifier = GaussianNB()" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 181, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"# Model 1:\n", |
|||
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n", |
|||
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n", |
|||
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n", |
|||
"sampling_class0 = labeled_pos_0.sample(n=18, random_state=random_state) # 737\n", |
|||
"sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 0\n", |
|||
"sampling_class1 = labeled_pos_1.sample(n=35, random_state=random_state) # 35\n", |
|||
"sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 1\n", |
|||
"sampling_class2 = labeled_pos_2.sample(n=18, random_state=random_state) # 128\n", |
|||
"sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 0\n", |
|||
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n", |
|||
"\n", |
|||
"testing_data.loc[testing_data['Label'] == 0, 'Label'] = 0\n", |
|||
"testing_data.loc[testing_data['Label'] == 1, 'Label'] = 1\n", |
|||
"testing_data.loc[testing_data['Label'] == 2, 'Label'] = 0\n", |
|||
"classifier = GaussianNB()" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 121, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"# Model 0:\n", |
|||
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n", |
|||
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n", |
|||
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n", |
|||
"sampling_class0 = labeled_pos_0.sample(n=35, random_state=random_state) # 737\n", |
|||
"sampling_class0.loc[sampling_class0['Label'] == 0, 'Label'] = 1\n", |
|||
"sampling_class1 = labeled_pos_1.sample(n=18, random_state=random_state) # 35\n", |
|||
"sampling_class1.loc[sampling_class1['Label'] == 1, 'Label'] = 0\n", |
|||
"sampling_class2 = labeled_pos_2.sample(n=18, random_state=random_state) # 128\n", |
|||
"sampling_class2.loc[sampling_class2['Label'] == 2, 'Label'] = 0\n", |
|||
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n", |
|||
"\n", |
|||
"testing_data.loc[testing_data['Label'] == 0, 'Label'] = 1\n", |
|||
"testing_data.loc[testing_data['Label'] == 1, 'Label'] = 0\n", |
|||
"testing_data.loc[testing_data['Label'] == 2, 'Label'] = 0\n", |
|||
"classifier = GaussianNB()" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 61, |
|||
"metadata": {}, |
|||
"outputs": [ |
|||
{ |
|||
"name": "stdout", |
|||
"output_type": "stream", |
|||
"text": [ |
|||
"737\n", |
|||
"36\n", |
|||
"126\n" |
|||
] |
|||
} |
|||
], |
|||
"source": [ |
|||
"# MNB:\n", |
|||
"labeled_pos_0 = training_data.loc[training_data['Label'] == 0].reset_index(drop=True)\n", |
|||
"labeled_pos_1 = training_data.loc[training_data['Label'] == 1].reset_index(drop=True)\n", |
|||
"labeled_pos_2 = training_data.loc[training_data['Label'] == 2].reset_index(drop=True)\n", |
|||
"print(len(labeled_pos_0)) # 33\n", |
|||
"print(len(labeled_pos_1)) # 33\n", |
|||
"print(len(labeled_pos_2)) \n", |
|||
"sampling_class0 = labeled_pos_0.sample(n=24, random_state=random_state) # 737\n", |
|||
"sampling_class1 = labeled_pos_1.sample(n=24, random_state=random_state) # 35\n", |
|||
"sampling_class2 = labeled_pos_2.sample(n=24, random_state=random_state) # 128\n", |
|||
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n", |
|||
"indices_training_data = training_data['Index'].tolist()\n", |
|||
"len(training_data)\n", |
|||
"classifier = MultinomialNB()" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 242, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"# split training data into text and label set\n", |
|||
"# join title and text\n", |
|||
"X = training_data['Title'] + '. ' + training_data['Text']\n", |
|||
"y = training_data['Label']\n", |
|||
"\n", |
|||
"# split testing data into text and label set\n", |
|||
"U = testing_data['Title'] + '. ' + testing_data['Text']\n", |
|||
"v = testing_data['Label']\n", |
|||
"\n", |
|||
"cv = CountVectorizer()\n", |
|||
"# fit the training data and then return the matrix\n", |
|||
"training_data = cv.fit_transform(X, y).toarray()\n", |
|||
"# transform testing data and return the matrix\n", |
|||
"testing_data = cv.transform(U).toarray()\n", |
|||
"#fit classifier\n", |
|||
"classifier.fit(training_data, y)\n", |
|||
"#predict class\n", |
|||
"predictions_test = classifier.predict(testing_data)\n", |
|||
"\n", |
|||
"# annotate estimated labels\n", |
|||
"df['Estimated'] = np.nan\n", |
|||
"for i, value in enumerate(indices_testing_data):\n", |
|||
" df.loc[(df['Index'] == value), 'Estimated'] = predictions_test[i]" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 243, |
|||
"metadata": { |
|||
"scrolled": true |
|||
}, |
|||
"outputs": [ |
|||
{ |
|||
"name": "stdout", |
|||
"output_type": "stream", |
|||
"text": [ |
|||
"###############\n", |
|||
"69\n", |
|||
"1\n", |
|||
"###############\n", |
|||
"12\n", |
|||
"2\n", |
|||
"###############\n", |
|||
"metrics:\n", |
|||
"\n", |
|||
"69\n", |
|||
"2\n", |
|||
"1\n", |
|||
"12\n", |
|||
"###############\n", |
|||
"2\n", |
|||
"69\n", |
|||
"12\n", |
|||
"1\n", |
|||
"###############\n", |
|||
"98.57142857142858\n", |
|||
"85.18518518518519\n", |
|||
"84.52380952380952\n", |
|||
"###############\n", |
|||
"14.285714285714285\n", |
|||
"66.66666666666666\n", |
|||
"84.52380952380952\n", |
|||
"###############\n", |
|||
"56.42857142857143\n", |
|||
"75.92592592592592\n", |
|||
"84.52380952380952\n" |
|||
] |
|||
} |
|||
], |
|||
"source": [ |
|||
"# Model 0-2:\n", |
|||
"print('###############')\n", |
|||
"zero_0 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 0)])\n", |
|||
"print(zero_0)\n", |
|||
"zero_1 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 1)])\n", |
|||
"print(zero_1)\n", |
|||
"print('###############')\n", |
|||
"one_0 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 0)])\n", |
|||
"print(one_0)\n", |
|||
"one_1 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 1)])\n", |
|||
"print(one_1)\n", |
|||
"print('###############')\n", |
|||
"\n", |
|||
"print('metrics:')\n", |
|||
"print()\n", |
|||
"\n", |
|||
"total = zero_0 + zero_1 + one_0 + one_1\n", |
|||
"\n", |
|||
"tp_0 = zero_0\n", |
|||
"print(tp_0)\n", |
|||
"tn_0 = one_1\n", |
|||
"print(tn_0)\n", |
|||
"fp_0 = zero_1\n", |
|||
"print(fp_0)\n", |
|||
"fn_0 = one_0\n", |
|||
"print(fn_0)\n", |
|||
"print('###############')\n", |
|||
"\n", |
|||
"tp_1 = one_1\n", |
|||
"print(tp_1)\n", |
|||
"tn_1 = zero_0\n", |
|||
"print(tn_1)\n", |
|||
"fp_1 = one_0\n", |
|||
"print(fp_1)\n", |
|||
"fn_1 = zero_1\n", |
|||
"print(fn_1)\n", |
|||
"print('###############')\n", |
|||
"\n", |
|||
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n", |
|||
"print(prec_0)\n", |
|||
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n", |
|||
"print(rec_0)\n", |
|||
"acc_0 = (tp_0 + tn_0) / total * 100\n", |
|||
"print(acc_0)\n", |
|||
"print('###############')\n", |
|||
"\n", |
|||
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n", |
|||
"print(prec_1)\n", |
|||
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n", |
|||
"print(rec_1)\n", |
|||
"acc_1 = (tp_1 + tn_1) / total * 100\n", |
|||
"print(acc_1)\n", |
|||
"print('###############')\n", |
|||
"\n", |
|||
"print((prec_1 + prec_0) / 2)\n", |
|||
"print((rec_1 + rec_0) / 2)\n", |
|||
"print((acc_1 + acc_0) / 2)" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 63, |
|||
"metadata": { |
|||
"scrolled": true |
|||
}, |
|||
"outputs": [ |
|||
{ |
|||
"name": "stdout", |
|||
"output_type": "stream", |
|||
"text": [ |
|||
"confusion matrix:\n", |
|||
"###############\n", |
|||
"62\n", |
|||
"0\n", |
|||
"0\n", |
|||
"/\n", |
|||
"12\n", |
|||
"3\n", |
|||
"11\n", |
|||
"/\n", |
|||
"8\n", |
|||
"0\n", |
|||
"5\n", |
|||
"###############\n", |
|||
"\n", |
|||
"class 0:\n", |
|||
"\n", |
|||
"TP: 62\n", |
|||
"TN: 19\n", |
|||
"FP: 0\n", |
|||
"FN: 20\n", |
|||
"\n", |
|||
"class 1:\n", |
|||
"\n", |
|||
"TP: 3\n", |
|||
"TN: 75\n", |
|||
"FP: 23\n", |
|||
"FN: 0\n", |
|||
"\n", |
|||
"class 2:\n", |
|||
"\n", |
|||
"TP: 5\n", |
|||
"TN: 77\n", |
|||
"FP: 8\n", |
|||
"FN: 11\n", |
|||
"###############\n", |
|||
"\n", |
|||
"METRICS:\n", |
|||
"\n", |
|||
"class 0:\n", |
|||
"\n", |
|||
"precision: 100.0\n", |
|||
"recall: 75.61\n", |
|||
"accuracy: 80.2\n", |
|||
"\n", |
|||
"class 1:\n", |
|||
"\n", |
|||
"precision: 11.54\n", |
|||
"recall: 100.0\n", |
|||
"accuracy: 77.23\n", |
|||
"\n", |
|||
"class 2:\n", |
|||
"\n", |
|||
"precision: 38.46\n", |
|||
"recall: 31.25\n", |
|||
"accuracy: 81.19\n", |
|||
"\n", |
|||
"Average Metrics:\n", |
|||
"\n", |
|||
"precision: 50.0\n", |
|||
"recall: 68.95325203252033\n", |
|||
"accuracy: 79.53795379537955\n" |
|||
] |
|||
} |
|||
], |
|||
"source": [ |
|||
"# MNB:\n", |
|||
"print('confusion matrix:')\n", |
|||
"print('###############')\n", |
|||
"zero_0 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 0)])\n", |
|||
"print(zero_0)\n", |
|||
"zero_1 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 1)])\n", |
|||
"print(zero_1)\n", |
|||
"zero_2 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 2)])\n", |
|||
"print(zero_2)\n", |
|||
"print('/')\n", |
|||
"one_0 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 0)])\n", |
|||
"print(one_0)\n", |
|||
"one_1 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 1)])\n", |
|||
"print(one_1)\n", |
|||
"one_2 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 2)])\n", |
|||
"print(one_2)\n", |
|||
"print('/')\n", |
|||
"\n", |
|||
"two_0 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 0)])\n", |
|||
"print(two_0)\n", |
|||
"two_1 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 1)])\n", |
|||
"print(two_1)\n", |
|||
"two_2 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 2)])\n", |
|||
"print(two_2)\n", |
|||
"\n", |
|||
"print('###############')\n", |
|||
"print()\n", |
|||
"total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n", |
|||
"print('class 0:')\n", |
|||
"print()\n", |
|||
"tp_0 = zero_0\n", |
|||
"print('TP: {}'.format(tp_0))\n", |
|||
"tn_0 = one_1 + one_2 + two_1 + two_2\n", |
|||
"print('TN: {}'.format(tn_0))\n", |
|||
"fp_0 = zero_1 + zero_2\n", |
|||
"print('FP: {}'.format(fp_0))\n", |
|||
"fn_0 = one_0 + two_0\n", |
|||
"print('FN: {}'.format(fn_0))\n", |
|||
"print()\n", |
|||
"print('class 1:')\n", |
|||
"print()\n", |
|||
"tp_1 = one_1\n", |
|||
"print('TP: {}'.format(tp_1))\n", |
|||
"tn_1 = zero_0 + zero_2 + two_0 + two_2\n", |
|||
"print('TN: {}'.format(tn_1))\n", |
|||
"fp_1 = one_0 + one_2\n", |
|||
"print('FP: {}'.format(fp_1))\n", |
|||
"fn_1 = zero_1 + two_1\n", |
|||
"print('FN: {}'.format(fn_1))\n", |
|||
"print()\n", |
|||
"print('class 2:')\n", |
|||
"print()\n", |
|||
"tp_2 = two_2\n", |
|||
"print('TP: {}'.format(tp_2))\n", |
|||
"tn_2 = zero_0 + zero_1 + one_0 + one_1\n", |
|||
"print('TN: {}'.format(tn_2))\n", |
|||
"fp_2 = two_0 + two_1\n", |
|||
"print('FP: {}'.format(fp_2))\n", |
|||
"fn_2 = zero_2 + one_2\n", |
|||
"print('FN: {}'.format(fn_2))\n", |
|||
"print('###############')\n", |
|||
"print()\n", |
|||
"print('METRICS:')\n", |
|||
"print()\n", |
|||
"print('class 0:')\n", |
|||
"print()\n", |
|||
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n", |
|||
"print('precision: {}'.format(round(prec_0, 2)))\n", |
|||
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n", |
|||
"print('recall: {}'.format(round(rec_0, 2)))\n", |
|||
"acc_0 = (tp_0 + tn_0) / total * 100\n", |
|||
"print('accuracy: {}'.format(round(acc_0, 2)))\n", |
|||
"print()\n", |
|||
"print('class 1:')\n", |
|||
"print()\n", |
|||
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n", |
|||
"print('precision: {}'.format(round(prec_1, 2)))\n", |
|||
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n", |
|||
"print('recall: {}'.format(round(rec_1, 2)))\n", |
|||
"acc_1 = (tp_1 + tn_1) / total * 100\n", |
|||
"print('accuracy: {}'.format(round(acc_1, 2)))\n", |
|||
"print()\n", |
|||
"print('class 2:')\n", |
|||
"print()\n", |
|||
"prec_2 = tp_2 / (tp_2 + fp_2) * 100\n", |
|||
"print('precision: {}'.format(round(prec_2, 2)))\n", |
|||
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n", |
|||
"print('recall: {}'.format(round(rec_2, 2)))\n", |
|||
"acc_2 = (tp_2 + tn_2) / total * 100\n", |
|||
"print('accuracy: {}'.format(round(acc_2, 2)))\n", |
|||
"print()\n", |
|||
"print('Average Metrics:')\n", |
|||
"print()\n", |
|||
"print('precision: {}'.format((prec_1 + prec_2 + prec_0) / 3))\n", |
|||
"print('recall: {}'.format((rec_1 + rec_2 + rec_0) / 3))\n", |
|||
"print('accuracy: {}'.format((acc_1 + acc_2 + acc_0) / 3))" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": null, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [] |
|||
} |
|||
], |
|||
"metadata": { |
|||
"kernelspec": { |
|||
"display_name": "Python 3", |
|||
"language": "python", |
|||
"name": "python3" |
|||
}, |
|||
"language_info": { |
|||
"codemirror_mode": { |
|||
"name": "ipython", |
|||
"version": 3 |
|||
}, |
|||
"file_extension": ".py", |
|||
"mimetype": "text/x-python", |
|||
"name": "python", |
|||
"nbconvert_exporter": "python", |
|||
"pygments_lexer": "ipython3", |
|||
"version": "3.7.1" |
|||
} |
|||
}, |
|||
"nbformat": 4, |
|||
"nbformat_minor": 2 |
|||
} |
Before Width: | Height: | Size: 61 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.2 KiB |
After Width: | Height: | Size: 20 KiB |