## Model Evaluation

In [1]:
import csv
import operator
import pickle
import random

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

from MNBInteractive import MNBInteractive
from MultinomialNaiveBayes import MultinomialNaiveBayes
from NaiveBayes import NaiveBayes

In [2]:
# initialize random => reproducible sequence
random.seed(5)

# set up wider display area
pd.set_option('display.max_colwidth', -1)

In [3]:
# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
 sep='|',
 usecols=range(1,13), # drop first column 'unnamed'
 encoding='utf-8',
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')

# find current iteration/round number
m = int(df['Round'].max())
print('Last round number: {}'.format(m))
print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))
print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))

Last round number: 11
Number of manually labeled articles: 1082
Number of manually unlabeled articles: 8918


In [4]:
def show_next(index):
 ''' this method displays an article's text and an interactive slider to set its label manually
 '''
 print('News article no. {}:'.format(index))
 print()
 print('HEADLINE:')
 print(df.loc[df['Index'] == index, 'Title'])
 print()
 print('TEXT:')
 print(df.loc[df['Index'] == index, 'Text'])
 print()
 print('ESTIMATED_0:')
 print(df.loc[df['Index'] == index, 'Estimated_0'])
 print()
 print('ESTIMATED_1:')
 print(df.loc[df['Index'] == index, 'Estimated_1'])
 print()
 print('ESTIMATED_2:')
 print(df.loc[df['Index'] == index, 'Estimated_2'])
 
 def f(x):
 # save user input
 df.loc[df['Index'] == index, 'Label'] = x

 # create slider widget for labels
 interact(f, x = widgets.IntSlider(min=-1, max=2, step=1, value=df.loc[df['Index'] == index, 'Label']))
 print('0: Other/Unrelated news, 1: Merger,')
 print('2: Topics related to deals, investments and mergers')
 print('___________________________________________________________________________________________________________')
 print()
 print()

# list of article indices that will be shown next
label_next = []

## How to find a better model:

A) Multinomial Naive Bayes Algorithm

In [None]:
recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))

In [None]:
# toDo: läuft noch nicht

# series of indices of recently estimated articles 
indices_estimated = df.loc[df['Label'] != -1, 'Index'].tolist()

# annotate probability
n = 0
for row in class_probs[0]:
 index = indices_estimated[n]
 # save estimated label
 df.loc[index, 'Estimated_2'] = row[1]
 n += 1

In [None]:
print("Recall (Min): {}".format(min(recall_scores)))
print("Recall (Max): {}".format(max(recall_scores)))
print("Recall (Average): {}".format(sum(recall_scores)/10))
print()
print("Precision (Min): {}".format(min(precision_scores)))
print("Precision (Max): {}".format(max(precision_scores)))
print("Precision (Average): {}".format(sum(precision_scores)/10))

In [None]:
print('confusion matrix:')
print('###############')
zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])
zero_0
zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])
zero_1
zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])
zero_2
print('/')
one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])
one_0
one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])
one_1
one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])
one_2
print('/')

two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])
two_0
two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])
two_1
two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])
two_2

## Building three separate models:

B) One model per class: Funktioniert es besser wenn man 3 Modelle hat.
Begründung: wir sind interessiert an Klasse 1
Pro Klasse 1 Modell bauen (Hier ist das ziel das beste Modell zu finden. Dafür nehmen wir 1082 gelabelte Daten.)
3 Modelle => Ergebnis für 1 Sample: (70%, 40%, 80%) unklar => überprüfen
=> (90%, 90%, 90%) => überprüfen
liefert das bessere ambiguity Samples als oben
Stratified sample: (50 + 50 (1/2 von der anderen Klasse 1/2 der dritten Klasse))

In [5]:
labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)
labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)
labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)

In [6]:
len(labeled_pos_0)

847

In [7]:
len(labeled_pos_1)

50

In [8]:
len(labeled_pos_2)

185

In [9]:
# add three new columns for the three models, initialize with nans
df['Estimated_0'] = np.nan
df['Estimated_1'] = np.nan
df['Estimated_2'] = np.nan

In [10]:
sampling_class0_0 = labeled_pos_0.sample(n=100, random_state=5)
sampling_class0_1 = labeled_pos_1.sample(n=50, random_state=5)
sampling_class0_2 = labeled_pos_2.sample(n=50, random_state=5)

sampling_class1_0 = labeled_pos_0.sample(n=25, random_state=5)
sampling_class1_1 = sampling_class0_1
sampling_class1_2 = labeled_pos_2.sample(n=25, random_state=5)

sampling_class2_0 = labeled_pos_0.sample(n=50, random_state=5)
sampling_class2_1 = sampling_class0_1
sampling_class2_2 = labeled_pos_2.sample(n=100, random_state=5)

In [11]:
sampling_class0_complement = pd.concat([sampling_class0_1, sampling_class0_2])
sampling_class1_complement = pd.concat([sampling_class1_0, sampling_class1_2])
sampling_class2_complement = pd.concat([sampling_class2_0, sampling_class2_1])

In [12]:
# prepare for binary classification:
# pos_label = 3
sampling_class0_0['Label'] = 3
sampling_class1_1['Label'] = 3
sampling_class2_2['Label'] = 3
# neg_label = 4
sampling_class0_complement['Label'] = 4
sampling_class1_complement['Label'] = 4
sampling_class2_complement['Label'] = 4

In [13]:
sampling_class0 = pd.concat([sampling_class0_0, sampling_class0_complement]).reset_index(drop=True)
sampling_class1 = pd.concat([sampling_class1_1, sampling_class1_complement]).reset_index(drop=True)
sampling_class2 = pd.concat([sampling_class2_2, sampling_class2_complement]).reset_index(drop=True)

### Apply Naive Bayes Model to estimate all labeled articles (1082 samples):

In [22]:
train_data = sampling_class2
indices_train = train_data['Index'].tolist()
len(indices_train)

200

In [21]:
test_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_train))].reset_index(drop=True)
len(test_data)

882

In [23]:
test_data.loc[(test_data['Label'] == 0), 'Label'] = 3
test_data.loc[(test_data['Label'] == 1) | (test_data['Label'] == 2), 'Label'] = 4

In [24]:
# split training data into text and label set
# join title and text
X = train_data['Title'] + '. ' + train_data['Text']
y = train_data['Label']

In [25]:
# split testing data into text and label set
U = test_data['Title'] + '. ' + test_data['Text']
v = test_data['Label']

In [26]:
classifier = GaussianNB()

cv = CountVectorizer()

# probabilities of each class
class_probs = []

# use sklearn CountVectorizer
# fit the training data and then return the matrix
training_data = cv.fit_transform(X, y).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(U).toarray()

#fit classifier
classifier.fit(training_data, y)

#predict class
predictions_test = classifier.predict(testing_data)

class_probs = classifier.predict_proba(testing_data)

#print and store metrics
rec = recall_score(v, predictions_test, pos_label=3)
print('recall: ' + str(rec))
prec = precision_score(v, predictions_test, pos_label=3)
print('precision: ' + str(prec))

recall: 0.19949811794228356
precision: 0.803030303030303


In [None]:
class_probs[:10]

In [None]:
# series of indices of recently estimated articles 
indices_estimated_2 = test_data['Index'].tolist()

# annotate probability
n = 0
for row in class_probs:
 index = indices_estimated_2[n]
 # save estimated label
 df.loc[index, 'Estimated_2'] = row[0]
 n += 1

### Apply Naive Bayes Model (10-fold-cross validation):

In [None]:
dataset = sampling_class0

X = dataset['Title'] + '. ' + dataset['Text']
y = dataset['Label']

cv = CountVectorizer()

# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)

classifier = GaussianNB()

# metrics
recall_scores = []
precision_scores = []

# probabilities of each class (of each fold)
class_probs = []
# counts number of training samples observed in each class 
class_counts = []

# for each fold
for train, test in skf.split(X,y):
 
 # fit the training data and then return the matrix
 training_data = cv.fit_transform(X[train], y[train]).toarray()
 # transform testing data and return the matrix
 testing_data = cv.transform(X[test]).toarray()

 #fit classifier
 classifier.fit(training_data, y[train])
 #predict class
 predictions_train = classifier.predict(training_data)
 predictions_test = classifier.predict(testing_data)

 #print and store metrics
 rec = recall_score(y[test], predictions_test)
 recall_scores.append(rec)
 prec = precision_score(y[test], predictions_test)
 precision_scores.append(prec)

 class_probs.append(classifier.class_prior_)

In [None]:
# series of indices of recently estimated articles 
indices_estimated_0 = sampling_class0['Index'].tolist()

# annotate probability
n = 0
for row in class_probs:
 index = indices_estimated_0[n]
 # save estimated label
 df.loc[index, 'Estimated_0'] = row[1]
 n += 1

In [None]:
print("Recall (Min): {}".format(min(recall_scores)))
print("Recall (Max): {}".format(max(recall_scores)))
print("Recall (Average): {}".format(sum(recall_scores)/10))
print()
print("Precision (Min): {}".format(min(precision_scores)))
print("Precision (Max): {}".format(max(precision_scores)))
print("Precision (Average): {}".format(sum(precision_scores)/10))

Number of used samples:

In [None]:
indices_all_samples = set((indices_estimated_0 + indices_estimated_1) + indices_estimated_2)

In [None]:
len(indices_all_samples)

Check if there are samples where more than one class was marked with 1.

In [None]:
len(df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0)])

In [None]:
indices = df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0), 'Index'].tolist()

In [None]:
# save tri-model to csv 
df.to_csv('../data/interactive_labeling_triple_model_auf_1082.csv',
 sep='|',
 mode='w',
 encoding='utf-8',
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')

In [None]:
# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_triple_model_auf_1082.csv',
 sep='|',
 usecols=range(1,16), # drop first column 'unnamed'
 encoding='utf-8',
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')

In [None]:
for index in indices:
 show_next(index)