# thesis-anne/DecisionTree.py


'''
Decision Tree Classifier
========================

The Decision Tree Classifier takes two arrays as input: an array X of shape
[n_samples, n_features] holding the training samples, and an array y of
integer values of shape [n_samples] holding the class labels of the
training samples.

This script builds bag-of-words features from the Title and Text columns of
a labelled CSV file and evaluates the classifier with stratified 10-fold
cross-validation.
'''
from BagOfWords import BagOfWords

import csv
import operator

import numpy as np
import pandas as pd

from sklearn import tree
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
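
# assumed interface of the local BagOfWords module, inferred from its use
# in make_tree below (a sketch of the contract, not the implementation):
#   BagOfWords.make_vocab(texts)         -> list of vocabulary words
#   BagOfWords.make_matrix(texts, vocab) -> document-term matrix of shape
#                                           [len(texts), len(vocab)]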


class DecisionTree:

    # make_tree uses no instance state, so it is exposed as a staticmethod
    @staticmethod
    def make_tree(dataset):
        print('# fitting model')
        print('# ...')

        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        #count_vector = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)
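        # a StratifiedKFold split preserves the class distribution of y in
        # every fold; with shuffle=True and no fixed random_state the folds
        # (and thus the reported scores) vary from run to run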

        # lists for metrics predicted on test/train set
        f1_scores = []
        f1_scores_train = []

        classifier = tree.DecisionTreeClassifier()

        # dict of most important words of each fold
        important_words = {}

        # for each fold
        for train, test in skf.split(X, y):
            # BOW
            vocab = BagOfWords.make_vocab(X[train])
            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(X[train], vocab)
            # transform testing data and return the matrix
            testing_data = BagOfWords.make_matrix(X[test], vocab)
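            # the vocabulary is built from the training fold only, so no
            # information from the test fold leaks into the features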

            # # fit the training data and then return the matrix
            # training_data = count_vector.fit_transform(X[train], y[train]).toarray()
            # # transform testing data and return the matrix
            # testing_data = count_vector.transform(X[test]).toarray()

            # # apply select percentile
            # selector = SelectPercentile(percentile=25)
            # selector.fit(training_data, y[train])
            # training_data_r = selector.transform(training_data)
            # testing_data_r = selector.transform(testing_data)
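            # (the commented-out lines above are an alternative pipeline:
            # sklearn's CountVectorizer for the matrices plus SelectPercentile,
            # which by default keeps only the 25% of features with the
            # highest ANOVA F-score)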

            # fit classifier
            classifier.fit(training_data, y[train])

            # predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            # store metrics predicted on test/train set
            f1_scores.append(f1_score(y[test], predictions_test))
            f1_scores_train.append(f1_score(y[train], predictions_train))
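            # note: f1_score uses binary averaging by default, i.e. it
            # scores only the positive class and assumes 0/1 labels in y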

            # search for important features
            feature_importances = np.array(classifier.feature_importances_)
            important_indices = feature_importances.argsort()[-50:][::-1]
            for i in important_indices:
                if vocab[i] in important_words:
                    important_words[vocab[i]] += feature_importances[i]
                else:
                    important_words[vocab[i]] = feature_importances[i]
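            # feature_importances_ is the impurity-based (Gini) importance
            # of each vocabulary word in this fold's tree; accumulating it
            # across folds favours words that matter in many folds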

        print('20 most important words in training set:')
        print()

        # sort by accumulated importance, most important word first
        sorted_i_w = sorted(important_words.items(),
                            key=operator.itemgetter(1), reverse=True)
        #print(sorted_i_w[:20])
        i_w = [x[0] for x in sorted_i_w]
        print(i_w[:20])
        print()

        # print metrics of test set
        print('prediction of testing set:')
        print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'
              .format(min(f1_scores), max(f1_scores),
                      sum(f1_scores) / float(len(f1_scores))))
        print()

        # print('overfit testing: prediction of training set')
        # print('F1 score: min = {}, max = {}, average = {}'
        #       .format(min(f1_scores_train), max(f1_scores_train),
        #               sum(f1_scores_train) / float(len(f1_scores_train))))
        # print()
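        # (re-enabling the block above and comparing training-set scores
        # with the test-set scores gives a quick overfitting check; an
        # unpruned DecisionTreeClassifier usually fits the training folds
        # almost perfectly)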


if __name__ == '__main__':
    print('# starting decision tree')
    print('# ...')

    file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')
    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)
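    # the file is expected to be '|'-separated and to contain at least the
    # Title, Text and Label columns that make_tree accesses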
    DecisionTree.make_tree(data)

    print('# ending decision tree')