# thesis-anne/DecisionTree.py

'''
Decision Tree Classifier
========================
The Decision Tree Classifier takes two arrays as input:
an array X of shape [n_samples, n_features], holding the training samples,
and an array y of integer values with shape [n_samples],
holding the class labels of the training samples.
'''
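
# Illustrative shapes only (not from the thesis data): X = [[0, 1], [1, 0],
# [1, 1]] with labels y = [0, 1, 1] would be a valid input pair for
# tree.DecisionTreeClassifier.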
import operator

import graphviz
import numpy as np
from sklearn import tree
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

from BagOfWords import BagOfWords
from CsvHandler import CsvHandler


class DecisionTree:

    @staticmethod
    def make_tree(dataset):
        print('# starting decision tree')
        print()

        # note: the title alone gives better results, but the text
        # contributes other important words
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        # count_vector = CountVectorizer()

        # use stratified k-fold cross-validation as the split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # lists for the metrics predicted on the test/training set
        f1_scores = []
        f1_scores_train = []

        classifier = tree.DecisionTreeClassifier()

        # dict of the most important words of each fold
        important_words = {}

        # for each fold
        for train, test in skf.split(X, y):
            # split() yields positional indices, so index the pandas
            # Series with .iloc
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y.iloc[train], y.iloc[test]

            # bag of words: build the vocabulary from the training fold only
            vocab = BagOfWords.make_vocab(X_train)
            # fit the training data and return the term matrix
            training_data = BagOfWords.make_matrix(X_train, vocab)
            # transform the testing data and return the term matrix
            testing_data = BagOfWords.make_matrix(X_test, vocab)
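            # assumption: BagOfWords.make_matrix() returns an array of shape
            # [n_samples, len(vocab)] whose columns follow the order of vocab,
            # so column i corresponds to vocab[i]; the feature-importance
            # lookup below relies on this.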
            # # alternative: CountVectorizer instead of BagOfWords
            # # fit the training data and then return the matrix
            # training_data = count_vector.fit_transform(X_train, y_train).toarray()
            # # transform the testing data and return the matrix
            # testing_data = count_vector.transform(X_test).toarray()

            # # apply SelectPercentile to keep the top 25% of features
            # selector = SelectPercentile(percentile=25)
            # selector.fit(training_data, y_train)
            # training_data_r = selector.transform(training_data)
            # testing_data_r = selector.transform(testing_data)

            # fit the classifier
            classifier.fit(training_data, y_train)

            # predict the classes
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            # store the metrics predicted on the test/training set
            f1_scores.append(f1_score(y_test, predictions_test))
            f1_scores_train.append(f1_score(y_train, predictions_train))

            # search for important features: feature_importances_ holds the
            # normalized impurity decrease per feature; accumulate the
            # 50 highest-ranked words of each fold
            feature_importances = np.array(classifier.feature_importances_)
            important_indices = feature_importances.argsort()[-50:][::-1]
            for i in important_indices:
                if vocab[i] in important_words:
                    important_words[vocab[i]] += feature_importances[i]
                else:
                    important_words[vocab[i]] = feature_importances[i]
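
        # graphviz is imported above but unused in this excerpt; a minimal
        # sketch for rendering the tree fitted on the last fold, assuming
        # vocab lists one feature name per matrix column:
        # dot_data = tree.export_graphviz(classifier, out_file=None,
        #                                 feature_names=vocab)
        # graphviz.Source(dot_data).render('decision_tree')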

        # sort descending so the first entries are the most important words
        print('20 most important words in the training set:')
        print()
        sorted_i_w = sorted(important_words.items(),
                            key=operator.itemgetter(1), reverse=True)
        # print(sorted_i_w[:20])
        i_w = [x[0] for x in sorted_i_w]
        print(i_w[:20])
        print()

        # print the metrics of the test set
        print('prediction of the testing set:')
        print('F1 score: min = {:.2f}, max = {:.2f}, average = {:.2f}'
              .format(min(f1_scores), max(f1_scores),
                      sum(f1_scores) / len(f1_scores)))
        print()

        # print('overfit testing: prediction of the training set')
        # print('F1 score: min = {:.2f}, max = {:.2f}, average = {:.2f}'
        #       .format(min(f1_scores_train), max(f1_scores_train),
        #               sum(f1_scores_train) / len(f1_scores_train)))
        # print()

        print('# ending decision tree')
        print()
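

# Hypothetical usage sketch, not part of the original module: assumes the
# dataset is a pandas DataFrame with 'Title', 'Text' and 'Label' columns
# (e.g. loaded via CsvHandler, whose actual interface may differ); the CSV
# file name below is illustrative only.
if __name__ == '__main__':
    import pandas as pd

    dataset = pd.read_csv('articles.csv')
    DecisionTree.make_tree(dataset)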