149 lines
5.1 KiB
Python
149 lines
5.1 KiB
Python
'''
Decision Tree Classifier
========================

The Decision Tree Classifier takes two arrays as input:
an array X of size [n_samples, n_features], holding the training samples,
and an array y of integer values, size [n_samples],
holding the class labels for the training samples.
'''
|
|
# TODO: replace old dataset!
# NOTE: the CountVectorizer code path does not work yet
|
|
|
|
from BagOfWords import BagOfWords
|
|
|
|
import csv
|
|
import operator
|
|
|
|
import graphviz
|
|
import numpy as np
|
|
import pandas as pd
|
|
from sklearn import tree
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
from sklearn.feature_selection import SelectPercentile
|
|
from sklearn.metrics import f1_score
|
|
from sklearn.model_selection import StratifiedKFold
|
|
|
|
class DecisionTree:
    '''Decision tree text classifier evaluated with stratified k-fold
    cross-validation.

    The class is used as a namespace only; all methods are static.
    '''

    @staticmethod
    def _format_f1_summary(scores):
        '''Return a printable line with min, max and mean of *scores*.

        Raises ValueError if *scores* is empty.
        '''
        if not scores:
            raise ValueError('no F1 scores to summarise')
        # One positional field per value; reusing index 0 would print the
        # minimum three times.
        return 'F1 score: min = {:.2f}, max = {:.2f}, average = {:.2f}'.format(
            min(scores), max(scores), sum(scores) / float(len(scores)))

    @staticmethod
    def _top_words(important_words, k=20):
        '''Return the *k* words with the highest accumulated importance,
        most important first.
        '''
        ranked = sorted(important_words.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
        return [word for word, _ in ranked[:k]]

    @staticmethod
    def make_tree(dataset, sklearn_cv=False, stemming=False, percentile=100,
                  report_train=False):
        '''Fit and evaluate a decision tree on *dataset* with 10 stratified
        folds and print the results.

        Parameters
        ----------
        dataset : mapping with the columns 'Title', 'Text' and 'Label'
            (a pandas DataFrame in the script below).
        sklearn_cv : if True, vectorize with sklearn's CountVectorizer,
            otherwise with the project's BagOfWords implementation.
        stemming : forwarded to the BagOfWords helpers.
        percentile : percentile passed to SelectPercentile.
        report_train : if True, additionally print the F1 summary measured
            on the training folds (overfitting check).

        Prints the 20 words with the highest feature importance summed over
        all folds and the min/max/average F1 score on the test folds.
        Returns None.
        '''
        print('# fitting model')
        print('# ...')

        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        cv = CountVectorizer() if sklearn_cv else None

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # metrics predicted on test/train set, one entry per fold
        f1_scores = []
        f1_scores_train = []

        classifier = tree.DecisionTreeClassifier()

        # word -> feature importance summed over all folds
        important_words = {}

        for n, (train, test) in enumerate(skf.split(X, y), start=1):
            print('# split no. ' + str(n))

            # split() yields positional indices, so select by position
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y.iloc[train], y.iloc[test]

            if sklearn_cv:
                # use sklearn CountVectorizer
                # fit the training data and then return the matrix
                training_data = cv.fit_transform(X_train, y_train).toarray()
                # transform testing data and return the matrix
                testing_data = cv.transform(X_test).toarray()
                # column i of the matrices corresponds to vocab[i]
                if hasattr(cv, 'get_feature_names_out'):
                    vocab = list(cv.get_feature_names_out())
                else:
                    # older scikit-learn releases
                    vocab = list(cv.get_feature_names())
            else:
                # use my own BagOfWords python implementation
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X_train,
                                                               stemming)
                # NOTE(review): assumes vocab is an indexable sequence in
                # matrix column order - confirm in BagOfWords
                vocab = BagOfWords.make_vocab(extracted_words, stemming)
                print(vocab)

                # fit the training data and then return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq,
                                                       stemming)
                # transform testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X_test,
                                                               stemming)
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq,
                                                      stemming)

            # apply select percentile
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y_train)

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # The classifier only sees the selected columns, so its feature
            # indices must be mapped back through the selector's support.
            kept_vocab = [vocab[j]
                          for j in selector.get_support(indices=True)]

            # fit classifier
            classifier.fit(training_data_r, y_train)

            # predict class and store metrics
            predictions_test = classifier.predict(testing_data_r)
            f1_scores.append(f1_score(y_test, predictions_test))
            if report_train:
                predictions_train = classifier.predict(training_data_r)
                f1_scores_train.append(f1_score(y_train, predictions_train))

            # search for important features: 50 highest, descending
            feature_importances = np.asarray(classifier.feature_importances_)
            important_indices = feature_importances.argsort()[-50:][::-1]
            print(important_indices)

            for i in important_indices:
                word = kept_vocab[i]
                important_words[word] = (important_words.get(word, 0.0)
                                         + feature_importances[i])

        print('20 most important words in training set:')
        print()
        print(DecisionTree._top_words(important_words, 20))
        print()

        # print metrics of test set
        print('prediction of testing set:')
        print(DecisionTree._format_f1_summary(f1_scores))
        print()

        if report_train:
            print('overfit testing: prediction of training set')
            print(DecisionTree._format_f1_summary(f1_scores_train))
            print()
|
|
|
|
if __name__ == '__main__':
    # Script entry point: load the labelled dataset and run the decision
    # tree evaluation on it.
    # Local import: only this entry point needs path handling.
    import os

    print('# starting decision tree')
    print('# ...')

    # Build the path from its components so that it resolves on any
    # platform, not only where '\\' is the path separator.
    dataset_path = os.path.join('..', 'data',
                                'classification_labelled_corrected.csv')

    # read csv file
    print('# reading dataset')
    print('# ...')

    data = pd.read_csv(dataset_path,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)

    # make_tree lives in the DecisionTree namespace; a bare make_tree(...)
    # is not defined at module level.
    DecisionTree.make_tree(data)

    print('# ending decision tree')