'''
Decision Tree Classifier
========================

Decision Tree Classifier takes as input two arrays:
array X of size [n_samples, n_features], holding the training samples,
and array y of integer values, size [n_samples],
holding the class labels for the training samples.
'''
import operator

from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
import graphviz
import numpy as np
from sklearn import tree
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold


class DecisionTree():
    '''Cross-validated decision tree evaluation on a labelled text dataset.'''

    @staticmethod
    def make_tree(dataset):
        '''Evaluate a decision tree with 10-fold stratified cross-validation.

        For every fold a bag-of-words vocabulary is built from the training
        texts only, both splits are turned into matrices with that
        vocabulary, a ``DecisionTreeClassifier`` is fitted and the F1 score
        is computed on the test and on the training split. The 50 largest
        feature importances of every fold are summed per word.

        Prints the 20 words with the highest summed importance and the
        minimum, maximum and average test F1 score. Returns nothing.

        Args:
            dataset: mapping-like object indexed with the keys 'Title',
                'Text' and 'Label' (presumably a pandas DataFrame -- verify
                against the caller).
        '''
        print('# starting decision tree')
        print()

        # note: better results with only title, but other important words
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        #count_vector = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # lists for metrics predicted on test/train set
        f1_scores = []
        # only consumed by the (disabled) overfit report at the end
        f1_scores_train = []

        classifier = tree.DecisionTreeClassifier()

        # summed importance of the most important words over all folds
        important_words = {}

        # for each fold
        # NOTE(review): X[train] / y[train] index with the positional arrays
        # returned by split(); this presumably relies on a default integer
        # index -- confirm.
        for train, test in skf.split(X, y):
            # BOW: the vocabulary is built from the training split only
            vocab = BagOfWords.make_vocab(X[train])
            # matrix of the training data
            training_data = BagOfWords.make_matrix(X[train], vocab)
            # matrix of the testing data, using the training vocabulary
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            # alternative with CountVectorizer / SelectPercentile:
            # #fit the training data and then return the matrix
            # training_data = count_vector.fit_transform(X[train],
            #                                            y[train]).toarray()
            # #transform testing data and return the matrix
            # testing_data = count_vector.transform(X[test]).toarray()

            # # apply select percentile
            # selector = SelectPercentile(percentile=25)
            # selector.fit(training_data, y[train])
            # training_data_r = selector.transform(training_data)
            # testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data, y[train])

            # predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            # store metrics predicted on test/train set
            f1_scores.append(f1_score(y[test], predictions_test))
            f1_scores_train.append(f1_score(y[train], predictions_train))

            # search for important features: indices of the 50 largest
            # importances of this fold, largest first
            feature_importances = np.array(classifier.feature_importances_)
            important_indices = feature_importances.argsort()[-50:][::-1]

            for i in important_indices:
                word = vocab[i]
                important_words[word] = (important_words.get(word, 0)
                                         + feature_importances[i])

        print('20 most important words in training set:')
        print()
        # sort descending so that the slice below really yields the words
        # with the highest summed importance
        sorted_i_w = sorted(important_words.items(),
                            key=operator.itemgetter(1), reverse=True)
        i_w = [x[0] for x in sorted_i_w]
        print(i_w[:20])
        print()

        # print metrics of test set
        print('prediction of testing set:')
        # one placeholder per value; '{0:.2f}' three times would print the
        # minimum for all three fields
        print('F1 score: min = {:.2f}, max = {:.2f}, average = {:.2f}'.
              format(min(f1_scores), max(f1_scores),
                     sum(f1_scores) / len(f1_scores)))
        print()

        # print('overfit testing: prediction of training set')
        # print('F1 score: min = {}, max = {}, average = {}'.
        #       format(min(f1_scores_train), max(f1_scores_train),
        #              sum(f1_scores_train)/float(len(f1_scores_train))))
        # print()

        print('# ending decision tree')
        print()