"""
Decision Tree Classifier
========================
Decision Tree Classifier takes as input two arrays:
array X of size [n_samples, n_features], holding the training samples,
and array y of integer values, size [n_samples],
holding the class labels for the training samples.
"""

from BagOfWords import BagOfWords

import csv
import operator

import graphviz
import numpy as np
import pandas as pd
from sklearn import tree
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold


def _rank_words(importance_by_word, limit=20):
    """Return up to `limit` words, ordered from highest to lowest importance."""
    ranked = sorted(importance_by_word.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    return [word for word, _ in ranked[:limit]]


def _format_f1_summary(scores):
    """Return the min/max/average report line for a non-empty list of scores."""
    # Each placeholder needs its own positional index; reusing {0} would
    # print the minimum three times.
    return 'F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.format(
        min(scores), max(scores), sum(scores) / float(len(scores)))


class DecisionTree:
    """Namespace for the decision tree experiment on the labelled articles."""

    @staticmethod
    def make_tree(dataset):
        """Cross-validate a decision tree and print its F1 scores.

        The text of each sample is the 'Title' and 'Text' columns joined by
        a space; the target is the 'Label' column. The data is split with a
        shuffled, stratified 10-fold scheme. For every fold a bag-of-words
        vocabulary is built from the training part only, the classifier is
        fitted, and the F1 score on the held-out part is recorded.

        Prints the 20 words with the highest importance (summed over the
        folds in which they ranked among the top 50 features) followed by
        the min/max/average F1 score over all folds.

        Args:
            dataset: pandas DataFrame with the columns 'Title', 'Text' and
                'Label'.

        Returns:
            None. All results are written to stdout.
        """
        print('# fitting model')
        print('# ...')

        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # F1 score on the held-out part of each fold
        f1_scores = []

        classifier = tree.DecisionTreeClassifier()

        # word -> feature importance summed over all folds
        important_words = {}

        for train, test in skf.split(X, y):
            # split() yields positional indices, so select rows by position;
            # label-based lookup is only equivalent for a default RangeIndex.
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y.iloc[train], y.iloc[test]

            # The vocabulary comes from the training part only and is reused
            # for the test part so both matrices share the same columns.
            vocab = BagOfWords.make_vocab(X_train)
            training_data = BagOfWords.make_matrix(X_train, vocab)
            testing_data = BagOfWords.make_matrix(X_test, vocab)

            # NOTE: an alternative pipeline (CountVectorizer followed by
            # SelectPercentile(percentile=25)) was previously tried here.

            classifier.fit(training_data, y_train)
            predictions_test = classifier.predict(testing_data)
            f1_scores.append(f1_score(y_test, predictions_test))

            # accumulate the importances of this fold's 50 strongest features
            feature_importances = np.array(classifier.feature_importances_)
            important_indices = feature_importances.argsort()[-50:][::-1]
            for i in important_indices:
                word = vocab[i]
                important_words[word] = (important_words.get(word, 0)
                                         + feature_importances[i])

        print('20 most important words in training set:')
        print()
        print(_rank_words(important_words, 20))
        print()

        # print metrics of test set
        print('prediction of testing set:')
        print(_format_f1_summary(f1_scores))
        print()


if __name__ == '__main__':

    print('# starting decision tree')
    print('# ...')

    file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')

    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)

    # make_tree lives on the class; an unqualified make_tree is not a
    # module-level name.
    DecisionTree.make_tree(data)

    print('# ending decision tree')