diff --git a/BagOfWords.py b/BagOfWords.py index f358e64..e640c78 100644 --- a/BagOfWords.py +++ b/BagOfWords.py @@ -14,6 +14,12 @@ from nltk.stem.porter import PorterStemmer class BagOfWords: + def fit_transform(X, relative_word_frequencies=True): + ''' similar to CountVectorizer's fit_transform method + ''' + vocab = BagOfWords.make_vocab(X) + return BagOfWords.make_matrix(X, vocab, relative_word_frequencies) + def extract_words(text): '''takes article as argument, removes numbers, returns list of single words, recurrences included. @@ -37,17 +43,17 @@ class BagOfWords: return words_cleaned def reduce_word_to_stem(word): - '''takes normal word as input, returns the word's word stem + '''takes normal word as input, returns the word's stem ''' stemmer = PorterStemmer() # replace word by its stem word = stemmer.stem(word) return word - def make_matrix(series, vocab): + def make_matrix(series, vocab, relative_word_frequencies): '''calculates word stem frequencies in input articles. returns matrix (DataFrame) with relative word frequencies - (0 <= values < 1) + (0 <= values < 1) or absolute word frequencies (int). 
(rows: different articles, colums: different words in vocab) ''' # create list of tuples @@ -64,8 +70,13 @@ class BagOfWords: vector.append(0) for w in words: if w == v: - # add relative word frequency - vector[i] += 1/word_count + if relative_word_frequencies: + # relative word frequency + vector[i] += 1/word_count + else: + # absolute word frequency + vector[i] += 1 + # add single vector as tuple vectors.append(tuple(vector)) df_vectors = pd.DataFrame.from_records(vectors, @@ -89,7 +100,7 @@ class BagOfWords: def set_stop_words(): '''creates list of all words that will be ignored ''' - # standard stopwords from nltk.corpus stopwords('english') + # stopwords stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'aren\'t', 'as', 'at', 'be', 'because', 'been', @@ -119,13 +130,12 @@ class BagOfWords: 'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves'] - - # add specific words - stop_words.extend(['reuters', 'also', 'monday', 'tuesday', - 'wednesday', 'thursday', 'friday']) - - # => does this make sense?: - # remove the word 'not' from stop words + + ##=> ist das sinnvoll?: + #add specific words + #stop_words.extend(['reuters', 'also', 'monday', 'tuesday', + # 'wednesday', 'thursday', 'friday']) + #remove the word 'not' from stop words #stop_words.remove('not') for i in range(len(stop_words)): diff --git a/DecisionTree.py b/DecisionTree.py index 0830b00..cb78097 100644 --- a/DecisionTree.py +++ b/DecisionTree.py @@ -9,8 +9,7 @@ holding the class labels for the training samples. 
''' import operator -from BagOfWords import BagOfWords -from CsvHandler import CsvHandler +from BagOfWords import BagOfWords import graphviz import numpy as np @@ -25,9 +24,8 @@ class DecisionTree: def make_tree(dataset): print('# starting decision tree') - print() + print('#') - # note: better results with only title, but other important words X = dataset['Title'] + ' ' + dataset['Text'] y = dataset['Label'] @@ -94,7 +92,6 @@ class DecisionTree: #print(sorted_i_w)[:20] i_w = [x[0] for x in sorted_i_w] print(i_w[:20]) - print() #print metrics of test set @@ -109,4 +106,4 @@ class DecisionTree: # print() print('# ending decision tree') - print() \ No newline at end of file + print('#') \ No newline at end of file diff --git a/NaiveBayes.py b/NaiveBayes.py index 663c4d6..d92c003 100644 --- a/NaiveBayes.py +++ b/NaiveBayes.py @@ -11,246 +11,129 @@ given the label. It considers each of these features to contribute independently to the probability that it belongs to its category, regardless of any possible correlations between these features. ''' -from BagOfWords import BagOfWords -from CsvHandler import CsvHandler -#from sklearn.feature_extraction.text import CountVectorizer -#from sklearn.feature_selection import SelectPercentile -from sklearn.metrics import recall_score, precision_score +#!! +# The multinomial Naive Bayes classifier is suitable +#for classification with discrete features (e.g., +#word counts for text classification). +#The multinomial distribution normally requires +#integer feature counts. However, in practice, +#fractional counts such as tf-idf may also work. 
+ +# => only relevant when using the custom BagOfWords (BOW) + +from BagOfWords import BagOfWords + +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_selection import SelectPercentile +from sklearn.metrics import f1_score, make_scorer from sklearn.model_selection import StratifiedKFold -#from sklearn.model_selection import train_test_split -from sklearn.naive_bayes import GaussianNB +from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline +from sklearn.naive_bayes import MultinomialNB + +# MultinomialNB is used instead of GaussianNB => OK? NOTE(review): analyze_errors still calls GaussianNB(), which needs the commented import below +#from sklearn.naive_bayes import GaussianNB class NaiveBayes: def make_naive_bayes(dataset): - '''fits naive bayes model with StratifiedKFold, - uses my BOW + '''fits naive bayes model ''' print('# starting naive bayes') - print() + print('#') - # join title and text + # split data into text and label set X = dataset['Title'] + ' ' + dataset['Text'] y = dataset['Label'] + # Bag of Words + print('# calculating bag of words') + print('#') + + # fit the training data and then return the matrix + + # TODO: why are the results so different (worse) with my own BOW? 
+ #X = BagOfWords.fit_transform(X, False) + + X = CountVectorizer().fit_transform(X).toarray() + # use stratified k-fold cross-validation as split method - skf = StratifiedKFold(n_splits = 10, shuffle=True) - - classifier = GaussianNB() - - # lists for metrics - recall_scores = [] - precision_scores = [] - f1_scores = [] - - # for each fold - n = 0 - for train, test in skf.split(X,y): - # BOW - vocab = BagOfWords.make_vocab(X[train]) - # fit the training data and then return the matrix - training_data = BagOfWords.make_matrix(X[train], vocab) - # transform testing data and return the matrix - testing_data = BagOfWords.make_matrix(X[test], vocab) - - #fit classifier - classifier.fit(training_data, y[train]) - #predict class - predictions_train = classifier.predict(training_data) - predictions_test = classifier.predict(testing_data) - - #store metrics - rec = recall_score(y[test], predictions_test) - recall_scores.append(rec) - prec = precision_score(y[train], predictions_train) - precision_scores.append(prec) - # equation for f1 score - f1_scores.append(2 * (prec * rec)/(prec + rec)) - - #print metrics of test set - print('prediction of testing set:') - print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}' - .format(min(f1_scores), max(f1_scores), - sum(f1_scores)/float(len(f1_scores)))) - print() - #print('overfit testing: prediction of training set') - #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'. 
- #format(min(f1_scores_train), max(f1_scores_train), - #sum(f1_scores_train)/float(len(f1_scores_train)))) - #print() + skf = StratifiedKFold(n_splits = 10, shuffle=True) + # use only most important features + selector = SelectPercentile() + + pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())]) + + grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100], + 'NB__alpha': [0.00000001, 0.0000001, + 0.000001, 0.00001, + 0.0001, 0.001, 0.01, + 0.1]}, + cv=skf, + scoring=make_scorer(f1_score)) + + print('# fit classifier') + print('#') + + grid.fit(X,y) + + # DataFrame of results + df_results = grid.cv_results_ + + # print results + ###################### + print('RESULTS:') + print('#') + print('mean_test_score:') + print(df_results['mean_test_score']) + print('#') + print('mean of means:') + print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score'])) + print('#') + print('best score:') + print(grid.best_score_) + print('#') + print('best parameters set found on development set:') + print(grid.best_params_) + print('#') + print('# ending naive bayes') + print('#') + + def analyze_errors(dataset): + '''calculates resubstitution error + shows indices of false classified articles + uses Gaussian Bayes with train test split + ''' + X_train_test = dataset['Title'] + ' ' + dataset['Text'] + y_train_test = dataset['Label'] + + count_vector = CountVectorizer() + # fit the training data and then return the matrix + training_data = count_vector.fit_transform(X_train_test).toarray() + # transform testing data and return the matrix + testing_data = count_vector.transform(X_train_test).toarray() + + # Naive Bayes + classifier = GaussianNB() + # fit classifier + classifier.fit(training_data, y_train_test) + + # Predict class + predictions = classifier.predict(testing_data) + print('Errors at index:') print() - - # def make_naive_bayes_selectpercentile(dataset): - # '''fits naive bayes model with StratifiedKFold, uses my BOW - # feature 
selection: select 0.25-percentile - # ''' - - # print('# starting naive bayes') - # print() - - # # alternative: use only articles' header => may give better results - # X = dataset['Title'] + ' ' + dataset['Text'] - # y = dataset['Label'] - - # # use stratified k-fold cross-validation as split method - # skf = StratifiedKFold(n_splits = 10, shuffle=True) - - # classifier = GaussianNB() - - # # lists for metrics - # recall_scores = [] - # precision_scores = [] - # f1_scores = [] - - # # for each fold - # n = 0 - # for train, test in skf.split(X,y): - # # BOW - # vocab = BagOfWords.make_vocab(X[train]) - # # fit the training data and then return the matrix - # training_data = BagOfWords.make_matrix(X[train], vocab) - # # transform testing data and return the matrix - # testing_data = BagOfWords.make_matrix(X[test], vocab) - - # # apply select percentile - # selector = SelectPercentile(percentile=25) - # selector.fit(training_data, y[train]) - - # training_data_r = selector.transform(training_data) - # testing_data_r = selector.transform(testing_data) - - # #fit classifier - # classifier.fit(training_data_r, y[train]) - # #predict class - # predictions_train = classifier.predict(training_data_r) - # predictions_test = classifier.predict(testing_data_r) - - # #store metrics - # rec = recall_score(y[test], predictions_test) - # recall_scores.append(rec) - # prec = precision_score(y[train], predictions_train) - # precision_scores.append(prec) - # # equation for f1 score - # f1_scores.append(2 * (prec * rec)/(prec + rec)) - - # #print metrics of test set - # print('prediction of testing set:') - # print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}' - # .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores)))) - # print() - # #print('overfit testing: prediction of training set') - # #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'. 
- # #format(min(f1_scores_train), max(f1_scores_train), - # sum(f1_scores_train)/float(len(f1_scores_train)))) - # #print() - - # print('# ending naive bayes') - # print() - - - # def make_naive_bayes_CV(dataset): - # '''alternative: uses CountVectorizer (faster) - # ''' - # # alternative: use only articles' header => may give better results - # X = dataset['Title'] + '.' + dataset['Text'] + '.' - # y = dataset['Label'] - - # # use stratified k-fold cross-validation as split method - # skf = StratifiedKFold(n_splits = 10, shuffle=True) - - # count_vector = CountVectorizer() - - # classifier = GaussianNB() - - # # lists for metrics predicted on test/train set - # f1_scores, f1_scores_train = [] - - # # for each fold (10 times) - # # fold number - # n = 0 - # for train, test in skf.split(X,y): - - # # fit the training data and then return the matrix - # training_data = count_vector.fit_transform(X[train], y[train]).toarray() - # # transform testing data and return the matrix - # testing_data = count_vector.transform(X[test]).toarray() - - # # apply select percentile - # selector = SelectPercentile(percentile=25) - # selector.fit(training_data, y[train]) - - # training_data_r = selector.transform(training_data) - # testing_data_r = selector.transform(testing_data) - - # #fit classifier - # classifier.fit(training_data_r, y[train]) - - # #predict class - # predictions_train = classifier.predict(training_data_r) - # predictions_test = classifier.predict(testing_data_r) - - # #store metrics predicted on test set - # f1_scores.append(f1_score(y[test], predictions_test)) - - # #store metrics predicted on train set - # f1_scores_train.append(f1_score(y[train], predictions_train)) - - # #print metrics of test set - # print('--------------------') - # print('prediction of testing set:') - # print('F1 score: min = {}, max = {}, average = {}' - # .format(min(f1_scores), max(f1_scores), - # sum(f1_scores)/float(len(f1_scores)))) - - # print() - # print('prediction of training 
set:') - # print('F1 score: min = {}, max = {}, average = {}' - # .format(min(f1_scores_train), max(f1_scores_train), - # sum(f1_scores_train)/float(len(f1_scores_train)))) - # print() - - # def analyze_errors_cv(dataset): - # '''calculates resubstitution error - # shows indices of false classified articles - # uses Gaussian Bayes with train test split - # ''' - - # X_train_test = dataset['Text'] - # y_train_test = dataset['Label'] - - # count_vector = CountVectorizer() - - # # fit the training data and then return the matrix - # training_data = count_vector.fit_transform(X_train_test).toarray() - - # # transform testing data and return the matrix - # testing_data = count_vector.transform(X_train_test).toarray() - - # # Naive Bayes - # classifier = GaussianNB() - - # # fit classifier - # classifier.fit(training_data, y_train_test) - - # # Predict class - # predictions = classifier.predict(testing_data) - - # print() - # print('errors at index:') - # n = 0 - # for i in range(len(y_train_test)): - # if y_train_test[i] != predictions[i]: - # n += 1 - # print('error no.{}'.format(n)) - # print('prediction at index {} is: {}, but actual is: {}' - # .format(i, predictions[i], y_train_test[i])) - # print(X_train_test[i]) - # print(y_train_test[i]) - # print() - - # print() - # #print metrics - # print('F1 score: ', format(f1_score(y_train_test, predictions))) \ No newline at end of file + n = 0 + for i in range(len(y_train_test)): + if y_train_test[i] != predictions[i]: + n += 1 + print('error no.{}'.format(n)) + print('prediction at index {} is: {}, but actual is: {}' + .format(i, predictions[i], y_train_test[i])) + print(X_train_test[i]) + print(y_train_test[i]) + print() + #print metrics + print('F1 score: ', format(f1_score(y_train_test, predictions))) \ No newline at end of file diff --git a/SVM.py b/SVM.py new file mode 100644 index 0000000..d958114 --- /dev/null +++ b/SVM.py @@ -0,0 +1,87 @@ +''' +Support Vector Machines (SVM) Classifier 
+======================================== + +The SVM training algorithm builds a model from the training data that assigns +the test samples to one category ('merger' or 'not merger'), +making it a non-probabilistic binary linear classifier. +An SVM model is a representation of the samples as points in space, +mapped so that the examples of the separate categories are divided +by a clear gap that is as wide as possible. +New samples are then mapped into that same space and predicted +to belong to a category based on which side of the gap they fall. +''' + +from BagOfWords import BagOfWords + +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_selection import SelectPercentile +from sklearn.metrics import f1_score, make_scorer +from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline +from sklearn.svm import SVC + +class SVM: + + def make_svm(dataset): + + print('# starting SVM') + print('#') + + # split data into text and label set + + # articles' text (title + text) + X = dataset['Title'] + ' ' + dataset['Text'] + # articles' labels + y = dataset['Label'] + + # Bag of Words + print('# calculating bag of words') + print('#') + # fit the training data and then return the matrix + #X = BagOfWords.fit_transform(X) + X = CountVectorizer().fit_transform(X).toarray() + + # use stratified k-fold cross-validation as split method + skf = StratifiedKFold(n_splits = 10, shuffle=True) + + # use only most important features + selector = SelectPercentile() + + pipeline = Pipeline([('perc', selector), ('SVC', SVC())]) + + grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100], + 'SVC__kernel': ['linear','poly','rbf','sigmoid'], + 'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1], + 'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]}, + cv=skf, + scoring=make_scorer(f1_score)) + + print('# fit classifier') + print('#') + + grid.fit(X,y) + + # DataFrame of results + 
df_results = grid.cv_results_ + + # print results + ###################### + print('RESULTS:') + print('') + print('mean_test_score:') + print(df_results['mean_test_score']) + print('') + print('mean of means:') + print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score'])) + print('') + print('best score:') + print(grid.best_score_) + print() + print('best parameters set found on development set:') + print(grid.best_params_) + print() + + print('# ending SVM') + print('#') \ No newline at end of file diff --git a/Starter.py b/Starter.py index a36b112..432064f 100644 --- a/Starter.py +++ b/Starter.py @@ -10,19 +10,23 @@ from CsvHandler import CsvHandler from DecisionTree import DecisionTree from NaiveBayes import NaiveBayes #from Requester import Requester -#from SVM import SVM +from SVM import SVM print('# starting program') -print() +print('#') +# only if new unlabeled(!) data set is required: # Requester.save_articles_from_webhoseio() + file = 'classification_labelled_corrected.csv' # read csv file +print('# reading dataset') +print('#') dataset = CsvHandler.read_csv(file) # DecisionTree.make_tree(dataset) NaiveBayes.make_naive_bayes(dataset) -# SVM.make_svm(dataset) +SVM.make_svm(dataset) print('# ending program') \ No newline at end of file