diff --git a/BagOfWords.py b/BagOfWords.py index b98bc6f..069d6ae 100644 --- a/BagOfWords.py +++ b/BagOfWords.py @@ -9,7 +9,7 @@ vocabulary. As the multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may -also work. => considered by 'relative_word_frequencies' as parameter. +also work => considered by 'rel_freq'(relative word frequencies) as parameter. ''' from collections import OrderedDict import csv @@ -21,11 +21,14 @@ from nltk.stem.porter import PorterStemmer class BagOfWords: - def fit_transform(X, relative_word_frequencies=True): + def fit_transform(corpus, rel_freq=True, stemming=True): ''' similar to CountVectorizer's fit_transform method ''' - vocab = BagOfWords.make_vocab(X) - return BagOfWords.make_matrix(X, vocab, relative_word_frequencies) + extracted_words = BagOfWords.extract_all_words(corpus, stemming) + vocab = BagOfWords.make_vocab(extracted_words, stemming) + matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, + stemming) + return matrix def extract_words(text, stemming=True): '''takes article as argument, removes numbers, @@ -46,52 +49,25 @@ class BagOfWords: if stemming: # reduce word to its stem word = stemmer.stem(word) + # filter out spam chars + word = word.replace('â', '').replace('œ', '')\ + .replace('ã', '') words_cleaned.append(word) return words_cleaned - # def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True): - # '''calculates word stem frequencies in input articles. returns - # document term matrix(DataFrame) with relative word frequencies - # (0 <= values < 1) if relative_word_frequencies=True or absolute - # word frequencies (int) if relative_word_frequencies=False. - # (rows: different articles, colums: different words in vocab) - # returns matrix as DataFrame - # ''' - # print('# BOW: calculating matrix...') - # print() - # # create list of tuples - # vectors = [] - # # for every text in series - # for i in range(len(series)): - # # extract text of single article - # text = series.iloc[i] - # # extract its words - # words = BagOfWords.extract_words(text, stemming) - # # count words in single article - # word_count = len(words) - # vector = [] - # for i, v in enumerate(vocab): - # vector.append(0) - # for w in words: - # if w == v: - # if relative_word_frequencies: - # # relative word frequency - # vector[i] += 1/word_count - # else: - # # absolute word frequency - # vector[i] += 1 + def extract_all_words(corpus, stemming=True): + '''param: all articles of corpus + returns list of lists of all extracted words, one row per article + ''' + extracted_words = [] + print('# extracting all words from articles...') + print() + for text in corpus: + extracted_words.append(BagOfWords.extract_words(text, stemming)) - # # !!! hier passiert immer der MemoryError: !!! + return extracted_words - # # add single vector as tuple - # vectors.append(tuple(vector)) - # df_vectors = pd.DataFrame.from_records(vectors, - # index=None, - # #header=vocab, - # columns=vocab) - # return df_vectors - - def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True): + def make_matrix(extracted_words, vocab, rel_freq=True, stemming=True): '''calculates word stem frequencies in input articles. 
returns document term matrix(DataFrame) with relative word frequencies (0 <= values < 1) if relative_word_frequencies=True or absolute @@ -101,28 +77,38 @@ class BagOfWords: ''' print('# BOW: calculating matrix...') print() + + # total number of words in bag of words + word_count = 0 + print('# counting number of features in corpus...') + print() + for list in extracted_words: + word_count += len(list) + + # number of articles + n_articles = len(extracted_words) + # number of words in vocab + l_vocab = len(vocab) + # create zero-filled dataframe - array = np.zeros(shape=(len(series),len(vocab))) + array = np.zeros(shape=(n_articles, l_vocab)) df_matrix = pd.DataFrame(array, columns=vocab) + print('# calculating frequencies...') + print() + # for every text in series - for i in range(len(series)): + for i in range(len(extracted_words)): - # extract text of single article - text = series.iloc[i] + # extract words of single article + words = extracted_words[i] - # extract its words - words = BagOfWords.extract_words(text, stemming) - # count words in article - word_count = len(words) - - # for every word in global vocab for v in vocab: # for every word in article for w in words: # find right position if w == v: - if relative_word_frequencies: + if rel_freq: # relative word frequency df_matrix.loc[i][v] += 1/word_count else: @@ -131,18 +117,22 @@ class BagOfWords: return df_matrix - def make_vocab(series, stemming=True): - '''adds words of input articles to a global vocabulary. - input: dataframe of all articles, return value: list of words + def make_vocab(extracted_words, stemming=True): + '''adds all words to a global vocabulary. + input: list of lists of all extracted words, returns: list of words ''' print('# BOW: making vocabulary of data set...') print() vocab = set() # for every article's text - for text in series: - # add single article's text to total vocabulary - vocab |= set(BagOfWords.extract_words(text, stemming)) - return vocab + for e_list in extracted_words: + for word in e_list: + # add every single word to vocabulary + vocab.add(word) + print('# vocabulary consists of {} features.'.format(len(vocab))) + print() + # transform set to list + return list(vocab) def set_stop_words(stemming=True): '''creates list of all words that will be ignored @@ -179,7 +169,7 @@ class BagOfWords: 'yourselves'] #add unwanted terms - stop_words.extend(['reuters', 'bloomberg', 'cnn', 'n', 'l', 'â', + stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l', 'file', 'photo', 'min', 'read', 'staff', 'left', 'right', 'updated', 'minutes', 'brief', 'editing', 'reporting', 'ago', 'also', 'would', 'could', @@ -202,20 +192,23 @@ class BagOfWords: # transform list to set to eliminate duplicates return set(stop_words) - def make_dict_common_words(texts, rel_freq=True, stemming=True, n=200): - '''texts: df of article texts of complete data set as series, - return dict of words with their count. + def make_dict_common_words(df_matrix, n=200, rel_freq=True, stemming=True): + '''params: DataFrame document term matrix of complete data set, + number of n most common words. + returns: dict of words with their count. 
''' + print('# making dictionary of most common words...') + print() + # words under that rel_freq limit are not included - limit = 0.0005 + # set limit + limit = 0.001 if not rel_freq: - limit = 25 + limit = len(df_matrix) * 0.001 + # word => count dict = {} - vocab = BagOfWords.make_vocab(texts, stemming) - # calculate document term matrix - df_matrix = BagOfWords.make_matrix(texts, vocab, rel_freq, stemming) - print(df_matrix.shape) + # iterate over words for column in df_matrix: # count word mentions in total @@ -224,16 +217,23 @@ class BagOfWords: # sort dict by value and o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\ reverse=True)) + print(o_dict) # return n higest values as dict (word => count) n_dict = {} + for i in range(n): - n_dict[o_dict.popitem(last=False)[0]] = o_dict.popitem(last=False)[1] + # next highest score + next_highest = o_dict.popitem(last=False) + n_dict[next_highest[0]] = next_highest[1] + return n_dict def count_features(texts, stemming=True): + ''' count total number of features in textual corpus + ''' print('# counting all features in corpus...') print() - vocab = BagOfWords.make_vocab(texts, True) + vocab = BagOfWords.make_vocab(texts, stemming) return len(vocab) def count_all_words(texts): @@ -244,26 +244,37 @@ class BagOfWords: sum += len(text.split()) return sum + def test(): + file = 'data\\interactive_labeling_dataset_without_header.csv' + df_dataset = pd.read_csv(file, + delimiter='|', + header=None, + index_col=None, + engine='python', + usecols=[1,2], + nrows=100, + quoting=csv.QUOTE_NONNUMERIC, + quotechar='\'') + + corpus = df_dataset[1] + '. ' + df_dataset[2] + stemming = True + rel_freq = True + extracted_words = BagOfWords.extract_all_words(corpus, stemming) + vocab = BagOfWords.make_vocab(extracted_words, stemming) + #print(vocab) + for text in corpus: + print(text) + print() + print() + # ab hier ValueError bei nrows=10000... + matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming) + dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming) + print(dict) + if __name__ == '__main__': - - # load new data set - file = 'data\\interactive_labeling_dataset_without_header.csv' - df_dataset = pd.read_csv(file, - delimiter='|', - header=None, - index_col=None, - engine='python', - usecols=[1,2], - nrows=3000, - quoting=csv.QUOTE_NONNUMERIC, - quotechar='\'') - - # find most common words in dataset - corpus = df_dataset[1] + '. ' + df_dataset[2] - stemming = False - rel_freq = False - vocab = BagOfWords.make_vocab(corpus, stemming) - - # print(BagOfWords.make_matrix(corpus, vocab, False, stemming)) - print(BagOfWords.make_dict_common_words(corpus, rel_freq, stemming, 200)) - # print(BagOfWords.count_features(corpus)) \ No newline at end of file + for word in sorted(BagOfWords.set_stop_words(False)): + print(word) + print() + print(PorterStemmer().stem(word)) + print() + # BagOfWords.test() \ No newline at end of file diff --git a/CosineSimilarity.py b/CosineSimilarity.py index c036b71..4fcea9f 100644 --- a/CosineSimilarity.py +++ b/CosineSimilarity.py @@ -4,74 +4,77 @@ Cosine Similarity CosineSimilarity measures the similarity between to articles. It calculates c: the cosine of the angle between the articles -vectors dict_1 and dict_2. -c = (dict_1 * dict_2) / (|dict_1| * |dict_2|). +vectors text_1 and text_2. +c = (text_1 * text_2) / (|text_1| * |text_2|). 
c = 1, if articles are equal => identicalness is 100% 0 > c > 1, else => identicalness is (c*100)% (The greater c, the more similar two articles are.) ''' +from BagOfWords import BagOfWords -#TODO:uses dictionaries of each article -#=>ToDo:has to be changed as we are now using vectors - +import csv import math -from BagOfWords import BagOfWords +import pandas as pd class CosineSimilarity: - def cos_sim(dict_1, dict_2): + def calc_similarity(text_1, text_2, rel_freq=True, stemming=True): + ''' calculates cosine similarity of two input articles + ''' + print('# calculating cosine similarity...') + print() - # list of all different words - vocab = [] + # extract words from articles + extracted_words_1 = BagOfWords.extract_words(text_1, stemming) + extracted_words_2 = BagOfWords.extract_words(text_2, stemming) + print(extracted_words_1) + print(extracted_words_2) - # insert words of 1st article into vocab - for key in dict_1.keys(): - if key not in vocab: - vocab.append(key) - - # insert words of 2nd article into vocab - for key in dict_2.keys(): - if key not in vocab: - vocab.append(key) - - # delete first entry ('sum_words') - vocab.pop(0) + # insert words into vocab + both_extracted = [] + both_extracted.append(extracted_words_1) + both_extracted.append(extracted_words_2) + vocab = BagOfWords.make_vocab(both_extracted, stemming) # create vectors - vector_1 = CosineSimilarity.create_vector(dict_1, vocab) - vector_2 = CosineSimilarity.create_vector(dict_2, vocab) + matrix = BagOfWords.make_matrix(both_extracted, vocab,\ + rel_freq, stemming) # start calculation # calculate numerator of formula sum_1 = 0 - for i in range (0,len(vector_1)): - sum_1 += vector_1[i] * vector_2[i] + for i in range (0,len(matrix.iloc[0])): + sum_1 += matrix.iloc[0][i] * matrix.iloc[1][i] # calculate denominator of formula sum_2 = 0 - for entry in vector_1: + for entry in matrix.iloc[0]: sum_2 += entry ** 2 sum_3 = 0 - for entry in vector_2: + for entry in matrix.iloc[1]: sum_3 += entry ** 2 return sum_1 / (math.sqrt(sum_2) * math.sqrt(sum_3)) - def create_vector(dict, vocab): - # word frequency vector - vector = [] - for word in vocab: - # check if word occurs in article - if word in dict: - # insert word count - vector.append(dict[word]) - else: - # insert zero - vector.append(0) - # delete first entry ('sum_words') - vector.pop(0) - return vector \ No newline at end of file +if __name__ == '__main__': + # read data set + file = 'data\\interactive_labeling_dataset_without_header.csv' + df = pd.read_csv(file, + delimiter='|', + header=None, + index_col=None, + engine='python', + usecols=[1,2], + nrows=100, + quoting=csv.QUOTE_NONNUMERIC, + quotechar='\'') + + texts = df[1] + '. ' + df[2] + + # compare first and second article in data set + print(CosineSimilarity.calc_similarity(texts.iloc[0], texts.iloc[1],\ + rel_freq=True, stemming=True)) \ No newline at end of file diff --git a/DecisionTree.py b/DecisionTree.py index f4a45a2..5343679 100644 --- a/DecisionTree.py +++ b/DecisionTree.py @@ -7,6 +7,9 @@ array X of size [n_samples, n_features], holding the training samples, and array y of integer values, size [n_samples], holding the class labels for the training samples. ''' +# toDo: replace old dataset!!! 
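
The loop-based computation in CosineSimilarity.calc_similarity above implements c = (v1 . v2) / (|v1| * |v2|) over the two rows of the document term matrix. As a minimal cross-check sketch (cosine_from_matrix and bow_matrix are hypothetical names, with bow_matrix assumed to be the two-row DataFrame returned by BagOfWords.make_matrix), the same value can be computed directly with numpy:

import numpy as np

def cosine_from_matrix(bow_matrix):
    # rows 0 and 1 are the word-frequency vectors of the two articles
    v1 = np.asarray(bow_matrix.iloc[0], dtype=float)
    v2 = np.asarray(bow_matrix.iloc[1], dtype=float)
    # c = (v1 . v2) / (|v1| * |v2|); c == 1.0 for identical articles
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

Since make_matrix divides every entry by the same total word count when rel_freq=True, that uniform scaling cancels out of the cosine, so relative and absolute frequencies yield the same similarity value.
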
+# CountVectorizer funktioniert noch nicht + from BagOfWords import BagOfWords import csv @@ -16,21 +19,22 @@ import graphviz import numpy as np import pandas as pd from sklearn import tree -#from sklearn.feature_extraction.text import CountVectorizer +# from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_selection import SelectPercentile from sklearn.metrics import f1_score from sklearn.model_selection import StratifiedKFold class DecisionTree: - def make_tree(dataset): + def make_tree(dataset, sklearn_cv=False, stemming=False, percentile=100): print('# fitting model') print('# ...') X = dataset['Title'] + ' ' + dataset['Text'] y = dataset['Label'] - #count_vector = CountVectorizer() + if sklearn_cv: + cv = CountVectorizer() # use stratified k-fold cross-validation as split method skf = StratifiedKFold(n_splits = 10, shuffle=True) @@ -45,33 +49,48 @@ class DecisionTree: important_words = {} # for each fold + n = 0 for train, test in skf.split(X,y): - # BOW - vocab = BagOfWords.make_vocab(X[train]) - # fit the training data and then return the matrix - training_data = BagOfWords.make_matrix(X[train], vocab) - # transform testing data and return the matrix - testing_data = BagOfWords.make_matrix(X[test], vocab) + n += 1 + vocab = [] + print('# split no. ' + str(n)) - # #fit the training data and then return the matrix - # training_data = count_vector.fit_transform(X[train], y[train]).toarray() - # #transform testing data and return the matrix - # testing_data = count_vector.transform(X[test]).toarray() + if sklearn_cv: + # use sklearn CountVectorizer + # fit the training data and then return the matrix + training_data = cv.fit_transform(X[train], y[train]).toarray() + # transform testing data and return the matrix + testing_data = cv.transform(X[test]).toarray() + else: + # use my own BagOfWords python implementation + rel_freq = True + extracted_words = BagOfWords.extract_all_words(X[train], stemming) + vocab = BagOfWords.make_vocab(extracted_words, stemming) + print(vocab) - # # apply select percentile - # selector = SelectPercentile(percentile=25) - # selector.fit(training_data, y[train]) + # fit the training data and then return the matrix + training_data = BagOfWords.make_matrix(extracted_words, + vocab, rel_freq, stemming) + # transform testing data and return the matrix + extracted_words = BagOfWords.extract_all_words(X[test], stemming) + testing_data = BagOfWords.make_matrix(extracted_words, + vocab, rel_freq, stemming) - # training_data_r = selector.transform(training_data) - # testing_data_r = selector.transform(testing_data) + # apply select percentile + selector = SelectPercentile(percentile=percentile) + selector.fit(training_data, y[train]) + + # new reduced data sets + training_data_r = selector.transform(training_data) + testing_data_r = selector.transform(testing_data) # fit classifier - classifier.fit(training_data, y[train]) + classifier.fit(training_data_r, y[train]) #predict class - predictions_train = classifier.predict(training_data) - predictions_test = classifier.predict(testing_data) + predictions_train = classifier.predict(training_data_r) + predictions_test = classifier.predict(testing_data_r) #store metrics predicted on test/train set f1_scores.append(f1_score(y[test], predictions_test)) @@ -80,6 +99,7 @@ class DecisionTree: # search for important features feature_importances = np.array(classifier.feature_importances_) important_indices = feature_importances.argsort()[-50:][::-1] + print(important_indices) for i in important_indices: 
if vocab[i] in important_words: diff --git a/FilterKeywords.py b/FilterKeywords.py index e5f4887..dccd886 100644 --- a/FilterKeywords.py +++ b/FilterKeywords.py @@ -6,6 +6,8 @@ FilterKeywords searches for merger specific keywords in an article and counts them. ''' +# toDo: replace dict by vector/matrix + from collections import defaultdict import re @@ -18,14 +20,6 @@ class FilterKeywords: output are the contained keywords and their count. ''' - # # list of regular expressions that match merger specific keywords - # regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?', - # r'business combinations?', r'combined compan(y|ies)', - # r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up', - # r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?', - # r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?', - # r'purchase', r'(sell(s|ers?|ing)?|sold)'] - keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers', 'acquisition', 'acquire', 'acquisitions', 'acquires', 'combine', 'combines', 'combination', 'combined', @@ -44,22 +38,22 @@ class FilterKeywords: # remove duplicates keywords = set(keyword_list) - # counts keywords in article (default value: 0) - dict_keywords = defaultdict(int) + # # counts keywords in article (default value: 0) + # dict_keywords = defaultdict(int) - # search for matchings in dictionary of input article - for key in dict_input.keys(): - # iterate over all regular expressions - for kword in keywords: - if re.match(kword, key): - # if match, increase value of matching key - if str(kword) in dict_keywords: - dict_keywords[str(kword)] += dict_input[key] - else: - dict_keywords[str(kword)] = dict_input[key] + # # search for matchings in dictionary of input article + # for key in dict_input.keys(): + # # iterate over all regular expressions + # for kword in keywords: + # if re.match(kword, key): + # # if match, increase value of matching key + # if str(kword) in dict_keywords: + # dict_keywords[str(kword)] += dict_input[key] + # else: + # dict_keywords[str(kword)] = dict_input[key] - return dict_keywords + # return dict_keywords if __name__ == '__main__': - dict_test={'example':2, 'combined':5, 'sells':3} - print(FilterKeywords.search_keywords(dict_test)) \ No newline at end of file + # dict_test={'example':2, 'combined':5, 'sells':3} + # print(FilterKeywords.search_keywords(dict_test)) \ No newline at end of file diff --git a/NaiveBayes.py b/NaiveBayes.py index e5a25f9..7d2d7cb 100644 --- a/NaiveBayes.py +++ b/NaiveBayes.py @@ -25,7 +25,7 @@ from sklearn.naive_bayes import GaussianNB class NaiveBayes: - def make_naive_bayes(dataset): + def make_naive_bayes(dataset, sklearn_cv=True, percentile=100): '''fits naive bayes model with StratifiedKFold, uses my BOW ''' @@ -34,10 +34,11 @@ class NaiveBayes: # split data into text and label set # join title and text - X = dataset['Title'] + ' ' + dataset['Text'] + X = dataset['Title'] + '. ' + dataset['Text'] y = dataset['Label'] - cv = CountVectorizer() + if sklearn_cv: + cv = CountVectorizer() # use stratified k-fold cross-validation as split method skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5) @@ -61,23 +62,32 @@ class NaiveBayes: n += 1 print('# split no. 
' + str(n)) - # # eigenes BOW - # vocab = BagOfWords.make_vocab(X[train]) - # # fit the training data and then return the matrix - # training_data = BagOfWords.make_matrix(X[train], vocab) - # # transform testing data and return the matrix - # testing_data = BagOfWords.make_matrix(X[test], vocab) + if sklearn_cv: + # use sklearn CountVectorizer + # fit the training data and then return the matrix + training_data = cv.fit_transform(X[train], y[train]).toarray() + # transform testing data and return the matrix + testing_data = cv.transform(X[test]).toarray() + else: + # use my own BagOfWords python implementation + stemming = True + rel_freq = True + extracted_words = BagOfWords.extract_all_words(X[train]) + vocab = BagOfWords.make_vocab(extracted_words) - # using CountVectorizer: - # fit the training data and then return the matrix - training_data = cv.fit_transform(X[train], y[train]).toarray() - # transform testing data and return the matrix - testing_data = cv.transform(X[test]).toarray() + # fit the training data and then return the matrix + training_data = BagOfWords.make_matrix(extracted_words, + vocab, rel_freq, stemming) + # transform testing data and return the matrix + extracted_words = BagOfWords.extract_all_words(X[test]) + testing_data = BagOfWords.make_matrix(extracted_words, + vocab, rel_freq, stemming) # apply select percentile - selector = SelectPercentile(percentile=100) + selector = SelectPercentile(percentile=percentile) selector.fit(training_data, y[train]) + # new reduced data sets training_data_r = selector.transform(training_data) testing_data_r = selector.transform(testing_data) diff --git a/NaiveBayes_Interactive.py b/NaiveBayes_Interactive.py index a09439d..2ba1778 100644 --- a/NaiveBayes_Interactive.py +++ b/NaiveBayes_Interactive.py @@ -10,13 +10,14 @@ import csv import pandas as pd from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_selection import SelectPercentile from sklearn.metrics import recall_score, precision_score from sklearn.model_selection import StratifiedKFold from sklearn.naive_bayes import GaussianNB class NaiveBayes_Interactive: - def make_naive_bayes(dataset): + def make_naive_bayes(dataset, sklearn_cv=True, percentile=100): '''fits naive bayes model ''' print('# fitting model') @@ -24,10 +25,11 @@ class NaiveBayes_Interactive: # split data into text and label set # join title and text - X = dataset['Title'] + ' ' + dataset['Text'] + X = dataset['Title'] + '. ' + dataset['Text'] y = dataset['Label'] - cv = CountVectorizer() + if sklearn_cv: + cv = CountVectorizer() # stratified k-fold cross-validation as split method kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5) @@ -51,17 +53,40 @@ class NaiveBayes_Interactive: n += 1 print('# split no. 
' + str(n)) - # using CountVectorizer: - # fit the training data and then return the matrix - training_data = cv.fit_transform(X[train], y[train]).toarray() - # transform testing data and return the matrix - testing_data = cv.transform(X[test]).toarray() + if sklearn_cv: + # use sklearn CountVectorizer + # fit the training data and then return the matrix + training_data = cv.fit_transform(X[train], y[train]).toarray() + # transform testing data and return the matrix + testing_data = cv.transform(X[test]).toarray() + else: + # use my own BagOfWords python implementation + stemming = True + rel_freq = True + extracted_words = BagOfWords.extract_all_words(X[train]) + vocab = BagOfWords.make_vocab(extracted_words) + + # fit the training data and then return the matrix + training_data = BagOfWords.make_matrix(extracted_words, + vocab, rel_freq, stemming) + # transform testing data and return the matrix + extracted_words = BagOfWords.extract_all_words(X[test]) + testing_data = BagOfWords.make_matrix(extracted_words, + vocab, rel_freq, stemming) + + # apply select percentile + selector = SelectPercentile(percentile=percentile) + selector.fit(training_data, y[train]) + + # new reduced data sets + training_data_r = selector.transform(training_data) + testing_data_r = selector.transform(testing_data) #fit classifier - classifier.fit(training_data, y[train]) + classifier.fit(training_data_r, y[train]) #predict class - predictions_train = classifier.predict(training_data) - predictions_test = classifier.predict(testing_data) + predictions_train = classifier.predict(training_data_r) + predictions_test = classifier.predict(testing_data_r) #print and store metrics rec = recall_score(y[test], predictions_test) @@ -166,7 +191,9 @@ class NaiveBayes_Interactive: quotechar='\'', quoting=csv.QUOTE_NONE) - make_naive_bayes(data) + use_count_vectorizer = True + select_percentile = 100 + make_naive_bayes(data, use_count_vectorizer, select_percentile) print('#') print('# ending naive bayes') \ No newline at end of file diff --git a/SVM.py b/SVM.py index 80532e1..1f69ad6 100644 --- a/SVM.py +++ b/SVM.py @@ -27,7 +27,7 @@ from sklearn.svm import SVC class SVM: - def make_svm(dataset): + def make_svm(dataset, sklearn_cv=True): print('# fitting model') print('# ...') @@ -35,16 +35,18 @@ class SVM: # split data into text and label set # articles' text (title + text) - X = dataset['Title'] + ' ' + dataset['Text'] + X = dataset['Title'] + '. 
' + dataset['Text'] # articles' labels y = dataset['Label'] + matrix = pd.DataFrame() - # Bag of Words - print('# calculating bag of words') - print('# ...') # fit the training data and then return the matrix - #X = BagOfWords.fit_transform(X) - X = CountVectorizer().fit_transform(X).toarray() + if sklearn_cv: + # use sklearn CountVectorizer + matrix = CountVectorizer().fit_transform(X).toarray() + else: + # use own BOW implementation + matrix = BagOfWords.fit_transform(X) # use stratified k-fold cross-validation as split method skf = StratifiedKFold(n_splits = 10, shuffle=True) @@ -64,7 +66,7 @@ class SVM: print('# fit classifier') print('# ...') - grid.fit(X,y) + grid.fit(matrix,y) # DataFrame of results df_results = grid.cv_results_ @@ -104,6 +106,7 @@ class SVM: quotechar='\'', quoting=csv.QUOTE_NONE) - make_svm(data) + use_count_vectorizer = True + make_svm(data, use_count_vectorizer) print('# ending svm') \ No newline at end of file diff --git a/VisualizerNews.py b/VisualizerNews.py index 31724ab..749052b 100644 --- a/VisualizerNews.py +++ b/VisualizerNews.py @@ -22,7 +22,7 @@ class VisualizerNews: def plot_wordcloud_dataset(): '''plots word cloud image of most common words in dataset. ''' - print('# preparing word cloud...') + print('# preparing word cloud of 200 most common words...') print() # load new data set file = 'data\\interactive_labeling_dataset_without_header.csv' @@ -32,17 +32,18 @@ class VisualizerNews: index_col=None, engine='python', usecols=[1,2], - #nrows=100, quoting=csv.QUOTE_NONNUMERIC, quotechar='\'') - corpus = df_dataset[1] + ' ' + df_dataset[2] + corpus = df_dataset[1] + '. ' + df_dataset[2] + stemming = False + rel_freq = False # find most common words in dataset - dict = BagOfWords.make_dict_common_words(corpus, - rel_freq=True, - stemming=False, - n=200) + extracted_words = BagOfWords.extract_all_words(corpus, stemming) + vocab = BagOfWords.make_vocab(extracted_words, stemming) + matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming) + dict = BagOfWords.make_dict_common_words(matrix, 200, rel_freq, stemming) wordcloud = WordCloud(background_color='white', width=2400, @@ -62,30 +63,25 @@ class VisualizerNews: x-axis: number of mentions of the company y-axis: frequency ''' - print('# preparing histogram...') + print('# preparing histogram of company mentions...') print() - # old data set - filepath = 'data\\classification_labelled_corrected.csv' - df = pd.read_csv(filepath, - sep='|', + # read data set + file = 'data\\interactive_labeling_dataset_without_header.csv' + df = pd.read_csv(file, + delimiter='|', + header=None, + index_col=None, engine='python', - decimal='.', - quotechar='\'', - quoting=csv.QUOTE_NONE) + usecols=[1,2], + quoting=csv.QUOTE_NONNUMERIC, + quotechar='\'') - # only articles with label==1 - df_hits = df[df['Label'] == 1] + # # only articles with label==1 + # df_hits = df[df['Label'] == 1] + # texts = df_hits['Title'] + '. ' + df_hits['Text'] + texts = df[1] + '. ' + df[2] - texts = df_hits['Title'] + '. 
' + df_hits['Text'] - - # # zum prüfen lesen - # for text in texts[10:20]: - # print(text) - # print() - # print(NER.find_companies(text)) - # print() - - # count names in hit articles + # dict: count articles with company names count_names = NER.count_companies(texts) # sort list in descending order @@ -98,7 +94,7 @@ class VisualizerNews: plt.ylabel('Number of companies with this number of articles') num_bins = 50 n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5) - # plt.grid(True) + plt.axis([0, 50, 0, 1000]) plt.show() def plot_histogram_text_lengths(): @@ -106,20 +102,21 @@ class VisualizerNews: x-axis: number of characters in article (without headline) y-axis: frequency ''' - print('# preparing histogram...') + print('# preparing histogram of text lengths...') print() - # new data set + # read data set filepath = 'data\\interactive_labeling_dataset.csv' df_dataset = pd.read_csv(filepath, delimiter='|', header=0, index_col=None, engine='python', + usecols=[2], #nrows=100, quoting=csv.QUOTE_NONNUMERIC, quotechar='\'') # consider only Text, not Headline - texts = df_dataset['Text'] + texts = df_dataset[2] # count characters in articles print('# counting characters in articles...') @@ -150,7 +147,7 @@ class VisualizerNews: def plot_pie_chart_of_sites(): - print('# preparing pie chart...') + print('# preparing pie chart of news article sites...') print() # load data set @@ -164,13 +161,15 @@ class VisualizerNews: #nrows=100, quoting=csv.QUOTE_NONNUMERIC, quotechar='\'') - + # find all different sites df_counts = df_dataset.groupby('Site').count() + # count occurences of each site df_counts = df_counts.sort_values(['Url'], ascending=False) fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal")) data = list(df_counts['Url']) + # legend labels labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)', 'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)'] @@ -188,14 +187,14 @@ class VisualizerNews: plt.show() def plot_hist_most_common_words(n_commons = 10): - print('# preparing histogram...') + print('# preparing histogram of most common words...') print() # load data set filepath = 'data\\interactive_labeling_dataset_without_header.csv' df_dataset = pd.read_csv(filepath, delimiter='|', header=None, - #usecols=[1,2], + usecols=[1,2], index_col=None, engine='python', #nrows=1000, @@ -204,11 +203,14 @@ class VisualizerNews: corpus = df_dataset[1] + '. ' + df_dataset[2] + stemming = False + rel_freq = True + # find most common words in dataset - dict = BagOfWords.make_dict_common_words(corpus, - rel_freq=True, - stemming=False, - n=n_commons) + extracted_words = BagOfWords.extract_all_words(corpus, stemming) + vocab = BagOfWords.make_vocab(extracted_words, stemming) + matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming) + dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq, stemming) plt.xlabel('Most common words in textual corpus') plt.ylabel('Relative frequency')
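
The refactored BagOfWords pipeline used throughout this patch composes as extract_all_words, then make_vocab, then make_matrix, then make_dict_common_words. A minimal usage sketch, mirroring BagOfWords.test() above but on a made-up two-article corpus instead of the CSV file (the toy sentences and variable names are illustrative assumptions only):

from BagOfWords import BagOfWords

# toy corpus standing in for the Title + '. ' + Text columns used above
corpus = ['Company A agrees to acquire company B.',
          'Company C reports quarterly earnings and merger talks.']
stemming = True
rel_freq = True

# one list of extracted word stems per article
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
# global vocabulary over all articles
vocab = BagOfWords.make_vocab(extracted_words, stemming)
# document term matrix with relative word frequencies
matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
# five most common words and their summed relative frequencies
common_words = BagOfWords.make_dict_common_words(matrix, 5, rel_freq, stemming)
print(common_words)

As the BagOfWords module docstring notes, the multinomial Naive Bayes classifier formally expects integer counts, but the fractional counts produced with rel_freq=True usually work as well; pass rel_freq=False wherever absolute counts are preferred.
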