deleted .gitignore

2018-09-14 09:19:12 +02:00 · 2018-09-14 09:19:12 +02:00 · 0b424835d8
commit 0b424835d8
parent 52146158e2
5 changed files with 108 additions and 292 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,221 +0,0 @@
-# Byte-compiled / optimized / DLL files
-
-__pycache__/
-
-*.py[cod]
-
-*$py.class
-
-
-
-# C extensions
-
-*.so
-
-
-
-# Distribution / packaging
-
-.Python
-
-build/
-
-develop-eggs/
-
-dist/
-
-downloads/
-
-eggs/
-
-.eggs/
-
-lib/
-
-lib64/
-
-parts/
-
-sdist/
-
-var/
-
-wheels/
-
-*.egg-info/
-
-.installed.cfg
-
-*.egg
-
-MANIFEST
-
-
-
-# PyInstaller
-
-#  Usually these files are written by a python script from a template
-
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-
-*.manifest
-
-*.spec
-
-
-
-# Installer logs
-
-pip-log.txt
-
-pip-delete-this-directory.txt
-
-
-
-# Unit test / coverage reports
-
-htmlcov/
-
-.tox/
-
-.nox/
-
-.coverage
-
-.coverage.*
-
-.cache
-
-nosetests.xml
-
-coverage.xml
-
-*.cover
-
-.hypothesis/
-
-.pytest_cache/
-
-
-
-# Translations
-
-*.mo
-
-*.pot
-
-
-
-# Django stuff:
-
-*.log
-
-local_settings.py
-
-db.sqlite3
-
-
-
-# Flask stuff:
-
-instance/
-
-.webassets-cache
-
-
-
-# Scrapy stuff:
-
-.scrapy
-
-
-
-# Sphinx documentation
-
-docs/_build/
-
-
-
-# PyBuilder
-
-target/
-
-
-
-# Jupyter Notebook
-
-.ipynb_checkpoints
-
-
-
-# IPython
-
-profile_default/
-
-ipython_config.py
-
-
-
-# pyenv
-
-.python-version
-
-
-
-# celery beat schedule file
-
-celerybeat-schedule
-
-
-
-# SageMath parsed files
-
-*.sage.py
-
-
-
-# Environments
-
-.env
-
-.venv
-
-env/
-
-venv/
-
-ENV/
-
-env.bak/
-
-venv.bak/
-
-
-
-# Spyder project settings
-
-.spyderproject
-
-.spyproject
-
-
-
-# Rope project settings
-
-.ropeproject
-
-
-
-# mkdocs documentation
-
-/site
-
-
-
-# mypy
-
-.mypy_cache/
-
-.dmypy.json
-
-dmypy.json
--- a/BagOfWords.py
+++ b/BagOfWords.py
@ -4,6 +4,15 @@ Bag Of Words

 BagOfWords counts word stems in an article
 and adds new words to the global vocabulary. 
+
+Anm.:
+The multinomial Naive Bayes classifier is suitable 
+for classification with discrete features (e.g., 
+word counts for text classification). 
+The multinomial distribution normally requires 
+integer feature counts. However, in practice, 
+fractional counts such as tf-idf may also work.
+=> durch 'relative_word_frequencies' als Parameter berücksichtigt
 '''

 import re
@ -50,12 +59,15 @@ class BagOfWords:
        word = stemmer.stem(word)           
        return word
        
-    def make_matrix(series, vocab, relative_word_frequencies):
+    def make_matrix(series, vocab, relative_word_frequencies=True):
        '''calculates word stem frequencies in input articles.
        returns matrix (DataFrame) with relative word frequencies 
-        (0 <= values < 1) or absolute word frequencies (int).
+        (0 <= values < 1) if relative_word_frequencies=True or absolute
+        word frequencies (int) if relative_word_frequencies=False.
        (rows: different articles, columns: different words in vocab)
        '''
+        print('# BOW: calculating matrix')
+        print('#')
        # create list of tuples
        vectors = []       
        for i in range(len(series)):
@ -88,6 +100,8 @@ class BagOfWords:
        '''adds words of input articles to a global vocabulary.
        input: dataframe of all articles, return value: list of words
        '''
+        print('# BOW: making vocabulary of data set')
+        print('#')
        vocab = set()
        for text in series:
            vocab |= set(BagOfWords.extract_words(text))
--- a/NaiveBayes.py
+++ b/NaiveBayes.py
@ -12,96 +12,119 @@ independently to the probability that it belongs to its category,
 regardless of any possible correlations between these features. 
 '''

-#!!
-# The multinomial Naive Bayes classifier is suitable 
-#for classification with discrete features (e.g., 
-#word counts for text classification). 
-#The multinomial distribution normally requires 
-#integer feature counts. However, in practice, 
-#fractional counts such as tf-idf may also work.
-
-# => nur bei eigenem BOW berücksichtigt
-
 from BagOfWords import BagOfWords

 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
-from sklearn.metrics import f1_score, make_scorer
+from sklearn.metrics import recall_score, precision_score
 from sklearn.model_selection import StratifiedKFold
-from sklearn.model_selection import GridSearchCV
-from sklearn.pipeline import Pipeline
-from sklearn.naive_bayes import MultinomialNB
-
-# MultinomialNB statt GaussianNB benutzt => OK?
-#from sklearn.naive_bayes import GaussianNB
+from sklearn.naive_bayes import GaussianNB

 class NaiveBayes:

    def make_naive_bayes(dataset):
-        '''fits naive bayes model
+        '''fits naive bayes model with StratifiedKFold, 
+        uses my BOW
        '''           
        print('# starting naive bayes')
        print('#')
        
        # split data into text and label set
+        # join title and text
        X = dataset['Title'] + ' ' + dataset['Text']        
        y = dataset['Label']
        
-        # Bag of Words
-        print('# calculating bag of words')
-        print('#')
-        
-        # fit the training data and then return the matrix     
-        
-        # toDO: warum so andere (schlechte) werte mit meinem BOW?
-        #X = BagOfWords.fit_transform(X, False)
-        
-        X = CountVectorizer().fit_transform(X).toarray()
+        cv = CountVectorizer()
        
        # use stratified k-fold cross-validation as split method
-        skf = StratifiedKFold(n_splits = 10, shuffle=True)        
-
-        # use only most important features
-        selector = SelectPercentile()  
+        skf = StratifiedKFold(n_splits = 10, shuffle=True)      
        
-        pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])
+        classifier = GaussianNB()    
        
-        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
-                                       'NB__alpha': [0.00000001, 0.0000001, 
-                                                     0.000001, 0.00001, 
-                                                     0.0001, 0.001, 0.01, 
-                                                     0.1]},
-                                       cv=skf, 
-                                       scoring=make_scorer(f1_score))
+        # lists for metrics
+        recall_scores = []
+        precision_scores = []
+        f1_scores = []
+        
+        # for each fold
+        n = 0
+        for train, test in skf.split(X,y):  
+        
+            n += 1
+            print('# split no. ' + str(n))
            
-        print('# fit classifier')
-        print('#') 
-  
-        grid.fit(X,y)
-        
-        # DataFrame of results
-        df_results = grid.cv_results_
-        
-        # print results
-        ######################
-        print('RESULTS:')
-        print('#')
-        print('mean_test_score:')
-        print(df_results['mean_test_score'])
-        print('#')
-        print('mean of means:')
-        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
-        print('#')
-        print('best score:') 
-        print(grid.best_score_)
-        print('#')
-        print('best parameters set found on development set:')
-        print(grid.best_params_)
-        print('#')
+            # # eigenes BOW => schlechtere ergebnisse
+            # vocab = BagOfWords.make_vocab(X[train])   
+            # # fit the training data and then return the matrix
+            # training_data = BagOfWords.make_matrix(X[train], vocab)            
+            # # transform testing data and return the matrix
+            # testing_data = BagOfWords.make_matrix(X[test], vocab)
+            
+            # # using CountVectorizer:
+            # fit the training data and then return the matrix
+            training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # transform testing data and return the matrix
+            testing_data = cv.transform(X[test]).toarray()    
+            
+            # # apply select percentile
+            # selector = SelectPercentile(percentile=25)           
+            # selector.fit(training_data, y[train])
+            
+            # training_data_r = selector.transform(training_data)           
+            # testing_data_r = selector.transform(testing_data)
+            
+            # #fit classifier
+            # classifier.fit(training_data_r, y[train])            
+            # #predict class                      
+            # predictions_train = classifier.predict(training_data_r)
+            # predictions_test = classifier.predict(testing_data_r)
+            
+            #fit classifier
+            classifier.fit(training_data, y[train])            
+            #predict class                      
+            predictions_train = classifier.predict(training_data)
+            predictions_test = classifier.predict(testing_data)
+            
+            #print and store metrics
+            rec = recall_score(y[test], predictions_test)
+            print('rec: ' + str(rec))
+            recall_scores.append(rec)  
+            prec = precision_score(y[train], predictions_train)
+            print('prec: ' + str(prec))
+            print('#')
+            precision_scores.append(prec)
+            # equation for f1 score
+            f1_scores.append(2 * (prec * rec)/(prec + rec))
+          
+        ##########################
+        #print metrics of test set    
+        print('-------------------------')
+        print('prediction of testing set:')
+        print('Precision score: min = {}, max = {}, average = {}'
+                .format(min(precision_scores),
+                        max(precision_scores),
+                        sum(precision_scores)/float(len(precision_scores))))      
+        print('Recall score: min = {}, max = {}, average = {}'
+                .format(min(recall_scores),
+                        max(recall_scores),
+                        sum(recall_scores)/float(len(recall_scores))))      
+        print('F1 score: min = {}, max = {}, average = {}'
+                .format(min(f1_scores), 
+                        max(f1_scores),
+                        sum(f1_scores)/float(len(f1_scores))))    
+        print()
        
+        ##### nur für overfit testing ###########
+        #print('overfit testing: prediction of training set')
+        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
+        #format(min(f1_scores_train), max(f1_scores_train),
+        #sum(f1_scores_train)/float(len(f1_scores_train))))
+        #print() 
+
        print('# ending naive bayes')
-        print('#')
+        print('#') 
        
+    ######## nur für resubstitutionsfehler benötigt ########       
    def analyze_errors(dataset):
        '''calculates resubstitution error
        shows indices of false classified articles
--- a/SVM.py
+++ b/SVM.py
@ -51,10 +51,10 @@ class SVM:
        
        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
        
-        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],                                                          
+        grid = GridSearchCV(pipeline, {'perc__percentile': [50, 100],                                                          
                            'SVC__kernel': ['linear','poly','rbf','sigmoid'],
-                            'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1], 
-                            'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]}, 
+                            'SVC__gamma': [0.01, 0.1], 
+                            'SVC__C': [0.01, 0.1]}, 
                            cv=skf, 
                            scoring=make_scorer(f1_score))
            
--- a/Starter.py
+++ b/Starter.py
@ -27,6 +27,6 @@ dataset = CsvHandler.read_csv(file)

 # DecisionTree.make_tree(dataset)
 NaiveBayes.make_naive_bayes(dataset)
-SVM.make_svm(dataset)
+# SVM.make_svm(dataset)

 print('# ending program')