deleted .gitignore

Anne Lorenz 2018-09-14 09:19:12 +02:00
parent 52146158e2
commit 0b424835d8
5 changed files with 108 additions and 292 deletions

.gitignore (vendored, 221 deletions)

@@ -1,221 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json

BagOfWords.py

@@ -4,6 +4,15 @@ Bag Of Words
 BagOfWords counts word stems in an article
 and adds new words to the global vocabulary.
+
+Note:
+The multinomial Naive Bayes classifier is suitable
+for classification with discrete features (e.g.,
+word counts for text classification).
+The multinomial distribution normally requires
+integer feature counts. However, in practice,
+fractional counts such as tf-idf may also work.
+=> taken into account via the 'relative_word_frequencies' parameter
 '''

 import re
@@ -50,12 +59,15 @@ class BagOfWords:
         word = stemmer.stem(word)
         return word

-    def make_matrix(series, vocab, relative_word_frequencies):
+    def make_matrix(series, vocab, relative_word_frequencies=True):
         '''calculates word stem frequencies in input articles.
         returns matrix (DataFrame) with relative word frequencies
-        (0 <= values < 1) or absolute word frequencies (int).
+        (0 <= values < 1) if relative_word_frequencies=True or absolute
+        word frequencies (int) if relative_word_frequencies=False.
         (rows: different articles, columns: different words in vocab)
         '''
+        print('# BOW: calculating matrix')
+        print('#')
         # create list of tuples
         vectors = []
         for i in range(len(series)):
@@ -88,6 +100,8 @@ class BagOfWords:
         '''adds words of input articles to a global vocabulary.
         input: dataframe of all articles, return value: list of words
         '''
+        print('# BOW: making vocabulary of data set')
+        print('#')
         vocab = set()
         for text in series:
             vocab |= set(BagOfWords.extract_words(text))
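A minimal usage sketch of the new default, assuming only the BagOfWords interface visible in this diff (make_vocab and make_matrix); the two article strings and the pandas Series wrapper are invented for illustration:

import pandas as pd
from BagOfWords import BagOfWords

# invented example articles, not part of the repository's data set
articles = pd.Series([
    'Chipmaker Foo agrees takeover by Bar',
    'Foo shares rise after takeover rumours',
])

vocab = BagOfWords.make_vocab(articles)

# absolute word-stem counts (int), the form a multinomial model expects
abs_matrix = BagOfWords.make_matrix(articles, vocab, relative_word_frequencies=False)

# relative frequencies (0 <= value < 1), i.e. fractional counts
rel_matrix = BagOfWords.make_matrix(articles, vocab, relative_word_frequencies=True)

print(abs_matrix.shape, rel_matrix.shape)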

NaiveBayes.py

@@ -12,96 +12,119 @@ independently to the probability that it belongs to its category,
 regardless of any possible correlations between these features.
 '''
-#!!
-# The multinomial Naive Bayes classifier is suitable
-#for classification with discrete features (e.g.,
-#word counts for text classification).
-#The multinomial distribution normally requires
-#integer feature counts. However, in practice,
-#fractional counts such as tf-idf may also work.
-# => only taken into account with my own BOW

 from BagOfWords import BagOfWords

 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
-from sklearn.metrics import f1_score, make_scorer
+from sklearn.metrics import recall_score, precision_score
 from sklearn.model_selection import StratifiedKFold
-from sklearn.model_selection import GridSearchCV
-from sklearn.pipeline import Pipeline
-from sklearn.naive_bayes import MultinomialNB
-# MultinomialNB used instead of GaussianNB => OK?
-#from sklearn.naive_bayes import GaussianNB
+from sklearn.naive_bayes import GaussianNB

 class NaiveBayes:

     def make_naive_bayes(dataset):
-        '''fits naive bayes model
+        '''fits naive bayes model with StratifiedKFold,
+        uses my BOW
         '''
         print('# starting naive bayes')
         print('#')

         # split data into text and label set
+        # join title and text
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']

-        # Bag of Words
-        print('# calculating bag of words')
-        print('#')
-        # fit the training data and then return the matrix
-        # toDO: why such different (bad) values with my BOW?
-        #X = BagOfWords.fit_transform(X, False)
-        X = CountVectorizer().fit_transform(X).toarray()
+        cv = CountVectorizer()

         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)

-        # use only most important features
-        selector = SelectPercentile()
-
-        pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])
-
-        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
-                                       'NB__alpha': [0.00000001, 0.0000001,
-                                                     0.000001, 0.00001,
-                                                     0.0001, 0.001, 0.01,
-                                                     0.1]},
-                            cv=skf,
-                            scoring=make_scorer(f1_score))
-
-        print('# fit classifier')
-        print('#')
-
-        grid.fit(X,y)
-
-        # DataFrame of results
-        df_results = grid.cv_results_
-
-        # print results
-        ######################
-        print('RESULTS:')
-        print('#')
-        print('mean_test_score:')
-        print(df_results['mean_test_score'])
-        print('#')
-        print('mean of means:')
-        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
-        print('#')
-        print('best score:')
-        print(grid.best_score_)
-        print('#')
-        print('best parameters set found on development set:')
-        print(grid.best_params_)
-        print('#')
+        classifier = GaussianNB()
+
+        # lists for metrics
+        recall_scores = []
+        precision_scores = []
+        f1_scores = []
+
+        # for each fold
+        n = 0
+        for train, test in skf.split(X,y):
+            n += 1
+            print('# split no. ' + str(n))
+
+            # # own BOW => worse results
+            # vocab = BagOfWords.make_vocab(X[train])
+            # # fit the training data and then return the matrix
+            # training_data = BagOfWords.make_matrix(X[train], vocab)
+            # # transform testing data and return the matrix
+            # testing_data = BagOfWords.make_matrix(X[test], vocab)
+
+            # # using CountVectorizer:
+            # fit the training data and then return the matrix
+            training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # transform testing data and return the matrix
+            testing_data = cv.transform(X[test]).toarray()
+
+            # # apply select percentile
+            # selector = SelectPercentile(percentile=25)
+            # selector.fit(training_data, y[train])
+
+            # training_data_r = selector.transform(training_data)
+            # testing_data_r = selector.transform(testing_data)
+
+            # #fit classifier
+            # classifier.fit(training_data_r, y[train])
+            # #predict class
+            # predictions_train = classifier.predict(training_data_r)
+            # predictions_test = classifier.predict(testing_data_r)
+
+            #fit classifier
+            classifier.fit(training_data, y[train])
+            #predict class
+            predictions_train = classifier.predict(training_data)
+            predictions_test = classifier.predict(testing_data)
+
+            #print and store metrics
+            rec = recall_score(y[test], predictions_test)
+            print('rec: ' + str(rec))
+            recall_scores.append(rec)
+            prec = precision_score(y[train], predictions_train)
+            print('prec: ' + str(prec))
+            print('#')
+            precision_scores.append(prec)
+            # equation for f1 score
+            f1_scores.append(2 * (prec * rec)/(prec + rec))
+
+        ##########################
+        # print metrics of test set
+        print('-------------------------')
+        print('prediction of testing set:')
+        print('Precision score: min = {}, max = {}, average = {}'
+              .format(min(precision_scores),
+                      max(precision_scores),
+                      sum(precision_scores)/float(len(precision_scores))))
+        print('Recall score: min = {}, max = {}, average = {}'
+              .format(min(recall_scores),
+                      max(recall_scores),
+                      sum(recall_scores)/float(len(recall_scores))))
+        print('F1 score: min = {}, max = {}, average = {}'
+              .format(min(f1_scores),
+                      max(f1_scores),
+                      sum(f1_scores)/float(len(f1_scores))))
+        print()
+
+        ##### only for overfit testing ###########
+        #print('overfit testing: prediction of training set')
+        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
+        #format(min(f1_scores_train), max(f1_scores_train),
+        #sum(f1_scores_train)/float(len(f1_scores_train))))
+        #print()

         print('# ending naive bayes')
         print('#')

+    ######## only needed for resubstitution error ########
     def analyze_errors(dataset):
         '''calculates resubstitution error
         shows indices of false classified articles
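The rewritten make_naive_bayes evaluates one GaussianNB model per stratified fold instead of running a single grid search. Below is a condensed, self-contained sketch of that per-fold loop; the toy corpus, the binary 0/1 labels, and random_state are invented, and, unlike the committed code (which scores precision against the training fold), both recall and precision here are computed on the held-out fold, with sklearn's f1_score replacing the manual formula:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import recall_score, precision_score, f1_score

# invented toy corpus with binary labels (1 = merger-related, 0 = other)
X = pd.Series(['merger agreed', 'quarterly results', 'takeover bid', 'weather report'] * 5)
y = pd.Series([1, 0, 1, 0] * 5)

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
classifier = GaussianNB()
recalls, precisions, f1s = [], [], []

for train, test in skf.split(X, y):
    cv = CountVectorizer()
    # fit the vectorizer on the training fold only, then transform both folds
    training_data = cv.fit_transform(X.iloc[train]).toarray()
    testing_data = cv.transform(X.iloc[test]).toarray()

    classifier.fit(training_data, y.iloc[train])
    predictions_test = classifier.predict(testing_data)

    # all three metrics scored on the held-out fold
    recalls.append(recall_score(y.iloc[test], predictions_test))
    precisions.append(precision_score(y.iloc[test], predictions_test))
    f1s.append(f1_score(y.iloc[test], predictions_test))

print('recall avg:', sum(recalls) / len(recalls))
print('precision avg:', sum(precisions) / len(precisions))
print('F1 avg:', sum(f1s) / len(f1s))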

SVM.py (6 changed lines)

@@ -51,10 +51,10 @@ class SVM:
         pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

-        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
+        grid = GridSearchCV(pipeline, {'perc__percentile': [50, 100],
                                        'SVC__kernel': ['linear','poly','rbf','sigmoid'],
-                                       'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1],
-                                       'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]},
+                                       'SVC__gamma': [0.01, 0.1],
+                                       'SVC__C': [0.01, 0.1]},
                             cv=skf,
                             scoring=make_scorer(f1_score))
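For comparison, a self-contained sketch of the reduced grid wired into the same SelectPercentile + SVC pipeline; the make_classification toy data, the fold count, and random_state are stand-ins for the real Title/Text features that SVM.py builds elsewhere:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# invented binary-classification data in place of the article features
X, y = make_classification(n_samples=60, n_features=20, random_state=0)

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
pipeline = Pipeline([('perc', SelectPercentile()), ('SVC', SVC())])

# reduced search space from this commit: 2 * 4 * 2 * 2 = 32 candidates
grid = GridSearchCV(pipeline,
                    {'perc__percentile': [50, 100],
                     'SVC__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                     'SVC__gamma': [0.01, 0.1],
                     'SVC__C': [0.01, 0.1]},
                    cv=skf,
                    scoring=make_scorer(f1_score))

grid.fit(X, y)
print(grid.best_params_, grid.best_score_)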


@@ -27,6 +27,6 @@ dataset = CsvHandler.read_csv(file)
 # DecisionTree.make_tree(dataset)
 NaiveBayes.make_naive_bayes(dataset)
-SVM.make_svm(dataset)
+# SVM.make_svm(dataset)
 print('# ending program')