callable scripts

Anne Lorenz 2018-09-17 21:16:19 +02:00
parent ab578ae0c6
commit f934b5a1a0
8 changed files with 131 additions and 105 deletions
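The pattern is the same in every file below: the dataset loading and the call to the model-building function move from the top of the class body to a driver block at the end of the file, so each script can be run on its own. A minimal sketch of the resulting shape, using DecisionTree as the example. This is simplified, not the exact file contents; indentation is lost in this view, but the bare make_tree(dataset) call suggests the driver block sits at the bottom of the class body, where that name is directly visible.

# Minimal sketch of the "callable scripts" pattern this commit applies.
# Simplified: the real class contains the full model-fitting code, and
# CsvHandler is the project's own CSV helper (used throughout the diffs).
from CsvHandler import CsvHandler

class DecisionTree:

    def make_tree(dataset):
        print('# fitting model')
        print('# ...')
        # ... fit and evaluate a decision tree on `dataset` ...

    #################################
    # driver block: executes when the class body runs, i.e. both for
    # `python DecisionTree.py` and on import
    print('# starting decision tree')
    print('# ...')
    file = 'classification_labelled_corrected.csv'
    dataset = CsvHandler.read_csv(file)
    make_tree(dataset)
    print('# ending decision tree')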

BagOfWords.py
View File

@@ -67,7 +67,7 @@ class BagOfWords:
         (rows: different articles, columns: different words in vocab)
         '''
         print('# BOW: calculating matrix')
-        print('#')
+        print('# ...')
         # create list of tuples
         vectors = []
         for i in range(len(series)):
@@ -101,7 +101,7 @@ class BagOfWords:
         input: dataframe of all articles, return value: list of words
         '''
         print('# BOW: making vocabulary of data set')
-        print('#')
+        print('# ...')
         vocab = set()
         for text in series:
             vocab |= set(BagOfWords.extract_words(text))
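For context, the two methods touched here build the vocabulary and the document-term matrix. A rough standalone sketch of what such a bag-of-words pass computes, with hypothetical helper names standing in for the class's own extract_words and matrix code (rows are articles, columns are vocabulary words):

# Hypothetical, self-contained bag-of-words sketch; not the project's code.
import re

def extract_words(text):
    # naive tokenizer standing in for BagOfWords.extract_words
    return re.findall(r'[a-z]+', text.lower())

def make_vocab(series):
    # union of all words over all articles, as in the method above
    vocab = set()
    for text in series:
        vocab |= set(extract_words(text))
    return sorted(vocab)

def make_matrix(series, vocab):
    # one row per article, one column per vocab word, values are counts
    index = {word: i for i, word in enumerate(vocab)}
    matrix = [[0] * len(vocab) for _ in series]
    for row, text in zip(matrix, series):
        for word in extract_words(text):
            row[index[word]] += 1
    return matrix

texts = ['merger announced today', 'no merger today']
print(make_matrix(texts, make_vocab(texts)))
# [[1, 1, 0, 1], [0, 1, 1, 1]]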

DecisionTree.py
View File

@@ -22,19 +22,9 @@ from sklearn.model_selection import StratifiedKFold
 class DecisionTree:
-    print('# starting program')
-    print('#')
-    file = 'classification_labelled_corrected.csv'
-    # read csv file
-    print('# reading dataset')
-    print('#')
-    dataset = CsvHandler.read_csv(file)
     def make_tree(dataset):
-        print('# starting decision tree')
-        print('#')
+        print('# fitting model')
+        print('# ...')
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
@@ -114,8 +104,19 @@ class DecisionTree:
         # format(min(f1_scores_train), max(f1_scores_train),
         # sum(f1_scores_train)/float(len(f1_scores_train))))
         # print()
-        print('# ending decision tree')
-        print('#')
-    DecisionTree.make_tree(dataset)
-    print('# ending program')
+    #################################
+    print('# starting decision tree')
+    print('# ...')
+    file = 'classification_labelled_corrected.csv'
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+    dataset = CsvHandler.read_csv(file)
+    make_tree(dataset)
+    print('# ending decision tree')

FilterKeywords.py
View File

@@ -6,6 +6,8 @@ FilterKeywords searches for merger specific keywords
 in an article and counts them.
 '''
+# TODO: change the dict!
 import re
 from nltk.stem.porter import PorterStemmer
@@ -64,6 +66,3 @@ class FilterKeywords:
         returns number of keywords that are found.
         '''
         return sum(dict_keywords.values())
-
-
-
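The docstring and the closing return sum(dict_keywords.values()) outline the approach: record per-keyword hit counts in a dict, then sum them. A hedged sketch of that idea follows; the keyword list and helper name are made up, and the real class also stems words with PorterStemmer:

# Hypothetical sketch of the counting scheme described above;
# KEYWORDS and count_keywords are illustrative names, not the class's.
import re

KEYWORDS = ['merger', 'acquisition', 'takeover']  # stand-in keyword list

def count_keywords(text):
    words = re.findall(r'[a-z]+', text.lower())
    dict_keywords = {kw: words.count(kw) for kw in KEYWORDS}
    # same aggregation as the method shown above
    return sum(dict_keywords.values())

print(count_keywords('Takeover talks: a merger may follow the merger wave.'))  # 3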

NaiveBayes.py
View File

@@ -13,7 +13,7 @@ regardless of any possible correlations between these features.
 '''
 from BagOfWords import BagOfWords
-from CsvReader import CsvReader
+from CsvHandler import CsvHandler
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
@@ -23,22 +23,12 @@ from sklearn.naive_bayes import GaussianNB
 class NaiveBayes:
-    print('# starting program')
-    print('#')
-    file = 'classification_labelled_corrected.csv'
-    # read csv file
-    print('# reading dataset')
-    print('#')
-    dataset = CsvHandler.read_csv(file)
     def make_naive_bayes(dataset):
         '''fits naive bayes model with StratifiedKFold,
         uses my BOW
         '''
-        print('# starting naive bayes')
-        print('#')
+        print('# fitting model')
+        print('# ...')
         # split data into text and label set
         # join title and text
@@ -132,9 +122,6 @@ class NaiveBayes:
         #sum(f1_scores_train)/float(len(f1_scores_train))))
         #print()
-        print('# ending naive bayes')
-        print('#')
     ######## only needed for resubstitution error ########
     def analyze_errors(dataset):
         '''calculates resubstitution error
@@ -172,5 +159,19 @@ class NaiveBayes:
         #print metrics
         print('F1 score: ', format(f1_score(y_train_test, predictions)))
+    #################################
+    print('# starting naive bayes')
+    print('# ...')
+    file = 'classification_labelled_corrected.csv'
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+    dataset = CsvHandler.read_csv(file)
+    make_naive_bayes(dataset)
-    print('#')
-    print('# ending program')
+    print('# ending naive bayes')

Requester.py
View File

@@ -28,6 +28,7 @@ class Requester:
         # print message
         print('# retrieving articles from webhose.io')
+        print('# ...')
         # personal API key
         webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
@@ -57,6 +58,7 @@ class Requester:
         num_downloads = int(sum_posts / 100)
         print('# collecting first {} articles'.format(num_downloads * 100))
         print('# sorting out other sources than reuters')
+        print('# ...')
         # two-dimensional list of all articles
         list_articles = []
@@ -91,3 +93,8 @@ class Requester:
             columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
         # save csv
         CsvHandler.write_csv(df, filestring)
+    print('# starting requester')
+    print('# ...')
+    save_articles_from_webhoseio()
+    print('# ending requester')
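Requester drives the webhose.io Python client: config(token=...) authenticates, a query returns posts in pages of 100, and the download loop walks num_downloads pages. A rough sketch of that paging flow follows; the query parameters are illustrative, and the project's real query, the reuters-only filtering, and the CSV writing are omitted:

# Rough sketch of the webhoseio paging flow used here; the query string
# is a placeholder, not the project's actual query.
import webhoseio

webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")  # personal API key
output = webhoseio.query("filterWebContent", {"q": "merger language:english"})

sum_posts = output['totalResults']
num_downloads = int(sum_posts / 100)   # the API serves 100 posts per page
for _ in range(num_downloads):
    for post in output['posts']:
        pass  # collect post fields (title, text, ...) into a list here
    output = webhoseio.get_next()      # fetch the next page of 100 posts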

SVM.py
View File

@@ -13,6 +13,7 @@ to belong to a category based on which side of the gap they fall.
 '''
 from BagOfWords import BagOfWords
+from CsvHandler import CsvHandler
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
@@ -26,8 +27,8 @@ class SVM:
     def make_svm(dataset):
-        print('# starting SVM')
-        print('#')
+        print('# fitting model')
+        print('# ...')
         # split data into text and label set
@@ -38,7 +39,7 @@ class SVM:
         # Bag of Words
         print('# calculating bag of words')
-        print('#')
+        print('# ...')
         # fit the training data and then return the matrix
         #X = BagOfWords.fit_transform(X)
         X = CountVectorizer().fit_transform(X).toarray()
@@ -59,7 +60,7 @@ class SVM:
             scoring=make_scorer(f1_score))
         print('# fit classifier')
-        print('#')
+        print('# ...')
         grid.fit(X,y)
@@ -83,5 +84,18 @@ class SVM:
         print(grid.best_params_)
         print()
-        print('# ending SVM')
-        print('#')
+    ########################
+    print('# starting svm')
+    print('# ...')
+    file = 'classification_labelled_corrected.csv'
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+    dataset = CsvHandler.read_csv(file)
+    make_svm(dataset)
+    print('# ending svm')
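The fragments visible above (scoring=make_scorer(f1_score), grid.fit(X,y), grid.best_params_) show the model-selection pattern: an exhaustive grid search scored by F1. A minimal self-contained sketch of that pattern follows; the parameter grid and the toy data are illustrative, since the project's real grid is not part of this diff:

# Minimal grid-search sketch matching the calls visible in make_svm;
# the param_grid and the synthetic data are illustrative only.
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
grid = GridSearchCV(SVC(),
                    param_grid={'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
                    scoring=make_scorer(f1_score))
grid.fit(X, y)
print(grid.best_params_)  # best hyper-parameter combination found
print(grid.best_score_)   # its cross-validated F1 score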

View File

@@ -13,15 +13,19 @@ from NaiveBayes import NaiveBayes
 from SVM import SVM
 print('# starting program')
-print('#')
+print('# ...')
 # only if new unlabeled(!) data set is required:
 # Requester.save_articles_from_webhoseio()
 file = 'classification_labelled_corrected.csv'
 # read csv file
 print('# reading dataset')
-print('#')
+print('# ...')
 dataset = CsvHandler.read_csv(file)
 # DecisionTree.make_tree(dataset)
 NaiveBayes.make_naive_bayes(dataset)
 # SVM.make_svm(dataset)
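One design note: because the new driver blocks run as part of the class body (or module), each script also executes its pipeline when it is merely imported, so the from NaiveBayes import NaiveBayes above triggers NaiveBayes' own driver before this program calls it again. The usual Python idiom for a callable-but-importable script is a main guard, sketched here as an alternative, not as part of this commit:

# Sketch of the standard main-guard idiom (not part of this commit):
# the pipeline runs only when the file is executed directly, not on import.
from CsvHandler import CsvHandler
from NaiveBayes import NaiveBayes

if __name__ == '__main__':
    print('# starting program')
    print('# ...')
    file = 'classification_labelled_corrected.csv'
    dataset = CsvHandler.read_csv(file)
    NaiveBayes.make_naive_bayes(dataset)
    print('# ending program')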