callable scripts

2018-09-17 21:16:19 +02:00 · 2018-09-17 21:16:19 +02:00 · f934b5a1a0
commit f934b5a1a0
parent ab578ae0c6
8 changed files with 131 additions and 105 deletions
--- a/BagOfWords.py
+++ b/BagOfWords.py
@@ -3,7 +3,7 @@ Bag Of Words
 ============

 BagOfWords counts word stems in an article
-and adds new words to the global vocabulary. 
+and adds new words to the global vocabulary.

 Anm.:
 The multinomial Naive Bayes classifier is suitable
@@ -67,7 +67,7 @@ class BagOfWords:
        (rows: different articles, colums: different words in vocab)
        '''
        print('# BOW: calculating matrix')
-        print('#')
+        print('# ...')
        # create list of tuples
        vectors = []
        for i in range(len(series)):
@@ -101,7 +101,7 @@ class BagOfWords:
        input: dataframe of all articles, return value: list of words
        '''
        print('# BOW: making vocabulary of data set')
-        print('#')
+        print('# ...')
        vocab = set()
        for text in series:
            vocab |= set(BagOfWords.extract_words(text))
--- a/DecisionTree.py
+++ b/DecisionTree.py
@@ -22,19 +22,9 @@ from sklearn.model_selection import StratifiedKFold

 class DecisionTree:

-    print('# starting program')
-    print('#')
-
-    file = 'classification_labelled_corrected.csv'
-
-    # read csv file
-    print('# reading dataset')
-    print('#')
-    dataset = CsvHandler.read_csv(file)
-
    def make_tree(dataset):
-        print('# starting decision tree')
-        print('#')
+        print('# fitting model')
+        print('# ...')

        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']
@@ -42,9 +32,9 @@ class DecisionTree:
        #count_vector = CountVectorizer()

        # use stratified k-fold cross-validation as split method
-        skf = StratifiedKFold(n_splits = 10, shuffle=True) 
+        skf = StratifiedKFold(n_splits = 10, shuffle=True)

-        # lists for metrics predicted on test/train set     
+        # lists for metrics predicted on test/train set
        f1_scores = []
        f1_scores_train = []

@@ -114,8 +104,19 @@ class DecisionTree:
                # format(min(f1_scores_train), max(f1_scores_train),
                # sum(f1_scores_train)/float(len(f1_scores_train))))
        # print()
-        print('# ending decision tree')
-        print('#')

-    DecisionTree.make_tree(dataset)
-    print('# ending program')
+    #################################
+    print('# starting decision tree')
+    print('# ...')
+
+    file = 'classification_labelled_corrected.csv'
+
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+
+    dataset = CsvHandler.read_csv(file)
+
+    make_tree(dataset)
+
+    print('# ending decision tree')
--- a/FilterKeywords.py
+++ b/FilterKeywords.py
@@ -2,68 +2,67 @@
 Filter Keywords
 ===============

-FilterKeywords searches for merger specific keywords 
+FilterKeywords searches for merger specific keywords
 in an article and counts them.
 '''

+# toDo: dict ändern!
+
 import re

 from nltk.stem.porter import PorterStemmer

 class FilterKeywords:
-          
+
    def search_keywords(dict_input):
        '''extracts relevant key-value pairs of in article's input dictionary,
        output are the contained keywords and their count.
-        ''' 
-        
+        '''
+
        # # list of regular expressions that match merger specific keywords
-        # regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?', 
-                      # r'business combinations?', r'combined compan(y|ies)', 
+        # regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',
+                      # r'business combinations?', r'combined compan(y|ies)',
                      # r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up',
                      # r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?',
-                      # r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?', 
-                      # r'purchase', r'(sell(s|ers?|ing)?|sold)']   
-                    
-        keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers', 
-                        'acquisition', 'acquire', 'acquisitions', 'acquires', 
-                        'combine', 'combines', 'combination', 'combined', 
-                        'joint', 'venture', 'JV', 'takeover', 'take-over', 
-                        'tie-up', 'deal', 'deals', 'transaction', 
-                        'transactions', 'approve', 'approves', 'approved', 
-                        'approving', 'approval', 'approvals', 'buy', 'buys', 
-                        'buying', 'bought', 'buyout', 'buy-out', 'purchase', 
+                      # r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?',
+                      # r'purchase', r'(sell(s|ers?|ing)?|sold)']
+
+        keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
+                        'acquisition', 'acquire', 'acquisitions', 'acquires',
+                        'combine', 'combines', 'combination', 'combined',
+                        'joint', 'venture', 'JV', 'takeover', 'take-over',
+                        'tie-up', 'deal', 'deals', 'transaction',
+                        'transactions', 'approve', 'approves', 'approved',
+                        'approving', 'approval', 'approvals', 'buy', 'buys',
+                        'buying', 'bought', 'buyout', 'buy-out', 'purchase',
                        'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']
-                        
+
        # reduce words to stem
        stemmer = PorterStemmer()
        for i in range(len(keyword_list)):
-            keyword_list[i] = stemmer.stem(keyword_list[i])       
-        
+            keyword_list[i] = stemmer.stem(keyword_list[i])
+
        # remove duplicates
        keywords = set(keyword_list)
-    
+
        # counts keywords in article
        dict_keywords = {}
-        
+
        # search for matchings in dictionary of input article
        for key in dict_input.keys():
            # iterate over all regular expressions
            for kword in keywords:
-                if re.match(kword, key):  
+                if re.match(kword, key):
                    # if match, increase value of matching key
                    if str(kword) in dict_keywords:
                        dict_keywords[str(kword)] += dict_input[key]
                    else:
                        dict_keywords[str(kword)] = dict_input[key]
-                        
+
        return dict_keywords
-        
+
    def count_keywords(dict_keywords):
        '''input: dict with article's keywords (key) and their count (value),
        returns number of keywords that are found.
        '''
-        return sum(dict_keywords.values())
-    
-    
-        
+        return sum(dict_keywords.values())
--- a/NER.py
+++ b/NER.py
@@ -3,10 +3,10 @@ Named Entity Recognition (NER)
 ==============================

 NER takes a text as input and searches for names of persons, companies
-and countries. 
+and countries.
 '''
 from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
-from nltk.tree import Tree 
+from nltk.tree import Tree

 ''' TODO: falsch klassifiert:
 [('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
@@ -16,7 +16,7 @@ from nltk.tree import Tree
 '''

 class NER:
-      
+
    def get_ne_with_label(text):
        labels = []
        names = []
@@ -32,29 +32,29 @@ class NER:
                    #print(chunk.label(), ' '.join(c[0] for c in chunk))
        return list(zip(labels, names))

-test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets 
-                \nmostly fell in light volumes on Tuesday as energy shares 
+test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
+                \nmostly fell in light volumes on Tuesday as energy shares
                tracked \nfalls in global oil prices, while weaknesses in banking shares
-                \namid concerns about loans to an ailing steel firm sent the Thai 
-                \nindex to a one-week closing low. \nBangkok's SET index shed nearly 
-                1 percent after four \nsessions of gains. The index closed at 1,379.32, 
+                \namid concerns about loans to an ailing steel firm sent the Thai
+                \nindex to a one-week closing low. \nBangkok's SET index shed nearly
+                1 percent after four \nsessions of gains. The index closed at 1,379.32,
                its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
-                the most actively \ntraded by turnover, dropped 2.8 percent to a near 
-                one-month low, \nreflecting potential impact of loans to Sahaviriya Steel 
-                \nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities 
-                downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure 
-                to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be 
-                lower than 130 percent, the \ndesired level we think and hence the need for 
-                more provisioning \nin the following quarters,\" the broker said in a report. 
-                \nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its 
-                creditors, dropped 1 percent. The steel firm \nand its three creditors 
-                agreed on Monday to consider options to \nrestructure debt worth over 
-                50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their 
-                slides for a third \nsession, Singapore gave up early gains and Indonesia 
-                \nhit a near one-week low, all with trading volumes below \nthe 30-day 
-                average ahead of a public holiday on Thursday. \nAmong top losers in the 
-                region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and 
-                Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell 
+                the most actively \ntraded by turnover, dropped 2.8 percent to a near
+                one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
+                \nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
+                downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
+                to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
+                lower than 130 percent, the \ndesired level we think and hence the need for
+                more provisioning \nin the following quarters,\" the broker said in a report.
+                \nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
+                creditors, dropped 1 percent. The steel firm \nand its three creditors
+                agreed on Monday to consider options to \nrestructure debt worth over
+                50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
+                slides for a third \nsession, Singapore gave up early gains and Indonesia
+                \nhit a near one-week low, all with trading volumes below \nthe 30-day
+                average ahead of a public holiday on Thursday. \nAmong top losers in the
+                region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
+                Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
                \namid uncertainty over global demand. \nFor Asian Companies click.'''

 print(NER.get_ne_with_label(test_article))
--- a/NaiveBayes.py
+++ b/NaiveBayes.py
@@ -1,6 +1,6 @@
 '''
 Naive Bayes Classifier
-====================== 
+======================

 Naive Bayes is a probabilistic classifier that is able to predict a
 probability distribution over a set of classes, rather than only
@@ -13,7 +13,7 @@ regardless of any possible correlations between these features.
 '''

 from BagOfWords import BagOfWords
-from CsvReader import CsvReader
+from CsvHandler import CsvHandler

 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
@@ -23,22 +23,12 @@ from sklearn.naive_bayes import GaussianNB

 class NaiveBayes:

-    print('# starting program')
-    print('#')
-
-    file = 'classification_labelled_corrected.csv'
-
-    # read csv file
-    print('# reading dataset')
-    print('#')
-    dataset = CsvHandler.read_csv(file)
-
    def make_naive_bayes(dataset):
        '''fits naive bayes model with StratifiedKFold,
        uses my BOW
        '''
-        print('# starting naive bayes')
-        print('#')
+        print('# fitting model')
+        print('# ...')

        # split data into text and label set
        # join title and text
@@ -120,7 +110,7 @@ class NaiveBayes:
                        max(recall_scores),
                        sum(recall_scores)/float(len(recall_scores))))
        print('F1 score: min = {}, max = {}, average = {}'
-                .format(min(f1_scores), 
+                .format(min(f1_scores),
                        max(f1_scores),
                        sum(f1_scores)/float(len(f1_scores))))
        print()
@@ -130,11 +120,8 @@ class NaiveBayes:
        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
        #format(min(f1_scores_train), max(f1_scores_train),
        #sum(f1_scores_train)/float(len(f1_scores_train))))
-        #print() 
+        #print()

-        print('# ending naive bayes')
-        print('#') 
-        
    ######## nur für resubstitutionsfehler benötigt ########
    def analyze_errors(dataset):
        '''calculates resubstitution error
@@ -143,7 +130,7 @@ class NaiveBayes:
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']
-        
+
        count_vector = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()
@@ -172,5 +159,19 @@ class NaiveBayes:
        #print metrics
        print('F1 score: ', format(f1_score(y_train_test, predictions)))

+    #################################
+    print('# starting naive bayes')
+    print('# ...')
+
+    file = 'classification_labelled_corrected.csv'
+
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+
+    dataset = CsvHandler.read_csv(file)
+
+    make_naive_bayes(dataset)
+
    print('#')
-    print('# ending program')
+    print('# ending naive bayes')
--- a/Requester.py
+++ b/Requester.py
@@ -28,7 +28,8 @@ class Requester:

        # print message
        print('# retrieving articles from webhose.io')
-    
+        print('# ...')
+
        # personal API key
        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

@@ -57,6 +58,7 @@ class Requester:
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
        print('# sorting out other sources than reuters')
+        print('# ...')

        # twodimensional list of all articles
        list_articles = []
@@ -90,4 +92,9 @@ class Requester:
        df = pd.DataFrame(data=list_articles,
                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
        # save csv
-        CsvHandler.write_csv(df, filestring)
+        CsvHandler.write_csv(df, filestring)
+
+    print('# starting requester')
+    print('# ...')
+    save_articles_from_webhoseio()
+    print('# ending requester')
--- a/SVM.py
+++ b/SVM.py
@@ -13,6 +13,7 @@ to belong to a category based on which side of the gap they fall.
 '''

 from BagOfWords import BagOfWords
+from CsvHandler import CsvHandler

 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
@@ -26,8 +27,8 @@ class SVM:

    def make_svm(dataset):

-        print('# starting SVM')
-        print('#')
+        print('# fitting model')
+        print('# ...')

        # split data into text and label set

@@ -38,7 +39,7 @@ class SVM:

        # Bag of Words
        print('# calculating bag of words')
-        print('#')
+        print('# ...')
        # fit the training data and then return the matrix
        #X = BagOfWords.fit_transform(X)
        X = CountVectorizer().fit_transform(X).toarray()
@@ -59,7 +60,7 @@ class SVM:
                            scoring=make_scorer(f1_score))

        print('# fit classifier')
-        print('#')
+        print('# ...')

        grid.fit(X,y)

@@ -83,5 +84,18 @@ class SVM:
        print(grid.best_params_)
        print()

-        print('# ending SVM')
-        print('#')
+    ########################
+    print('# starting svm')
+    print('# ...')
+
+    file = 'classification_labelled_corrected.csv'
+
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+
+    dataset = CsvHandler.read_csv(file)
+
+    make_svm(dataset)
+
+    print('# ending svm')
--- a/Starter.py
+++ b/Starter.py
@@ -13,15 +13,19 @@ from NaiveBayes import NaiveBayes
 from SVM import SVM

 print('# starting program')
-print('#')
+print('# ...')
+
+# only if new unlabeled(!) data set is required:
+# Requester.save_articles_from_webhoseio()

 file = 'classification_labelled_corrected.csv'

 # read csv file
 print('# reading dataset')
-print('#')
+print('# ...')
 dataset = CsvHandler.read_csv(file)

+# DecisionTree.make_tree(dataset)
 NaiveBayes.make_naive_bayes(dataset)
 # SVM.make_svm(dataset)