Something

2018-09-14 17:44:10 +01:00 · 2018-09-14 17:44:10 +01:00 · fd467620a0
commit fd467620a0
parent c2066d6adb
3 changed files with 71 additions and 61 deletions
--- a/BagOfWords.py
+++ b/BagOfWords.py
@@ -1,16 +1,18 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
 '''
 Bag Of Words
 ============

 BagOfWords counts word stems in an article
-and adds new words to the global vocabulary. 
+and adds new words to the global vocabulary.

 Anm.:
-The multinomial Naive Bayes classifier is suitable 
-for classification with discrete features (e.g., 
-word counts for text classification). 
-The multinomial distribution normally requires 
-integer feature counts. However, in practice, 
+The multinomial Naive Bayes classifier is suitable
+for classification with discrete features (e.g.,
+word counts for text classification).
+The multinomial distribution normally requires
+integer feature counts. However, in practice,
 fractional counts such as tf-idf may also work.
 => durch 'relative_word_frequencies' als Parameter berücksichtigt
 '''
@@ -32,14 +34,14 @@ class BagOfWords:
    def extract_words(text):
        '''takes article as argument, removes numbers,
        returns list of single words, recurrences included.
-        '''                   
+        '''
        stop_words = BagOfWords.set_stop_words()
        # replace punctuation marks with spaces
-        words = re.sub(r'\W', ' ', text)      
+        words = re.sub(r'\W', ' ', text)
        # split str into list of single words
-        words = words.split()        
+        words = words.split()
        # list of all words to return
-        words_cleaned = []        
+        words_cleaned = []
        for word in words:
            # remove numbers
            if word.isalpha():
@@ -50,18 +52,18 @@ class BagOfWords:
                    # add every word in lowercase
                    words_cleaned.append(word.lower())
        return words_cleaned
-        
+
    def reduce_word_to_stem(word):
        '''takes normal word as input, returns the word's stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
-        word = stemmer.stem(word)           
+        word = stemmer.stem(word)
        return word
-        
+
    def make_matrix(series, vocab, relative_word_frequencies=True):
        '''calculates word stem frequencies in input articles.
-        returns matrix (DataFrame) with relative word frequencies 
+        returns matrix (DataFrame) with relative word frequencies
        (0 <= values < 1) if relative_word_frequencies=True or absolute
        word frequencies (int) if relative_word_frequencies=False.
        (rows: different articles, columns: different words in vocab)
@@ -69,14 +71,14 @@ class BagOfWords:
        print('# BOW: calculating matrix')
        print('#')
        # create list of tuples
-        vectors = []       
+        vectors = []
        for i in range(len(series)):
            # extract text of single article
            text = series.iloc[i]
            # extract its words
            words = BagOfWords.extract_words(text)
-            # count words in single article 
-            word_count = len(words)     
+            # count words in single article
+            word_count = len(words)
            vector = []
            for i, v in enumerate(vocab):
                vector.append(0)
@@ -88,14 +90,14 @@ class BagOfWords:
                        else:
                            # absolute word frequency
                            vector[i] += 1
-                            
+
            # add single vector as tuple
-            vectors.append(tuple(vector))           
-        df_vectors = pd.DataFrame.from_records(vectors, 
-                                               index=None, 
-                                               columns=vocab)            
+            vectors.append(tuple(vector))
+        df_vectors = pd.DataFrame.from_records(vectors,
+                                               index=None,
+                                               columns=vocab)
        return df_vectors
-        
+
    def make_vocab(series):
        '''adds words of input articles to a global vocabulary.
        input: dataframe of all articles, return value: list of words
@@ -110,56 +112,56 @@ class BagOfWords:
        # sort list
        vocab.sort()
        return vocab
-        
+
    def set_stop_words():
        '''creates list of all words that will be ignored
-        '''   
+        '''
        # stopwords
-        stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 
+        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
-                      'aren\'t', 'as', 'at', 'be', 'because', 'been', 
-                      'before', 'being', 'below', 'between', 'both', 'but', 
-                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn', 
-                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing', 
-                      'don', 'don\'t', 'down', 'during', 'each', 'few', 
+                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
+                      'before', 'being', 'below', 'between', 'both', 'but',
+                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
+                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
+                      'don', 'don\'t', 'down', 'during', 'each', 'few',
                      'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
-                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t', 
+                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
                      'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
-                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 
+                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
-                      'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more', 
-                      'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn', 
+                      'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
+                      'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
                      'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
-                      'on', 'once', 'only', 'or', 'other', 'our', 'ours', 
-                      'ourselves', 'out', 'over', 'own', 're', 's', 'same', 
-                      'shan', 'shan\'t', 'she', 'she\'s', 'should', 
-                      'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some', 
+                      'on', 'once', 'only', 'or', 'other', 'our', 'ours',
+                      'ourselves', 'out', 'over', 'own', 're', 's', 'same',
+                      'shan', 'shan\'t', 'she', 'she\'s', 'should',
+                      'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
                      'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
-                      'theirs', 'them', 'themselves', 'then', 'there', 
-                      'these', 'they', 'this', 'those', 'through', 'to', 
-                      'too', 'under', 'until', 'up', 've', 'very', 'was', 
-                      'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t', 
-                      'what', 'when', 'where', 'which', 'while', 'who', 
-                      'whom', 'why', 'will', 'with', 'won', 'won\'t', 
-                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll', 
-                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 
-                      'yourselves']    
-               
-        ##=> ist das sinnvoll?:         
+                      'theirs', 'them', 'themselves', 'then', 'there',
+                      'these', 'they', 'this', 'those', 'through', 'to',
+                      'too', 'under', 'until', 'up', 've', 'very', 'was',
+                      'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
+                      'what', 'when', 'where', 'which', 'while', 'who',
+                      'whom', 'why', 'will', 'with', 'won', 'won\'t',
+                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
+                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
+                      'yourselves']
+
+        ##=> ist das sinnvoll?:
        #add specific words
-        #stop_words.extend(['reuters', 'also', 'monday', 'tuesday', 
-        #                   'wednesday', 'thursday', 'friday'])          
+        #stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
+        #                   'wednesday', 'thursday', 'friday'])
        #remove the word 'not' from stop words
-        #stop_words.remove('not')       
-        
+        #stop_words.remove('not')
+
        for i in range(len(stop_words)):
-        
+
            # remove punctuation marks and strip endings from abbreviations
            #stop_words[i] = re.split(r'\W', stop_words[i])[0]
-            
+
            # reduce word to stem
            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
        # transform list to set to eliminate duplicates
-        stop_words = set(stop_words)    
-        
-        return stop_words
+        stop_words = set(stop_words)
+
+        return stop_words
--- a/README.md
+++ b/README.md
@@ -1,3 +1,9 @@
 # thesis-anne

-my python classes for text mining, machine learning models, … 
+my python classes for text mining, machine learning models, … 
+
+# Requirements
+
+## Installation under (UBUNTU?)
+
+apt-get install XX
--- a/Starter.py
+++ b/Starter.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 '''
 Starter
 =============
@@ -29,4 +31,4 @@ dataset = CsvHandler.read_csv(file)
 NaiveBayes.make_naive_bayes(dataset)
 # SVM.make_svm(dataset)

-print('# ending program')
+print('# ending program')