thesis-anne/BagOfWords.py

'''
Bag Of Words
============

BagOfWords counts word stems in an article
and adds new words to the global vocabulary. 

Note:
The multinomial Naive Bayes classifier is suitable 
for classification with discrete features (e.g., 
word counts for text classification). 
The multinomial distribution normally requires 
integer feature counts. However, in practice, 
fractional counts such as tf-idf may also work.
=> accounted for via the 'relative_word_frequencies' parameter
'''

import re
from collections import Counter

import pandas as pd
from nltk.stem.porter import PorterStemmer

class BagOfWords:
    '''Bag-of-words vectorizer working on Porter word stems.

    All methods are static. They are normally called on the class itself,
    e.g. ``BagOfWords.fit_transform(series)``, but calling them on an
    instance works as well.
    '''

    # English stop words (before stemming).
    # Entries containing an apostrophe can never match a token produced by
    # extract_words(), because that method splits the text on every
    # non-word character; they are kept so the list stays complete.
    _STOP_WORDS = (
        'a', 'about', 'above', 'after', 'again', 'against',
        'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
        'aren\'t', 'as', 'at', 'be', 'because', 'been',
        'before', 'being', 'below', 'between', 'both', 'but',
        'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
        'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
        'don', 'don\'t', 'down', 'during', 'each', 'few',
        'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
        'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
        'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
        'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
        'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
        'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
        'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
        'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
        'on', 'once', 'only', 'or', 'other', 'our', 'ours',
        'ourselves', 'out', 'over', 'own', 're', 's', 'same',
        'shan', 'shan\'t', 'she', 'she\'s', 'should',
        'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
        'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
        'theirs', 'them', 'themselves', 'then', 'there',
        'these', 'they', 'this', 'those', 'through', 'to',
        'too', 'under', 'until', 'up', 've', 'very', 'was',
        'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
        'what', 'when', 'where', 'which', 'while', 'who',
        'whom', 'why', 'will', 'with', 'won', 'won\'t',
        'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
        'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
        'yourselves',
    )
    # TODO: open question carried over from the original code - is it
    # useful to also ignore corpus-specific words ('reuters', 'also',
    # 'monday', 'tuesday', 'wednesday', 'thursday', 'friday') and to take
    # 'not' out of the stop words? Neither is currently done.

    # Lazily filled caches. They are created on first use (not at class
    # definition time) so that merely importing the module does no work.
    _stemmer = None
    _stop_word_stems = None

    @staticmethod
    def fit_transform(X, relative_word_frequencies=True):
        '''Build the vocabulary of X and return its word-frequency matrix.

        Similar to CountVectorizer's fit_transform method.

        X: iterable of article texts (e.g. a pandas Series of str).
        relative_word_frequencies: passed through to make_matrix().
        Returns the DataFrame produced by make_matrix().
        '''
        vocab = BagOfWords.make_vocab(X)
        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)

    @staticmethod
    def extract_words(text):
        '''Return the list of lowercase word stems found in an article.

        Tokens that are not purely alphabetic (e.g. numbers) and stems
        that are stop words are dropped. Recurrences are kept, so the
        length of the result is the article's word count.
        '''
        stop_words = BagOfWords.set_stop_words()
        # replace every non-word character (punctuation marks etc.) with
        # a space, then split on whitespace
        tokens = re.sub(r'\W', ' ', text).split()
        words_cleaned = []
        for token in tokens:
            # remove numbers and any other token that is not alphabetic
            if not token.isalpha():
                continue
            # reduce to the stem first, then normalise to lowercase
            stem = BagOfWords.reduce_word_to_stem(token).lower()
            if stem not in stop_words:
                words_cleaned.append(stem)
        return words_cleaned

    @staticmethod
    def reduce_word_to_stem(word):
        '''Return the Porter stem of a single word.'''
        # one shared stemmer instead of a new PorterStemmer per word
        if BagOfWords._stemmer is None:
            BagOfWords._stemmer = PorterStemmer()
        return BagOfWords._stemmer.stem(word)

    @staticmethod
    def make_matrix(series, vocab, relative_word_frequencies=True):
        '''Calculate word stem frequencies of the input articles.

        series: iterable of article texts (e.g. a pandas Series of str).
        vocab: list of word stems; one matrix column per entry.
        relative_word_frequencies: if True, each count is divided by the
            number of words extracted from the article (0 <= value <= 1);
            if False, absolute counts (int) are returned.

        Returns a DataFrame (rows: articles in input order, columns: the
        entries of vocab). Stems that do not occur in an article are
        stored as the integer 0 in both modes.
        '''
        print('# BOW: calculating matrix')
        print('#')
        vectors = []
        for text in series:
            words = BagOfWords.extract_words(text)
            # total number of words in this article
            word_count = len(words)
            # count every stem once instead of rescanning the article
            # for each vocabulary entry
            counts = Counter(words)
            vector = []
            for term in vocab:
                hits = counts[term]
                if hits and relative_word_frequencies:
                    # hits > 0 implies word_count > 0, so this cannot
                    # divide by zero
                    vector.append(hits / word_count)
                else:
                    vector.append(hits)
            # add single vector as tuple
            vectors.append(tuple(vector))
        return pd.DataFrame.from_records(vectors,
                                         index=None,
                                         columns=vocab)

    @staticmethod
    def make_vocab(series):
        '''Collect the word stems of all input articles.

        series: iterable of article texts (e.g. a pandas Series of str).
        Returns the distinct stems as an alphabetically sorted list.
        '''
        print('# BOW: making vocabulary of data set')
        print('#')
        vocab = set()
        for text in series:
            vocab.update(BagOfWords.extract_words(text))
        return sorted(vocab)

    @staticmethod
    def set_stop_words():
        '''Return the set of stemmed stop words that will be ignored.

        The stop words are stemmed because extract_words() compares them
        against stemmed tokens. Stemming happens only on the first call;
        later calls return a fresh copy of the cached result, so callers
        may modify the returned set freely.
        '''
        if BagOfWords._stop_word_stems is None:
            # a set also eliminates duplicates created by stemming
            BagOfWords._stop_word_stems = frozenset(
                BagOfWords.reduce_word_to_stem(word)
                for word in BagOfWords._STOP_WORDS)
        return set(BagOfWords._stop_word_stems)
initial project version 2018-09-05 12:08:13 +00:00			`'''`
			`Bag Of Words`
			`============`

			`BagOfWords counts word stems in an article`
			`and adds new words to the global vocabulary.`
deleted .gitignore 2018-09-14 07:19:12 +00:00
			`Anm.:`
			`The multinomial Naive Bayes classifier is suitable`
			`for classification with discrete features (e.g.,`
			`word counts for text classification).`
			`The multinomial distribution normally requires`
			`integer feature counts. However, in practice,`
			`fractional counts such as tf-idf may also work.`
			`=> durch 'relative_word_frequencies' als Paramter berücksichtigt`
initial project version 2018-09-05 12:08:13 +00:00			`'''`

			`import re`

			`import pandas as pd`

			`from nltk.stem.porter import PorterStemmer`

added .gitignore file 2018-09-10 08:38:24 +00:00			`class BagOfWords:`
initial project version 2018-09-05 12:08:13 +00:00
SVM.py, NaiveBaies.py: built in grid-search, pipeline 2018-09-12 12:21:50 +00:00			`def fit_transform(X, relative_word_frequencies=True):`
			`''' similar to CountVectorizer's fit_transform method`
			`'''`
			`vocab = BagOfWords.make_vocab(X)`
			`return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)`

initial project version 2018-09-05 12:08:13 +00:00			`def extract_words(text):`
			`'''takes article as argument, removes numbers,`
			`returns list of single words, recurrences included.`
			`'''`
			`stop_words = BagOfWords.set_stop_words()`
			`# replace punctuation marks with spaces`
			`words = re.sub(r'\W', ' ', text)`
			`# split str into list of single words`
			`words = words.split()`
			`# list of all words to return`
			`words_cleaned = []`
			`for word in words:`
			`# remove numbers`
			`if word.isalpha():`
			`# reduce word to stem`
			`word = BagOfWords.reduce_word_to_stem(word)`
			`# check if not stop word`
			`if word.lower() not in stop_words:`
			`# add every word in lowercase`
			`words_cleaned.append(word.lower())`
			`return words_cleaned`

			`def reduce_word_to_stem(word):`
SVM.py, NaiveBaies.py: built in grid-search, pipeline 2018-09-12 12:21:50 +00:00			`'''takes normal word as input, returns the word's stem`
initial project version 2018-09-05 12:08:13 +00:00			`'''`
			`stemmer = PorterStemmer()`
			`# replace word by its stem`
			`word = stemmer.stem(word)`
			`return word`

deleted .gitignore 2018-09-14 07:19:12 +00:00			`def make_matrix(series, vocab, relative_word_frequencies=True):`
initial project version 2018-09-05 12:08:13 +00:00			`'''calculates word stem frequencies in input articles.`
added .gitignore file 2018-09-10 08:38:24 +00:00			`returns matrix (DataFrame) with relative word frequencies`
deleted .gitignore 2018-09-14 07:19:12 +00:00			`(0 <= values < 1) if relative_word_frequencies=True or absolute`
			`word frequencies (int) if relative_word_frequencies=False.`
initial project version 2018-09-05 12:08:13 +00:00			`(rows: different articles, colums: different words in vocab)`
			`'''`
deleted .gitignore 2018-09-14 07:19:12 +00:00			`print('# BOW: calculating matrix')`
			`print('#')`
initial project version 2018-09-05 12:08:13 +00:00			`# create list of tuples`
			`vectors = []`
			`for i in range(len(series)):`
			`# extract text of single article`
			`text = series.iloc[i]`
			`# extract its words`
			`words = BagOfWords.extract_words(text)`
			`# count words in single article`
			`word_count = len(words)`
			`vector = []`
			`for i, v in enumerate(vocab):`
			`vector.append(0)`
			`for w in words:`
			`if w == v:`
SVM.py, NaiveBaies.py: built in grid-search, pipeline 2018-09-12 12:21:50 +00:00			`if relative_word_frequencies:`
			`# relative word frequency`
			`vector[i] += 1/word_count`
			`else:`
			`# absolute word frequency`
			`vector[i] += 1`

initial project version 2018-09-05 12:08:13 +00:00			`# add single vector as tuple`
			`vectors.append(tuple(vector))`
added .gitignore file 2018-09-10 08:38:24 +00:00			`df_vectors = pd.DataFrame.from_records(vectors,`
			`index=None,`
			`columns=vocab)`
initial project version 2018-09-05 12:08:13 +00:00			`return df_vectors`

			`def make_vocab(series):`
			`'''adds words of input articles to a global vocabulary.`
			`input: dataframe of all articles, return value: list of words`
			`'''`
deleted .gitignore 2018-09-14 07:19:12 +00:00			`print('# BOW: making vocabulary of data set')`
			`print('#')`
initial project version 2018-09-05 12:08:13 +00:00			`vocab = set()`
			`for text in series:`
			`vocab \|= set(BagOfWords.extract_words(text))`
			`# transform to list`
			`vocab = list(vocab)`
			`# sort list`
			`vocab.sort()`
			`return vocab`

			`def set_stop_words():`
			`'''creates list of all words that will be ignored`
			`'''`
SVM.py, NaiveBaies.py: built in grid-search, pipeline 2018-09-12 12:21:50 +00:00			`# stopwords`
added .gitignore file 2018-09-10 08:38:24 +00:00			`stop_words = ['a', 'about', 'above', 'after', 'again', 'against',`
			`'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',`
			`'aren\'t', 'as', 'at', 'be', 'because', 'been',`
			`'before', 'being', 'below', 'between', 'both', 'but',`
			`'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',`
			`'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',`
			`'don', 'don\'t', 'down', 'during', 'each', 'few',`
			`'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',`
			`'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',`
			`'having', 'he', 'her', 'here', 'hers', 'herself', 'him',`
			`'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',`
			`'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',`
			`'ll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',`
			`'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',`
			`'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',`
			`'on', 'once', 'only', 'or', 'other', 'our', 'ours',`
			`'ourselves', 'out', 'over', 'own', 're', 's', 'same',`
			`'shan', 'shan\'t', 'she', 'she\'s', 'should',`
			`'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',`
			`'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',`
			`'theirs', 'them', 'themselves', 'then', 'there',`
			`'these', 'they', 'this', 'those', 'through', 'to',`
			`'too', 'under', 'until', 'up', 've', 'very', 'was',`
			`'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',`
			`'what', 'when', 'where', 'which', 'while', 'who',`
			`'whom', 'why', 'will', 'with', 'won', 'won\'t',`
			`'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',`
			`'you\'re', 'you\'ve', 'your', 'yours', 'yourself',`
			`'yourselves']`
SVM.py, NaiveBaies.py: built in grid-search, pipeline 2018-09-12 12:21:50 +00:00
			`##=> ist das sinnvoll?:`
			`#add specific words`
			`#stop_words.extend(['reuters', 'also', 'monday', 'tuesday',`
			`# 'wednesday', 'thursday', 'friday'])`
			`#remove the word 'not' from stop words`
added .gitignore file 2018-09-10 08:38:24 +00:00			`#stop_words.remove('not')`
initial project version 2018-09-05 12:08:13 +00:00
			`for i in range(len(stop_words)):`
added .gitignore file 2018-09-10 08:38:24 +00:00
initial project version 2018-09-05 12:08:13 +00:00			`# remove punctuation marks and strip endings from abbreviations`
			`#stop_words[i] = re.split(r'\W', stop_words[i])[0]`
added .gitignore file 2018-09-10 08:38:24 +00:00
initial project version 2018-09-05 12:08:13 +00:00			`# reduce word to stem`
			`stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])`
			`# transform list to set to eliminate duplicates`
			`stop_words = set(stop_words)`

			`return stop_words`