'''
Bag Of Words
============

BagOfWords counts word stems in an article
and adds new words to the global vocabulary.
'''

import re

import pandas as pd
from nltk.stem.porter import PorterStemmer


class BagOfWords:

    @staticmethod
    def extract_words(text):
        '''takes an article as argument, removes numbers,
        returns list of single word stems, recurrences included.
        '''
        stop_words = BagOfWords.set_stop_words()
        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split string into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            # ignore numbers and other non-alphabetic tokens
            if word.isalpha():
                # reduce word to its stem
                word = BagOfWords.reduce_word_to_stem(word)
                # keep the word only if it is not a stop word
                if word.lower() not in stop_words:
                    # add every word in lowercase
                    words_cleaned.append(word.lower())
        return words_cleaned

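    # Illustrative example for extract_words (added for clarity, not part of
    # the original code; exact stems come from NLTK's Porter stemmer): the text
    #   'Oil prices are rising in 2018, Reuters said.'
    # loses the number '2018', the stop words 'are' and 'in', and the added
    # project-specific stop word 'Reuters', leaving roughly
    #   ['oil', 'price', 'rise', 'said']
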
    @staticmethod
    def reduce_word_to_stem(word):
        '''takes a normal word as input, returns the word's stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
        word = stemmer.stem(word)
        return word

    @staticmethod
    def make_matrix(series, vocab):
        '''calculates word stem frequencies in the input articles.
        returns matrix (DataFrame) with relative word frequencies (0 <= values < 1)
        (rows: different articles, columns: different words in vocab)
        '''
        # create list of tuples
        vectors = []
        for i in range(len(series)):
            # extract text of a single article
            text = series.iloc[i]
            # extract its words
            words = BagOfWords.extract_words(text)
            # count words in the single article
            word_count = len(words)
            vector = []
            # use j as the vocab index so the article index i is not shadowed
            for j, v in enumerate(vocab):
                vector.append(0)
                for w in words:
                    if w == v:
                        # add relative word frequency
                        vector[j] += 1 / word_count
            # add single vector as tuple
            vectors.append(tuple(vector))
        df_vectors = pd.DataFrame.from_records(vectors, index=None, columns=vocab)
        return df_vectors

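    # Worked example for make_matrix above (added for clarity, not part of the
    # original code): if the stem 'price' occurs 3 times among the 50 stems
    # extracted from one article, the cell in that article's row and the
    # 'price' column holds 3 * (1/50) = 0.06.
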
    @staticmethod
    def make_vocab(series):
        '''adds words of the input articles to a global vocabulary.
        input: pandas Series of all articles, return value: sorted list of words
        '''
        vocab = set()
        for text in series:
            vocab |= set(BagOfWords.extract_words(text))
        # transform to list
        vocab = list(vocab)
        # sort list
        vocab.sort()
        return vocab

    @staticmethod
    def set_stop_words():
        '''creates a set of all words that will be ignored
        '''
        # standard stopwords from nltk.corpus stopwords('english')
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'ain',
                      'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'aren\'t',
                      'as', 'at', 'be', 'because', 'been', 'before', 'being',
                      'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
                      'couldn\'t', 'd', 'did', 'didn', 'didn\'t', 'do', 'does',
                      'doesn', 'doesn\'t', 'doing', 'don', 'don\'t', 'down',
                      'during', 'each', 'few', 'for', 'from', 'further', 'had',
                      'hadn', 'hadn\'t', 'has', 'hasn', 'hasn\'t', 'have', 'haven',
                      'haven\'t', 'having', 'he', 'her', 'here', 'hers', 'herself',
                      'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just', 'll',
                      'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more', 'most',
                      'mustn', 'mustn\'t', 'my', 'myself', 'needn', 'needn\'t',
                      'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once',
                      'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
                      'over', 'own', 're', 's', 'same', 'shan', 'shan\'t', 'she',
                      'she\'s', 'should', 'should\'ve', 'shouldn', 'shouldn\'t',
                      'so', 'some', 'such', 't', 'than', 'that', 'that\'ll', 'the',
                      'their', 'theirs', 'them', 'themselves', 'then', 'there',
                      'these', 'they', 'this', 'those', 'through', 'to', 'too',
                      'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'wasn\'t',
                      'we', 'were', 'weren', 'weren\'t', 'what', 'when', 'where',
                      'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
                      'won\'t', 'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves']

        # add specific words
        stop_words.extend(['reuters', 'also', 'monday', 'tuesday', 'wednesday',
                           'thursday', 'friday'])

        # remove the word 'not' from stop words
        stop_words.remove('not')

        for i in range(len(stop_words)):
            # remove punctuation marks and strip endings from abbreviations
            #stop_words[i] = re.split(r'\W', stop_words[i])[0]
            # reduce each stop word to its stem
            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
        # transform list to set to eliminate duplicates
        stop_words = set(stop_words)

        return stop_words
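
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative addition, not part of the original
# module). The two sample texts below are made up; the sketch only shows how
# the class methods fit together.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # a small pandas Series of articles stands in for the real corpus
    sample_articles = pd.Series([
        'Oil prices are rising again, analysts said on Monday.',
        'The merger was approved by the regulators in 2018.'])

    # build the global vocabulary from all articles ...
    vocab = BagOfWords.make_vocab(sample_articles)
    # ... and the matrix of relative word stem frequencies
    matrix = BagOfWords.make_matrix(sample_articles, vocab)

    print(vocab)
    print(matrix)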