#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Bag Of Words
============

BagOfWords counts word stems in an article
and adds new words to the global vocabulary.

Note:
The multinomial Naive Bayes classifier is suitable
for classification with discrete features (e.g.,
word counts for text classification).
The multinomial distribution normally requires
integer feature counts. However, in practice,
fractional counts such as tf-idf may also work.
=> controlled by the 'relative_word_frequencies' parameter
'''
import re
import pandas as pd
from nltk.stem.porter import PorterStemmer

class BagOfWords:
    @staticmethod
    def fit_transform(X, relative_word_frequencies=True):
        '''similar to CountVectorizer's fit_transform method
        '''
        vocab = BagOfWords.make_vocab(X)
        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)
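
    # Usage sketch (illustrative only; assumes `texts` is a pandas Series
    # of raw article strings, e.g. texts = df['Text']):
    #   X_bow = BagOfWords.fit_transform(texts)
    #   # -> DataFrame of shape (number of articles, size of vocabulary)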

    @staticmethod
    def extract_words(text):
        '''takes an article as argument, removes numbers,
        returns list of single words, recurrences included.
        '''
        stop_words = BagOfWords.set_stop_words()
        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split string into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            # leave out numbers
            if word.isalpha():
                # reduce word to stem
                word = BagOfWords.reduce_word_to_stem(word)
                # check if not stop word
                if word.lower() not in stop_words:
                    # add every word in lowercase
                    words_cleaned.append(word.lower())
        return words_cleaned

    @staticmethod
    def reduce_word_to_stem(word):
        '''takes a normal word as input, returns the word's stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
        word = stemmer.stem(word)
        return word
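
    # Example Porter stems: 'counts' -> 'count', 'worries' -> 'worri',
    # 'because' -> 'becaus'. Stems need not be dictionary words; they only
    # have to map inflected forms of a word to the same token.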

    @staticmethod
    def make_matrix(series, vocab, relative_word_frequencies=True):
        '''calculates word stem frequencies in input articles.
        returns matrix (DataFrame) with relative word frequencies
        (0 <= values <= 1) if relative_word_frequencies=True or absolute
        word frequencies (int) if relative_word_frequencies=False.
        (rows: different articles, columns: different words in vocab)
        '''
        print('# BOW: calculating matrix')
        print('# ...')
        # create list of tuples
        vectors = []
        for i in range(len(series)):
            # extract text of single article
            text = series.iloc[i]
            # extract its words
            words = BagOfWords.extract_words(text)
            # count words in single article
            word_count = len(words)
            vector = [0] * len(vocab)
            for j, v in enumerate(vocab):
                for w in words:
                    if w == v:
                        if relative_word_frequencies:
                            # relative word frequency
                            vector[j] += 1 / word_count
                        else:
                            # absolute word frequency
                            vector[j] += 1
            # add single vector as tuple
            vectors.append(tuple(vector))
        df_vectors = pd.DataFrame.from_records(vectors,
                                               index=None,
                                               columns=vocab)
        return df_vectors
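
    # Note: the nested loop above costs O(len(vocab) * len(words)) per
    # article. A sketch of an equivalent per-article computation (an
    # assumption-based alternative, not the original code) would count
    # once and look up each vocab word:
    #   from collections import Counter
    #   counts = Counter(words)
    #   vector = [counts[v] / word_count if relative_word_frequencies
    #             and word_count else counts[v] for v in vocab]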

    @staticmethod
    def make_vocab(series):
        '''adds words of input articles to a global vocabulary.
        input: series of all articles, return value: sorted list of words
        '''
        print('# BOW: making vocabulary of data set')
        print('# ...')
        vocab = set()
        # for every article's text
        for text in series:
            # add single article's words to total vocabulary
            vocab |= set(BagOfWords.extract_words(text))
        # transform to sorted list
        vocab = sorted(vocab)
        return vocab
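
    # e.g. make_vocab(pd.Series(['Stocks fell.', 'Stocks rose!'])) would
    # yield a sorted list of stems like ['fell', 'rose', 'stock'].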

    @staticmethod
    def set_stop_words():
        '''creates a set of all words that will be ignored
        '''
        # stopwords
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
                      'before', 'being', 'below', 'between', 'both', 'but',
                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
                      'don', 'don\'t', 'down', 'during', 'each', 'few',
                      'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
                      'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
                      'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
                      'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
                      'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
                      'on', 'once', 'only', 'or', 'other', 'our', 'ours',
                      'ourselves', 'out', 'over', 'own', 're', 's', 'same',
                      'shan', 'shan\'t', 'she', 'she\'s', 'should',
                      'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
                      'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
                      'theirs', 'them', 'themselves', 'then', 'there',
                      'these', 'they', 'this', 'those', 'through', 'to',
                      'too', 'under', 'until', 'up', 've', 'very', 'was',
                      'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
                      'what', 'when', 'where', 'which', 'while', 'who',
                      'whom', 'why', 'will', 'with', 'won', 'won\'t',
                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                      'yourselves']
        # add unwanted terms
        stop_words.extend(['reuters', 'bloomberg', 'cnn', 'economist'])
        # remove the word 'not' from stop words
        #stop_words.remove('not')
        for i in range(len(stop_words)):
            # remove punctuation marks and strip endings from abbreviations
            #stop_words[i] = re.split(r'\W', stop_words[i])[0]
            # reduce word to stem
            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
        # transform list to set to eliminate duplicates
        stop_words = set(stop_words)
        return stop_words
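
    # Stemming the stop words keeps them comparable to the stemmed article
    # words, e.g. 'because' -> 'becaus' and 'very' -> 'veri' on both sides.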

if __name__ == '__main__':
    test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
                   EU approval - sources. BRUSSELS (Reuters) - U.S. software
                   giant Microsoft (MSFT.O) is set to win unconditional EU
                   antitrust approval for its $7.5 billion purchase of
                   privately held coding website GitHub, two people familiar
                   with the matter said on Monday. Microsoft announced the
                   deal in June, its largest acquisition since it bought
                   LinkedIn for $26 billion in 2016. The GitHub deal is
                   expected to boost the U.S. software giant's cloud
                   computing business and challenge market leader Amazon
                   (AMZN.O). GitHub, the world's largest code host, has
                   more than 28 million developers using its platform. It
                   will become a part of Microsoft's Intelligent Cloud unit
                   once the acquisition is completed. Microsoft Chief
                   Executive Satya Nadella has tried to assuage users'
                   worries that GitHub might favor Microsoft products
                   over competitors after the deal, saying GitHub would
                   continue to be an open platform that works with all
                   public clouds. The European Commission, which is set to
                   decide on the deal by Oct. 19, did not respond to a
                   request for immediate comment. Microsoft declined to
                   comment. Reporting by Foo Yun Chee; editing by Jason
                   Neely'''
    print(BagOfWords.extract_words(test_article))
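
    # Illustrative follow-up (a hedged sketch, not part of the original
    # script): feed the BOW matrix into scikit-learn's MultinomialNB, as
    # the module docstring suggests. Assumes scikit-learn is installed;
    # the second article and both labels are made up for demonstration.
    from sklearn.naive_bayes import MultinomialNB
    texts = pd.Series([test_article,
                       'Stocks fell sharply on Monday amid trade worries.'])
    labels = [1, 0]  # hypothetical labels: 1 = merger news, 0 = other
    X_bow = BagOfWords.fit_transform(texts, relative_word_frequencies=False)
    clf = MultinomialNB()
    clf.fit(X_bow, labels)
    print(clf.predict(X_bow))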