#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Bag Of Words
============

BagOfWords counts word stems in an article and adds new words to the global
vocabulary. The multinomial Naive Bayes classifier is suitable for
classification with discrete features (e.g., word counts for text
classification). The multinomial distribution normally requires integer
feature counts; in practice, however, fractional counts such as tf-idf also
work. This is controlled by the 'rel_freq' (relative word frequencies)
parameter.
'''
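
# Illustrative usage (a sketch inferred from the method signatures below, not
# part of the original module): given an iterable of article strings,
#
#     matrix = BagOfWords.fit_transform(corpus, rel_freq=True, stemming=True)
#
# returns a pandas DataFrame with one row per article and one column per
# vocabulary stem; with rel_freq=False the cells hold absolute word counts
# instead of fractional frequencies.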

from collections import OrderedDict
import csv
import pickle
import re
import string

import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer


class BagOfWords:

    @staticmethod
    def fit_transform(corpus, rel_freq=True, stemming=True):
        '''similar to CountVectorizer's fit_transform method
        '''
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
                                        stemming)
        return matrix
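
    # For comparison (an illustrative sketch, not used by this module): with
    # rel_freq=False the result is conceptually close to scikit-learn's
    # CountVectorizer, e.g.
    #
    #     from sklearn.feature_extraction.text import CountVectorizer
    #     counts = CountVectorizer().fit_transform(corpus)
    #
    # except that CountVectorizer returns a sparse matrix and uses its own
    # tokenization (no stemming and no stop word removal by default).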

    @staticmethod
    def extract_words(text, stemming=True):
        '''takes an article as argument, removes numbers,
        returns list of single words, recurrences included.
        '''
        stemmer = PorterStemmer()
        stop_words = BagOfWords.set_stop_words(stemming)

        # ignore company names
        company_names_list = BagOfWords.load_company_names()
        for company in company_names_list:
            text = text.replace(company, '')

        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split str into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            word = word.lower()
            # keep only alphabetic words that are not stop words
            if word.isalpha() and word not in stop_words:
                if stemming:
                    # reduce word to its stem
                    word = stemmer.stem(word)
                # filter out mis-encoded characters
                word = word.replace('â', '').replace('œ', '')\
                           .replace('ã', '')
                words_cleaned.append(word)
        return words_cleaned

    @staticmethod
    def extract_all_words(corpus, stemming=True):
        '''param: all articles of corpus
        returns list of lists of all extracted words, one row per article
        '''
        extracted_words = []
        print('# BOW: extracting all words from articles...')
        print()
        for text in corpus:
            extracted_words.append(BagOfWords.extract_words(text, stemming))

        return extracted_words

    @staticmethod
    def make_matrix(extracted_words, vocab, rel_freq=True, stemming=True):
        '''calculates word stem frequencies in the input articles. returns a
        document term matrix (DataFrame) with relative word frequencies
        (0 <= values < 1) if rel_freq=True or absolute word frequencies
        if rel_freq=False.
        (rows: different articles, columns: different words in vocab)
        '''
        print('# BOW: calculating matrix...')
        print()

        # total number of words in the bag of words
        word_count = 0

        for word_list in extracted_words:
            word_count += len(word_list)

        # number of articles
        n_articles = len(extracted_words)
        # number of words in vocab
        l_vocab = len(vocab)

        # create zero-filled dataframe
        array = np.zeros(shape=(n_articles, l_vocab))
        df_matrix = pd.DataFrame(array, columns=vocab)

        print('# BOW: calculating frequencies...')
        print()

        # for every text in series
        for i in range(len(extracted_words)):

            # extract words of single article
            words = extracted_words[i]

            for v in vocab:
                # for every word in article
                for w in words:
                    # find right position
                    if w == v:
                        if rel_freq:
                            # relative word frequency, normalized by the
                            # total word count of the whole corpus
                            df_matrix.loc[i, v] += 1 / word_count
                        else:
                            # absolute word frequency
                            df_matrix.loc[i, v] += 1

        # size too large :-(
        # # save df_matrix object
        # with open('obj/' + 'document_term_matrix' + '.pkl', 'wb') as f:
        #     pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)

        return df_matrix

    @staticmethod
    def make_vocab(extracted_words, stemming=True):
        '''adds all words to a global vocabulary.
        input: list of lists of all extracted words, returns: list of words
        '''
        print('# BOW: making vocabulary of data set...')
        print()
        vocab = set()
        # for every article's text
        for e_list in extracted_words:
            for word in e_list:
                # add every single word to vocabulary
                vocab.add(word)
        print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
        print()
        # transform set to list
        return list(vocab)

    @staticmethod
    def load_company_names():
        '''returns the organization names stored in the pickled dict'''
        # load pickle object of organizations
        with open('../obj/dict_organizations.pkl', 'rb') as input_file:
            organizations = pickle.load(input_file)
        company_names = []
        for key in organizations.keys():
            company_names.append(key)
        return company_names

    @staticmethod
    def set_stop_words(stemming=True):
        '''creates list of all words that will be ignored:
        stop words, company names and other disturbing terms
        '''
        # stop words
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
                      'before', 'being', 'below', 'between', 'both', 'but',
                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
                      'don', 'don\'t', 'down', 'during', 'each', 'few',
                      'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
                      'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
                      'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
                      'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
                      'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
                      'on', 'once', 'only', 'or', 'other', 'our', 'ours',
                      'ourselves', 'out', 'over', 'own', 're', 's', 'same',
                      'shan', 'shan\'t', 'she', 'she\'s', 'should',
                      'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
                      'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
                      'theirs', 'them', 'themselves', 'then', 'there',
                      'these', 'they', 'this', 'those', 'through', 'to',
                      'too', 'under', 'until', 'up', 've', 'very', 'was',
                      'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
                      'what', 'when', 'where', 'which', 'while', 'who',
                      'whom', 'why', 'will', 'with', 'won', 'won\'t',
                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                      'yourselves']

        # add unwanted terms
        stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
                           'file', 'photo', 'min', 'read', 'staff', 'left',
                           'â', 'right', 'updated', 'minutes', 'brief',
                           'editing', 'reporting', 'ago', 'also', 'would',
                           'could', 'bit', 'ly', 'fy', 'economist', 'u',
                           'guardian'])

        # weekdays
        stop_words.extend(['monday', 'tuesday', 'wednesday', 'thursday',
                           'friday', 'saturday', 'sunday'])

        # month names and abbreviations
        stop_words.extend(['january', 'february', 'march', 'april', 'may',
                           'june', 'july', 'august', 'september', 'october',
                           'november', 'december', 'jan', 'feb', 'mar', 'apr',
                           'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov',
                           'dec'])

        if stemming:
            stemmer = PorterStemmer()
            for i in range(len(stop_words)):
                # reduce stop words to their stems
                stop_words[i] = stemmer.stem(stop_words[i])
        # transform list to set to eliminate duplicates
        return set(stop_words)
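
    # Note (an aside, not used by this module): NLTK also ships a comparable
    # English stop word list that could replace the hand-maintained one above,
    # e.g.
    #
    #     from nltk.corpus import stopwords
    #     stop_words = stopwords.words('english')
    #
    # (requires downloading the 'stopwords' corpus via nltk.download first).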

    @staticmethod
    def make_dict_common_words(df_matrix, n=200, rel_freq=True, stemming=True):
        '''params: DataFrame document term matrix of complete data set,
        number n of most common words.
        returns: dict of words with their count.
        '''
        print('# making dictionary of most common words...')
        print()

        # words below this rel_freq limit are not included
        # set limit
        limit = 0.0001
        if not rel_freq:
            limit = len(df_matrix) * 0.0001

        # word => count
        word_counts = {}

        # iterate over words
        for column in df_matrix:
            # count word mentions in total
            if df_matrix[column].sum() > limit:
                word_counts[column] = df_matrix[column].sum()

        # sort dict by value
        o_dict = OrderedDict(sorted(word_counts.items(), key=lambda t: t[1],
                                    reverse=True))
        print(o_dict)
        # return the n highest values as dict (word => count)
        n_dict = {}

        for i in range(n):
            # next highest score
            next_highest = o_dict.popitem(last=False)
            n_dict[next_highest[0]] = next_highest[1]

        # save n_dict object
        with open('../obj/' + 'dict_200_most_common_words' + '.pkl', 'wb') as f:
            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)

        return n_dict

    @staticmethod
    def count_features(texts, stemming=True):
        '''counts the total number of features (vocabulary size) in the
        textual corpus
        '''
        print('# BOW: counting all features in corpus...')
        print()
        # extract the words first, as make_vocab expects a list of word lists
        extracted_words = BagOfWords.extract_all_words(texts, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        return len(vocab)

    @staticmethod
    def count_all_words(texts):
        '''counts all words (including duplicates) in the corpus'''
        print('# counting all words in corpus...')
        print()
        total = 0
        for text in texts:
            total += len(text.split())
        return total

    @staticmethod
    def test():
        file = '..\\data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(file,
                                 delimiter='|',
                                 header=None,
                                 index_col=None,
                                 engine='python',
                                 usecols=[1, 2],
                                 # nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')

        corpus = df_dataset[1] + '. ' + df_dataset[2]
        stemming = True
        rel_freq = True
        # print(BagOfWords.count_features(corpus))
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        print(len(vocab))
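

# Optional entry point (an assumption, not part of the original module): run
# the smoke test above when the file is executed directly. Like test(), this
# relies on the project's data layout being present, i.e. the cleaned CSV data
# set and '../obj/dict_organizations.pkl' used by load_company_names().
if __name__ == '__main__':
    BagOfWords.test()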