#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
Bag Of Words
============

BagOfWords counts word stems in an article and adds new words to the
global vocabulary.

The multinomial Naive Bayes classifier is suitable for classification with
discrete features (e.g., word counts for text classification). The
multinomial distribution normally requires integer feature counts; in
practice, however, fractional counts such as tf-idf or relative word
frequencies may also work. This is controlled by the 'rel_freq'
(relative word frequencies) parameter.
'''

from collections import OrderedDict
import csv
import pickle
import re
import string

import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer


class BagOfWords:

    @staticmethod
    def fit_transform(corpus, rel_freq=True, stemming=True):
        '''similar to CountVectorizer's fit_transform method'''
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
                                        stemming)
        return matrix

    @staticmethod
    def extract_words(text, stemming=True):
        '''takes an article as argument, removes numbers and punctuation,
        returns a list of single words, recurrences included.
        '''
        stemmer = PorterStemmer()
        stop_words = BagOfWords.set_stop_words(stemming)
        # ignore company names
        company_names_list = BagOfWords.load_company_names()
        for company in company_names_list:
            text = text.replace(company, '')
        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split str into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            word = word.lower()
            # check if alphabetic and not a stop word
            if word.isalpha() and word not in stop_words:
                if stemming:
                    # reduce word to its stem
                    word = stemmer.stem(word)
                # filter out spam chars (mojibake artifacts)
                word = word.replace('â', '').replace('œ', '')\
                           .replace('ã', '')
                words_cleaned.append(word)
        return words_cleaned

    @staticmethod
    def extract_all_words(corpus, stemming=True):
        '''param: all articles of the corpus.
        returns a list of lists of all extracted words, one row per article.
        '''
        extracted_words = []
        print('# BOW: extracting all words from articles...')
        print()
        for text in corpus:
            extracted_words.append(BagOfWords.extract_words(text, stemming))
        return extracted_words
    @staticmethod
    def make_matrix(extracted_words, vocab, rel_freq=True, stemming=True):
        '''calculates word stem frequencies in the input articles.
        returns a document term matrix (DataFrame) with relative word
        frequencies (0 <= values < 1) if rel_freq=True, or absolute word
        frequencies (int) if rel_freq=False.
        (rows: different articles, columns: different words in vocab)
        '''
        print('# BOW: calculating matrix...')
        print()
        # total number of words in the bag of words
        word_count = 0
        for word_list in extracted_words:
            word_count += len(word_list)
        # number of articles
        n_articles = len(extracted_words)
        # number of words in vocab
        l_vocab = len(vocab)
        # create zero-filled dataframe
        array = np.zeros(shape=(n_articles, l_vocab))
        df_matrix = pd.DataFrame(array, columns=vocab)
        print('# BOW: calculating frequencies...')
        print()
        # for every text in the series
        for i in range(len(extracted_words)):
            # words of a single article
            words = extracted_words[i]
            for v in vocab:
                # for every word in the article
                for w in words:
                    # find the right position
                    if w == v:
                        if rel_freq:
                            # relative word frequency
                            df_matrix.loc[i, v] += 1 / word_count
                        else:
                            # absolute word frequency
                            df_matrix.loc[i, v] += 1

        # size too large :-(
        # # save df_matrix object
        # with open('obj/' + 'document_term_matrix' + '.pkl', 'wb') as f:
        #     pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)

        return df_matrix

    @staticmethod
    def make_vocab(extracted_words, stemming=True):
        '''adds all words to a global vocabulary.
        input: list of lists of all extracted words, returns: list of words
        '''
        print('# BOW: making vocabulary of data set...')
        print()
        vocab = set()
        # for every article's text
        for e_list in extracted_words:
            for word in e_list:
                # add every single word to the vocabulary
                vocab.add(word)
        print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
        print()
        # transform set to list
        return list(vocab)

    @staticmethod
    def load_company_names():
        '''loads the pickled dictionary of organizations and returns its
        keys (company names) as a list.
        '''
        with open('../obj/dict_organizations.pkl', 'rb') as infile:
            org_dict = pickle.load(infile)
        return list(org_dict.keys())

    @staticmethod
    def set_stop_words(stemming=True):
        '''creates a set of all words that will be ignored:
        stop words, company names and other disturbing terms
        '''
        # stop words
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
                      'before', 'being', 'below', 'between', 'both', 'but',
                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
                      'don', 'don\'t', 'down', 'during', 'each', 'few',
                      'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
                      'having', 'he', 'her', 'here', 'hers', 'herself',
                      'him', 'himself', 'his', 'how', 'i', 'if', 'in',
                      'into', 'is', 'isn', 'isn\'t', 'it', 'it\'s', 'its',
                      'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn',
                      'mightn\'t', 'more', 'most', 'mustn', 'mustn\'t', 'my',
                      'myself', 'needn', 'needn\'t', 'no', 'nor', 'not',
                      'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or',
                      'other', 'our', 'ours', 'ourselves', 'out', 'over',
                      'own', 're', 's', 'same', 'shan', 'shan\'t', 'she',
                      'she\'s', 'should', 'should\'ve', 'shouldn',
                      'shouldn\'t', 'so', 'some', 'such', 't', 'than',
                      'that', 'that\'ll', 'the', 'their', 'theirs', 'them',
                      'themselves', 'then', 'there', 'these', 'they', 'this',
                      'those', 'through', 'to', 'too', 'under', 'until',
                      'up', 've', 'very', 'was', 'wasn', 'wasn\'t', 'we',
                      'were', 'weren', 'weren\'t', 'what', 'when', 'where',
                      'which', 'while', 'who', 'whom', 'why', 'will', 'with',
                      'won', 'won\'t', 'wouldn', 'wouldn\'t', 'y', 'you',
                      'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your',
                      'yours', 'yourself', 'yourselves']
        # add unwanted terms (news agency boilerplate etc.)
        stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
                           'file', 'photo', 'min', 'read', 'staff', 'left',
                           'â', 'right', 'updated', 'minutes', 'brief',
                           'editing', 'reporting', 'ago', 'also', 'would',
                           'could', 'bit', 'ly', 'fy', 'economist', 'u',
                           'guardian'])
        # ignore weekdays and months
        stop_words.extend(['monday', 'tuesday', 'wednesday', 'thursday',
                           'friday', 'saturday', 'sunday'])
        stop_words.extend(['january', 'february', 'march', 'april', 'may',
                           'june', 'july', 'august', 'september', 'october',
                           'november', 'december', 'jan', 'feb', 'mar',
                           'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct',
                           'nov', 'dec'])

        if stemming:
            stemmer = PorterStemmer()
            for i in range(len(stop_words)):
                # reduce stop words to their stems
                stop_words[i] = stemmer.stem(stop_words[i])

        # transform list to set to eliminate duplicates
        return set(stop_words)

    @staticmethod
    def make_dict_common_words(df_matrix, n=200, rel_freq=True,
                               stemming=True):
        '''params: DataFrame document term matrix of the complete data set,
        n = number of most common words to return.
        returns: dict of the n most common words with their counts.
        '''
        print('# making dictionary of most common words...')
        print()
        # words whose total frequency is below this limit are not included
        limit = 0.0001
        if not rel_freq:
            limit = len(df_matrix) * 0.0001
        # word => count
        word_counts = {}
        # iterate over words
        for column in df_matrix:
            # count word mentions in total
            if df_matrix[column].sum() > limit:
                word_counts[column] = df_matrix[column].sum()
        # sort dict by value, highest count first
        o_dict = OrderedDict(sorted(word_counts.items(), key=lambda t: t[1],
                                    reverse=True))
        print(o_dict)
        # return the n highest values as dict (word => count)
        n_dict = {}
        for i in range(n):
            # next highest score
            next_highest = o_dict.popitem(last=False)
            n_dict[next_highest[0]] = next_highest[1]
        # save n_dict object
        with open('../obj/' + 'dict_200_most_common_words' + '.pkl',
                  'wb') as f:
            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
        return n_dict

    @staticmethod
    def count_features(texts, stemming=True):
        '''counts the total number of features in the textual corpus.
        note: make_vocab expects already extracted word lists, not raw texts.
        '''
        print('# BOW: counting all features in corpus...')
        print()
        vocab = BagOfWords.make_vocab(texts, stemming)
        return len(vocab)

    @staticmethod
    def count_all_words(texts):
        '''counts all (whitespace-separated) words in the given texts'''
        print('# counting all words in corpus...')
        print()
        total = 0
        for text in texts:
            total += len(text.split())
        return total

    @staticmethod
    def test():
        file = '..\\data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(file,
                                 delimiter='|',
                                 header=None,
                                 index_col=None,
                                 engine='python',
                                 usecols=[1, 2],
                                 # nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')
        # concatenate title and text of each article
        corpus = df_dataset[1] + '. ' + df_dataset[2]
        stemming = True
        rel_freq = True
        # print(BagOfWords.count_features(corpus))
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        print(len(vocab))
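
# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original module): a
# minimal example of how fit_transform might be called on a small in-memory
# corpus. It assumes NLTK is installed and that '../obj/dict_organizations.pkl'
# exists, since extract_words() filters out the company names stored there.
if __name__ == '__main__':
    sample_corpus = ['Shares rose sharply after the merger was announced.',
                     'The deal is expected to close in the second quarter.']
    dtm = BagOfWords.fit_transform(sample_corpus, rel_freq=True,
                                   stemming=True)
    # one row per article, one column per word stem in the vocabulary
    print(dtm.shape)
    print(dtm.head())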