thesis-anne/src/BagOfWords.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Bag Of Words
============

BagOfWords counts word stems in an article and adds new words to the global
vocabulary. The multinomial Naive Bayes classifier is suitable for
classification with discrete features (e.g., word counts for text
classification) and normally requires integer feature counts. In practice,
however, fractional counts such as tf-idf may also work; this is supported
here via the 'rel_freq' (relative word frequencies) parameter.
'''
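
# Usage sketch (illustrative; assumes NLTK is installed and the pickled
# organization dict exists under ../obj/):
#
#   corpus = ['Shares rallied after the merger.', 'Profits fell sharply.']
#   matrix = BagOfWords.fit_transform(corpus, rel_freq=True, stemming=True)
#   # -> pandas DataFrame: one row per article, one column per word stem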
from collections import OrderedDict
import csv
import pickle
import re
import string
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
class BagOfWords:
    @staticmethod
    def fit_transform(corpus, rel_freq=True, stemming=True):
''' similar to CountVectorizer's fit_transform method
'''
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
vocab = BagOfWords.make_vocab(extracted_words, stemming)
matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
stemming)
return matrix
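
    # For comparison: with rel_freq=False the result roughly corresponds to
    # what sklearn's CountVectorizer would produce, apart from this class's
    # custom stop word list, company name removal and stemming:
    #
    #   from sklearn.feature_extraction.text import CountVectorizer
    #   X = CountVectorizer().fit_transform(corpus)  # sparse count matrix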
    @staticmethod
    def extract_words(text, stemming=True):
'''takes article as argument, removes numbers,
returns list of single words, recurrences included.
'''
stemmer = PorterStemmer()
stop_words = BagOfWords.set_stop_words(stemming)
# ignore company names
company_names_list = BagOfWords.load_company_names()
for company in company_names_list:
text = text.replace(company, '')
        # replace non-word characters (punctuation etc.) with spaces
words = re.sub(r'\W', ' ', text)
# split str into list of single words
words = words.split()
# list of all words to return
words_cleaned = []
        for word in words:
            word = word.lower()
            # keep only purely alphabetic words (this drops numbers)
            if not word.isalpha():
                continue
            if stemming:
                # reduce word to its stem
                word = stemmer.stem(word)
            # compare against the (equally stemmed) stop word list
            if word in stop_words:
                continue
            # filter out mojibake characters left over from broken
            # UTF-8 decoding (e.g. 'â€œ' from smart quotes)
            word = word.replace('â', '').replace('œ', '')\
                       .replace('ã', '')
            words_cleaned.append(word)
        return words_cleaned
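
    # Example (illustrative; assumes the organizations pickle is present):
    #
    #   BagOfWords.extract_words('Profits rose 5% in Europe.')
    #   -> ['profit', 'rose', 'europ']
    #
    # '5' fails isalpha(), 'in' is a stop word, and the Porter stemmer
    # reduces 'profits' to 'profit' and 'europe' to 'europ'.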
    @staticmethod
    def extract_all_words(corpus, stemming=True):
'''param: all articles of corpus
returns list of lists of all extracted words, one row per article
'''
extracted_words = []
print('# BOW: extracting all words from articles...')
print()
for text in corpus:
extracted_words.append(BagOfWords.extract_words(text, stemming))
return extracted_words
    @staticmethod
    def make_matrix(extracted_words, vocab, rel_freq=True, stemming=True):
        '''calculates word stem frequencies in input articles. returns
        document term matrix (DataFrame) with relative word frequencies
        (0 <= values < 1) if rel_freq=True or absolute word frequencies
        (int) if rel_freq=False.
        (rows: different articles, columns: different words in vocab)
        '''
print('# BOW: calculating matrix...')
print()
# total number of words in bag of words
word_count = 0
        for word_list in extracted_words:
            word_count += len(word_list)
# number of articles
n_articles = len(extracted_words)
# number of words in vocab
l_vocab = len(vocab)
# create zero-filled dataframe
array = np.zeros(shape=(n_articles, l_vocab))
df_matrix = pd.DataFrame(array, columns=vocab)
print('# BOW: calculating frequencies...')
print()
        # membership test against a set is much faster than against a list
        vocab_set = set(vocab)
        # for every text in series
        for i in range(len(extracted_words)):
            # extract words of single article
            words = extracted_words[i]
            for w in words:
                # skip words that are not part of the vocabulary
                if w not in vocab_set:
                    continue
                if rel_freq:
                    # relative word frequency (w.r.t. the whole corpus)
                    df_matrix.loc[i, w] += 1 / word_count
                else:
                    # absolute word frequency
                    df_matrix.loc[i, w] += 1
        return df_matrix
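
    # Worked example (illustrative): for extracted_words
    # [['buy', 'stock'], ['stock']] and vocab ['buy', 'stock'],
    # word_count is 3, so with rel_freq=True the matrix is
    #
    #        buy   stock
    #   0    1/3     1/3
    #   1      0     1/3
    #
    # and with rel_freq=False it holds the absolute counts instead.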
    @staticmethod
    def make_vocab(extracted_words, stemming=True):
'''adds all words to a global vocabulary.
input: list of lists of all extracted words, returns: list of words
'''
print('# BOW: making vocabulary of data set...')
print()
vocab = set()
# for every article's text
for e_list in extracted_words:
for word in e_list:
# add every single word to vocabulary
vocab.add(word)
print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
print()
# transform set to list
return list(vocab)
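
    # Example: [['buy', 'stock'], ['stock', 'sell']] yields the three
    # features 'buy', 'stock' and 'sell'; their order is arbitrary,
    # because the words pass through a set before becoming a list.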
    @staticmethod
    def load_company_names():
        # load pickle object of organizations
        with open('../obj/dict_organizations.pkl', 'rb') as f:
            dict_organizations = pickle.load(f)
        # the keys are the company names
        return list(dict_organizations.keys())
    @staticmethod
    def set_stop_words(stemming=True):
'''creates list of all words that will be ignored:
stopwords, company names and other disturbing terms
'''
# stopwords
stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
'aren\'t', 'as', 'at', 'be', 'because', 'been',
'before', 'being', 'below', 'between', 'both', 'but',
'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
'don', 'don\'t', 'down', 'during', 'each', 'few',
'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
'on', 'once', 'only', 'or', 'other', 'our', 'ours',
'ourselves', 'out', 'over', 'own', 're', 's', 'same',
'shan', 'shan\'t', 'she', 'she\'s', 'should',
'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
'theirs', 'them', 'themselves', 'then', 'there',
'these', 'they', 'this', 'those', 'through', 'to',
'too', 'under', 'until', 'up', 've', 'very', 'was',
'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
'what', 'when', 'where', 'which', 'while', 'who',
'whom', 'why', 'will', 'with', 'won', 'won\'t',
'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
'yourselves']
        # add unwanted terms (news sources, byline vocabulary etc.)
stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
'file', 'photo', 'min', 'read', 'staff', 'left', 'â',
'right', 'updated', 'minutes', 'brief', 'editing',
'reporting', 'ago', 'also', 'would', 'could',
'bit', 'ly', 'fy', 'economist', 'u', 'guardian'])
stop_words.extend(['monday', 'tuesday', 'wednesday', 'thursday', 'friday',
'saturday', 'sunday'])
stop_words.extend(['january', 'february', 'march', 'april', 'may',
'june', 'july', 'august', 'september', 'october',
'november', 'december', 'jan', 'feb', 'mar', 'apr',
'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov',
'dec'])
if stemming:
stemmer = PorterStemmer()
for i in range(len(stop_words)):
# reduce stop words to stem
stop_words[i] = stemmer.stem(stop_words[i])
# transform list to set to eliminate duplicates
return set(stop_words)
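
    # Stemming keeps the stop words comparable with the stemmed article
    # words: e.g. the Porter stemmer maps 'having' -> 'have' and
    # 'once' -> 'onc', matching what extract_words sees after stemming.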
    @staticmethod
    def make_dict_common_words(df_matrix, n=200, rel_freq=True, stemming=True):
'''params: DataFrame document term matrix of complete data set,
number of n most common words.
returns: dict of words with their count.
'''
print('# making dictionary of most common words...')
print()
# words under that rel_freq limit are not included
# set limit
limit = 0.0001
if not rel_freq:
limit = len(df_matrix) * 0.0001
        # word => count
        word_counts = {}
        # iterate over words
        for column in df_matrix:
            # count word mentions in total
            if df_matrix[column].sum() > limit:
                word_counts[column] = df_matrix[column].sum()
        # sort dict by value
        o_dict = OrderedDict(sorted(word_counts.items(), key=lambda t: t[1],
                                    reverse=True))
        print(o_dict)
        # return n highest values as dict (word => count)
        n_dict = {}
        for i in range(n):
            # next highest score
            next_highest = o_dict.popitem(last=False)
            n_dict[next_highest[0]] = next_highest[1]
        # save n_dict object (note: the file name is fixed regardless of n)
        with open('../obj/dict_200_most_common_words.pkl', 'wb') as f:
            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
        return n_dict
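
    # Example call (illustrative), writing the top 200 stems of the full
    # document term matrix to ../obj/dict_200_most_common_words.pkl:
    #
    #   matrix = BagOfWords.fit_transform(corpus, rel_freq=True)
    #   top = BagOfWords.make_dict_common_words(matrix, n=200, rel_freq=True)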
    @staticmethod
    def count_features(texts, stemming=True):
        '''counts the total number of features (vocabulary size) in a
        textual corpus
        '''
        print('# BOW: counting all features in corpus...')
        print()
        # make_vocab expects lists of extracted words, not raw texts
        extracted_words = BagOfWords.extract_all_words(texts, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        return len(vocab)
    @staticmethod
    def count_all_words(texts):
        '''counts the total number of word tokens in a list of texts'''
        print('# counting all words in corpus...')
        print()
        total = 0
        for text in texts:
            total += len(text.split())
        return total
    @staticmethod
    def test():
file = '..\\data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(file,
delimiter='|',
header=None,
index_col=None,
engine='python',
usecols=[1,2],
#nrows=100,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
corpus = df_dataset[1] + '. ' + df_dataset[2]
stemming = True
rel_freq = True
#print(BagOfWords.count_features(corpus))
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
vocab = BagOfWords.make_vocab(extracted_words, stemming)
print(len(vocab))
if __name__ == '__main__':
stemmer = PorterStemmer()
text = 'German Economy Minister Peter Altmaier said on Tuesday that he did not favor getting ministerial approval for deals such as the proposal to merge Siemens and Alstoms rail businesses to better compete in Europe and abroad.'
    # replace non-word characters (punctuation etc.) with spaces
words = re.sub(r'\W', ' ', text)
# split str into list of single words
words = words.split()
# list of all words to return
words_cleaned = []
for word in words:
word = word.lower()
        # check if alphabetic (stop word filtering is skipped in this demo)
        if word.isalpha():
# reduce word to its stem
word = stemmer.stem(word)
# filter out spam chars
word = word.replace('â', '').replace('œ', '')\
.replace('ã', '')
words_cleaned.append(word)
print(words_cleaned)
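
    # The full pipeline, including stop word and company name filtering,
    # is what BagOfWords.extract_words(text) applies to every article.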