thesis-anne/src/BagOfWords.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Bag Of Words
============

BagOfWords counts word stems in an article and adds new words to the global
vocabulary. The multinomial Naive Bayes classifier is suitable for
classification with discrete features (e.g., word counts for text
classification) and normally requires integer feature counts. In practice,
however, fractional counts such as tf-idf may also work; this is supported
here via the 'rel_freq' (relative word frequencies) parameter.
'''
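
# Usage sketch (illustrative; assumes NLTK is installed and the pickled
# organization dict exists under ../obj/):
#
#   corpus = ['Shares rallied after the merger.', 'Profits fell sharply.']
#   matrix = BagOfWords.fit_transform(corpus, rel_freq=True, stemming=True)
#   # -> pandas DataFrame: one row per article, one column per word stem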
from collections import OrderedDict
import csv
import pickle
import re
import string
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
class BagOfWords:
    @staticmethod
    def fit_transform(corpus, rel_freq=True, stemming=True):
''' similar to CountVectorizer's fit_transform method
'''
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
vocab = BagOfWords.make_vocab(extracted_words, stemming)
matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
stemming)
return matrix
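
    # For comparison: with rel_freq=False the result roughly corresponds to
    # what sklearn's CountVectorizer would produce, apart from this class's
    # custom stop word list, company name removal and stemming:
    #
    #   from sklearn.feature_extraction.text import CountVectorizer
    #   X = CountVectorizer().fit_transform(corpus)  # sparse count matrix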
    @staticmethod
    def extract_words(text, stemming=True):
'''takes article as argument, removes numbers,
returns list of single words, recurrences included.
'''
stemmer = PorterStemmer()
stop_words = BagOfWords.set_stop_words(stemming)
# ignore company names
company_names_list = BagOfWords.load_company_names()
for company in company_names_list:
text = text.replace(company, '')
        # replace non-word characters (punctuation etc.) with spaces
words = re.sub(r'\W', ' ', text)
# split str into list of single words
words = words.split()
# list of all words to return
words_cleaned = []
        for word in words:
            word = word.lower()
            # keep only purely alphabetic words (this drops numbers)
            if not word.isalpha():
                continue
            if stemming:
                # reduce word to its stem
                word = stemmer.stem(word)
            # compare against the (equally stemmed) stop word list
            if word in stop_words:
                continue
            # filter out mojibake characters left over from broken
            # UTF-8 decoding (e.g. 'â€œ' from smart quotes)
            word = word.replace('â', '').replace('œ', '')\
                       .replace('ã', '')
            words_cleaned.append(word)
        return words_cleaned
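
    # Example (illustrative; assumes the organizations pickle is present):
    #
    #   BagOfWords.extract_words('Profits rose 5% in Europe.')
    #   -> ['profit', 'rose', 'europ']
    #
    # '5' fails isalpha(), 'in' is a stop word, and the Porter stemmer
    # reduces 'profits' to 'profit' and 'europe' to 'europ'.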
    @staticmethod
    def extract_all_words(corpus, stemming=True):
'''param: all articles of corpus
returns list of lists of all extracted words, one row per article
'''
extracted_words = []
print('# BOW: extracting all words from articles...')
print()
for text in corpus:
extracted_words.append(BagOfWords.extract_words(text, stemming))
return extracted_words
    @staticmethod
    def make_matrix(extracted_words, vocab, rel_freq=True, stemming=True):
        '''calculates word stem frequencies in input articles. returns
        document term matrix (DataFrame) with relative word frequencies
        (0 <= values < 1) if rel_freq=True or absolute word frequencies
        (int) if rel_freq=False.
        (rows: different articles, columns: different words in vocab)
        '''
print('# BOW: calculating matrix...')
print()
# total number of words in bag of words
word_count = 0
        for word_list in extracted_words:
            word_count += len(word_list)
# number of articles
n_articles = len(extracted_words)
# number of words in vocab
l_vocab = len(vocab)
# create zero-filled dataframe
array = np.zeros(shape=(n_articles, l_vocab))
df_matrix = pd.DataFrame(array, columns=vocab)
print('# BOW: calculating frequencies...')
print()
        # membership test against a set is much faster than against a list
        vocab_set = set(vocab)
        # for every text in series
        for i in range(len(extracted_words)):
            # extract words of single article
            words = extracted_words[i]
            for w in words:
                # skip words that are not part of the vocabulary
                if w not in vocab_set:
                    continue
                if rel_freq:
                    # relative word frequency (w.r.t. the whole corpus)
                    df_matrix.loc[i, w] += 1 / word_count
                else:
                    # absolute word frequency
                    df_matrix.loc[i, w] += 1
        return df_matrix
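
    # Worked example (illustrative): for extracted_words
    # [['buy', 'stock'], ['stock']] and vocab ['buy', 'stock'],
    # word_count is 3, so with rel_freq=True the matrix is
    #
    #        buy   stock
    #   0    1/3     1/3
    #   1      0     1/3
    #
    # and with rel_freq=False it holds the absolute counts instead.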
    @staticmethod
    def make_vocab(extracted_words, stemming=True):
'''adds all words to a global vocabulary.
input: list of lists of all extracted words, returns: list of words
'''
print('# BOW: making vocabulary of data set...')
print()
vocab = set()
# for every article's text
for e_list in extracted_words:
for word in e_list:
# add every single word to vocabulary
vocab.add(word)
print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
print()
# transform set to list
return list(vocab)
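
    # Example: [['buy', 'stock'], ['stock', 'sell']] yields the three
    # features 'buy', 'stock' and 'sell'; their order is arbitrary,
    # because the words pass through a set before becoming a list.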
    @staticmethod
    def load_company_names():
        # load pickle object of organizations
        with open('../obj/dict_organizations.pkl', 'rb') as f:
            dict_organizations = pickle.load(f)
        # the keys are the company names
        return list(dict_organizations.keys())
    @staticmethod
    def set_stop_words(stemming=True):
'''creates list of all words that will be ignored:
stopwords, company names and other disturbing terms
'''
# stopwords
stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
'aren\'t', 'as', 'at', 'be', 'because', 'been',
'before', 'being', 'below', 'between', 'both', 'but',
'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
'don', 'don\'t', 'down', 'during', 'each', 'few',
'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
'on', 'once', 'only', 'or', 'other', 'our', 'ours',
'ourselves', 'out', 'over', 'own', 're', 's', 'same',
'shan', 'shan\'t', 'she', 'she\'s', 'should',
'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
'theirs', 'them', 'themselves', 'then', 'there',
'these', 'they', 'this', 'those', 'through', 'to',
'too', 'under', 'until', 'up', 've', 'very', 'was',
'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
'what', 'when', 'where', 'which', 'while', 'who',
'whom', 'why', 'will', 'with', 'won', 'won\'t',
'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
'yourselves']
        # add unwanted terms (news sources, byline vocabulary etc.)
stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
'file', 'photo', 'min', 'read', 'staff', 'left', 'â',
'right', 'updated', 'minutes', 'brief', 'editing',
'reporting', 'ago', 'also', 'would', 'could',
'bit', 'ly', 'fy', 'economist', 'u', 'guardian'])
stop_words.extend(['monday', 'tuesday', 'wednesday', 'thursday', 'friday',
'saturday', 'sunday'])
stop_words.extend(['january', 'february', 'march', 'april', 'may',
'june', 'july', 'august', 'september', 'october',
'november', 'december', 'jan', 'feb', 'mar', 'apr',
'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov',
'dec'])
if stemming:
stemmer = PorterStemmer()
for i in range(len(stop_words)):
# reduce stop words to stem
stop_words[i] = stemmer.stem(stop_words[i])
# transform list to set to eliminate duplicates
return set(stop_words)
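
    # Stemming keeps the stop words comparable with the stemmed article
    # words: e.g. the Porter stemmer maps 'having' -> 'have' and
    # 'once' -> 'onc', matching what extract_words sees after stemming.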
    @staticmethod
    def make_dict_common_words(df_matrix, n=200, rel_freq=True, stemming=True):
'''params: DataFrame document term matrix of complete data set,
number of n most common words.
returns: dict of words with their count.
'''
print('# making dictionary of most common words...')
print()
# words under that rel_freq limit are not included
# set limit
limit = 0.0001
if not rel_freq:
limit = len(df_matrix) * 0.0001
        # word => count
        word_counts = {}
        # iterate over words
        for column in df_matrix:
            # count word mentions in total
            if df_matrix[column].sum() > limit:
                word_counts[column] = df_matrix[column].sum()
        # sort dict by value
        o_dict = OrderedDict(sorted(word_counts.items(), key=lambda t: t[1],
                                    reverse=True))
        print(o_dict)
        # return n highest values as dict (word => count)
        n_dict = {}
        for i in range(n):
            # next highest score
            next_highest = o_dict.popitem(last=False)
            n_dict[next_highest[0]] = next_highest[1]
        # save n_dict object (note: the file name is fixed regardless of n)
        with open('../obj/dict_200_most_common_words.pkl', 'wb') as f:
            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
        return n_dict
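
    # Example call (illustrative), writing the top 200 stems of the full
    # document term matrix to ../obj/dict_200_most_common_words.pkl:
    #
    #   matrix = BagOfWords.fit_transform(corpus, rel_freq=True)
    #   top = BagOfWords.make_dict_common_words(matrix, n=200, rel_freq=True)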
    @staticmethod
    def count_features(texts, stemming=True):
        '''counts the total number of features (vocabulary size) in a
        textual corpus
        '''
        print('# BOW: counting all features in corpus...')
        print()
        # make_vocab expects lists of extracted words, not raw texts
        extracted_words = BagOfWords.extract_all_words(texts, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        return len(vocab)
    @staticmethod
    def count_all_words(texts):
        '''counts the total number of word tokens in a list of texts'''
        print('# counting all words in corpus...')
        print()
        total = 0
        for text in texts:
            total += len(text.split())
        return total
    @staticmethod
    def test():
file = '..\\data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(file,
delimiter='|',
header=None,
index_col=None,
engine='python',
usecols=[1,2],
#nrows=100,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
corpus = df_dataset[1] + '. ' + df_dataset[2]
stemming = True
rel_freq = True
#print(BagOfWords.count_features(corpus))
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
vocab = BagOfWords.make_vocab(extracted_words, stemming)
print(len(vocab))
if __name__ == '__main__':
stemmer = PorterStemmer()
text = 'German Economy Minister Peter Altmaier said on Tuesday that he did not favor getting ministerial approval for deals such as the proposal to merge Siemens and Alstoms rail businesses to better compete in Europe and abroad.'
    # replace non-word characters (punctuation etc.) with spaces
words = re.sub(r'\W', ' ', text)
# split str into list of single words
words = words.split()
# list of all words to return
words_cleaned = []
for word in words:
word = word.lower()
        # check if alphabetic (stop word filtering is skipped in this demo)
        if word.isalpha():
# reduce word to its stem
word = stemmer.stem(word)
# filter out spam chars
word = word.replace('â', '').replace('œ', '')\
.replace('ã', '')
words_cleaned.append(word)
print(words_cleaned)
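
    # The full pipeline, including stop word and company name filtering,
    # is what BagOfWords.extract_words(text) applies to every article.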