#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Bag Of Words
============

BagOfWords counts word stems in an article and adds new words to the global
vocabulary. The multinomial Naive Bayes classifier is suitable for
classification with discrete features (e.g., word counts for text
classification). The multinomial distribution normally requires integer
feature counts; in practice, however, fractional counts such as tf-idf also
work. This is controlled by the 'rel_freq' (relative word frequencies)
parameter.
'''
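
# Illustrative usage (a sketch inferred from the method signatures below, not
# part of the original module): given an iterable of article strings,
#
#     matrix = BagOfWords.fit_transform(corpus, rel_freq=True, stemming=True)
#
# returns a pandas DataFrame with one row per article and one column per
# vocabulary stem; with rel_freq=False the cells hold absolute word counts
# instead of fractional frequencies.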

from collections import OrderedDict
import csv
import pickle
import re
import string

import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer


class BagOfWords:

    @staticmethod
    def fit_transform(corpus, rel_freq=True, stemming=True):
        '''similar to CountVectorizer's fit_transform method
        '''
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
                                        stemming)
        return matrix
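
    # For comparison (an illustrative sketch, not used by this module): with
    # rel_freq=False the result is conceptually close to scikit-learn's
    # CountVectorizer, e.g.
    #
    #     from sklearn.feature_extraction.text import CountVectorizer
    #     counts = CountVectorizer().fit_transform(corpus)
    #
    # except that CountVectorizer returns a sparse matrix and uses its own
    # tokenization (no stemming and no stop word removal by default).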

    @staticmethod
    def extract_words(text, stemming=True):
        '''takes an article as argument, removes numbers,
        returns list of single words, recurrences included.
        '''
        stemmer = PorterStemmer()
        stop_words = BagOfWords.set_stop_words(stemming)

        # ignore company names
        company_names_list = BagOfWords.load_company_names()
        for company in company_names_list:
            text = text.replace(company, '')

        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split str into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            word = word.lower()
            # keep only alphabetic words that are not stop words
            if word.isalpha() and word not in stop_words:
                if stemming:
                    # reduce word to its stem
                    word = stemmer.stem(word)
                # filter out mis-encoded characters
                word = word.replace('â', '').replace('œ', '')\
                           .replace('ã', '')
                words_cleaned.append(word)
        return words_cleaned

    @staticmethod
    def extract_all_words(corpus, stemming=True):
        '''param: all articles of corpus
        returns list of lists of all extracted words, one row per article
        '''
        extracted_words = []
        print('# BOW: extracting all words from articles...')
        print()
        for text in corpus:
            extracted_words.append(BagOfWords.extract_words(text, stemming))

        return extracted_words

    @staticmethod
    def make_matrix(extracted_words, vocab, rel_freq=True, stemming=True):
        '''calculates word stem frequencies in the input articles. returns a
        document term matrix (DataFrame) with relative word frequencies
        (0 <= values < 1) if rel_freq=True or absolute word frequencies
        if rel_freq=False.
        (rows: different articles, columns: different words in vocab)
        '''
        print('# BOW: calculating matrix...')
        print()

        # total number of words in the bag of words
        word_count = 0

        for word_list in extracted_words:
            word_count += len(word_list)

        # number of articles
        n_articles = len(extracted_words)
        # number of words in vocab
        l_vocab = len(vocab)

        # create zero-filled dataframe
        array = np.zeros(shape=(n_articles, l_vocab))
        df_matrix = pd.DataFrame(array, columns=vocab)

        print('# BOW: calculating frequencies...')
        print()

        # for every text in series
        for i in range(len(extracted_words)):

            # extract words of single article
            words = extracted_words[i]

            for v in vocab:
                # for every word in article
                for w in words:
                    # find right position
                    if w == v:
                        if rel_freq:
                            # relative word frequency, normalized by the
                            # total word count of the whole corpus
                            df_matrix.loc[i, v] += 1 / word_count
                        else:
                            # absolute word frequency
                            df_matrix.loc[i, v] += 1

        # size too large :-(
        # # save df_matrix object
        # with open('obj/' + 'document_term_matrix' + '.pkl', 'wb') as f:
        #     pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)

        return df_matrix

    @staticmethod
    def make_vocab(extracted_words, stemming=True):
        '''adds all words to a global vocabulary.
        input: list of lists of all extracted words, returns: list of words
        '''
        print('# BOW: making vocabulary of data set...')
        print()
        vocab = set()
        # for every article's text
        for e_list in extracted_words:
            for word in e_list:
                # add every single word to vocabulary
                vocab.add(word)
        print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
        print()
        # transform set to list
        return list(vocab)

    @staticmethod
    def load_company_names():
        '''returns the organization names stored in the pickled dict'''
        # load pickle object of organizations
        with open('../obj/dict_organizations.pkl', 'rb') as input_file:
            organizations = pickle.load(input_file)
        company_names = []
        for key in organizations.keys():
            company_names.append(key)
        return company_names

    @staticmethod
    def set_stop_words(stemming=True):
        '''creates list of all words that will be ignored:
        stop words, company names and other disturbing terms
        '''
        # stop words
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
                      'before', 'being', 'below', 'between', 'both', 'but',
                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
                      'don', 'don\'t', 'down', 'during', 'each', 'few',
                      'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
                      'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
                      'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
                      'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
                      'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
                      'on', 'once', 'only', 'or', 'other', 'our', 'ours',
                      'ourselves', 'out', 'over', 'own', 're', 's', 'same',
                      'shan', 'shan\'t', 'she', 'she\'s', 'should',
                      'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
                      'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
                      'theirs', 'them', 'themselves', 'then', 'there',
                      'these', 'they', 'this', 'those', 'through', 'to',
                      'too', 'under', 'until', 'up', 've', 'very', 'was',
                      'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
                      'what', 'when', 'where', 'which', 'while', 'who',
                      'whom', 'why', 'will', 'with', 'won', 'won\'t',
                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                      'yourselves']

        # add unwanted terms
        stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
                           'file', 'photo', 'min', 'read', 'staff', 'left',
                           'â', 'right', 'updated', 'minutes', 'brief',
                           'editing', 'reporting', 'ago', 'also', 'would',
                           'could', 'bit', 'ly', 'fy', 'economist', 'u',
                           'guardian'])

        # weekdays
        stop_words.extend(['monday', 'tuesday', 'wednesday', 'thursday',
                           'friday', 'saturday', 'sunday'])

        # month names and abbreviations
        stop_words.extend(['january', 'february', 'march', 'april', 'may',
                           'june', 'july', 'august', 'september', 'october',
                           'november', 'december', 'jan', 'feb', 'mar', 'apr',
                           'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov',
                           'dec'])

        if stemming:
            stemmer = PorterStemmer()
            for i in range(len(stop_words)):
                # reduce stop words to their stems
                stop_words[i] = stemmer.stem(stop_words[i])
        # transform list to set to eliminate duplicates
        return set(stop_words)
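
    # Note (an aside, not used by this module): NLTK also ships a comparable
    # English stop word list that could replace the hand-maintained one above,
    # e.g.
    #
    #     from nltk.corpus import stopwords
    #     stop_words = stopwords.words('english')
    #
    # (requires downloading the 'stopwords' corpus via nltk.download first).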

    @staticmethod
    def make_dict_common_words(df_matrix, n=200, rel_freq=True, stemming=True):
        '''params: DataFrame document term matrix of complete data set,
        number n of most common words.
        returns: dict of words with their count.
        '''
        print('# making dictionary of most common words...')
        print()

        # words below this rel_freq limit are not included
        # set limit
        limit = 0.0001
        if not rel_freq:
            limit = len(df_matrix) * 0.0001

        # word => count
        word_counts = {}

        # iterate over words
        for column in df_matrix:
            # count word mentions in total
            if df_matrix[column].sum() > limit:
                word_counts[column] = df_matrix[column].sum()

        # sort dict by value
        o_dict = OrderedDict(sorted(word_counts.items(), key=lambda t: t[1],
                                    reverse=True))
        print(o_dict)
        # return the n highest values as dict (word => count)
        n_dict = {}

        for i in range(n):
            # next highest score
            next_highest = o_dict.popitem(last=False)
            n_dict[next_highest[0]] = next_highest[1]

        # save n_dict object
        with open('../obj/' + 'dict_200_most_common_words' + '.pkl', 'wb') as f:
            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)

        return n_dict

    @staticmethod
    def count_features(texts, stemming=True):
        '''counts the total number of features (vocabulary size) in the
        textual corpus
        '''
        print('# BOW: counting all features in corpus...')
        print()
        # extract the words first, as make_vocab expects a list of word lists
        extracted_words = BagOfWords.extract_all_words(texts, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        return len(vocab)

    @staticmethod
    def count_all_words(texts):
        '''counts all words (including duplicates) in the corpus'''
        print('# counting all words in corpus...')
        print()
        total = 0
        for text in texts:
            total += len(text.split())
        return total

    @staticmethod
    def test():
        file = '..\\data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(file,
                                 delimiter='|',
                                 header=None,
                                 index_col=None,
                                 engine='python',
                                 usecols=[1, 2],
                                 # nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')

        corpus = df_dataset[1] + '. ' + df_dataset[2]
        stemming = True
        rel_freq = True
        # print(BagOfWords.count_features(corpus))
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        print(len(vocab))
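

# Optional entry point (an assumption, not part of the original module): run
# the smoke test above when the file is executed directly. Like test(), this
# relies on the project's data layout being present, i.e. the cleaned CSV data
# set and '../obj/dict_organizations.pkl' used by load_company_names().
if __name__ == '__main__':
    BagOfWords.test()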