#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
Bag Of Words
============

BagOfWords counts word stems in an article and adds new words to the global
vocabulary. The multinomial Naive Bayes classifier is suitable for
classification with discrete features (e.g., word counts for text
classification). The multinomial distribution normally requires integer
feature counts; in practice, however, fractional counts such as tf-idf may
also work. This is supported via the 'rel_freq' parameter (relative word
frequencies).
'''
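
# Usage sketch (not part of the original module): the docstring mentions the
# multinomial Naive Bayes classifier, so the DataFrame returned by
# BagOfWords.fit_transform could be passed to scikit-learn roughly like this,
# assuming a corpus of article strings and a hypothetical 'labels' Series
# aligned with it:
#
#   from sklearn.naive_bayes import MultinomialNB
#   X = BagOfWords.fit_transform(corpus, rel_freq=True, stemming=True)
#   classifier = MultinomialNB().fit(X, labels)
#   predictions = classifier.predict(X)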

from collections import OrderedDict
import csv
import pickle
import re
import string

import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer


class BagOfWords:

    @staticmethod
    def fit_transform(corpus, rel_freq=True, stemming=True):
        ''' similar to CountVectorizer's fit_transform method
        '''
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
                                        stemming)
        return matrix
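
    # For comparison only (a sketch, not used anywhere in this module):
    # scikit-learn's CountVectorizer builds a comparable absolute-count
    # document-term matrix in a single call, e.g.
    #   from sklearn.feature_extraction.text import CountVectorizer
    #   X = CountVectorizer().fit_transform(corpus)
    # fit_transform above differs in that it applies Porter stemming, drops
    # company names and uses the custom stop word list from set_stop_words.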

    @staticmethod
    def extract_words(text, stemming=True):
        '''takes article as argument, removes numbers,
        returns list of single words, recurrences included.
        '''
        stemmer = PorterStemmer()
        stop_words = BagOfWords.set_stop_words(stemming)

        # ignore company names
        company_names_list = BagOfWords.load_company_names()
        for company in company_names_list:
            text = text.replace(company, '')

        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split str into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            word = word.lower()
            # check if alphabetic and not stop word
            if (word.isalpha() and word not in stop_words):
                if stemming:
                    # reduce word to its stem
                    word = stemmer.stem(word)
                # filter out spam chars
                word = word.replace('â', '').replace('œ', '')\
                           .replace('ã', '')
                words_cleaned.append(word)
        return words_cleaned

    @staticmethod
    def extract_all_words(corpus, stemming=True):
        '''param: all articles of corpus
        returns list of lists of all extracted words, one row per article
        '''
        extracted_words = []
        print('# BOW: extracting all words from articles...')
        print()
        for text in corpus:
            extracted_words.append(BagOfWords.extract_words(text, stemming))

        return extracted_words

    @staticmethod
    def make_matrix(extracted_words, vocab, rel_freq=True, stemming=True):
        '''calculates word stem frequencies in input articles.
        returns document term matrix (DataFrame) with relative word
        frequencies (0 <= values < 1) if rel_freq=True or absolute
        word frequencies (int) if rel_freq=False.
        (rows: different articles, columns: different words in vocab)
        '''
        print('# BOW: calculating matrix...')
        print()

        # total number of words in bag of words
        word_count = 0
        for word_list in extracted_words:
            word_count += len(word_list)

        # number of articles
        n_articles = len(extracted_words)
        # number of words in vocab
        l_vocab = len(vocab)

        # create zero-filled dataframe
        array = np.zeros(shape=(n_articles, l_vocab))
        df_matrix = pd.DataFrame(array, columns=vocab)

        print('# BOW: calculating frequencies...')
        print()

        # for every article in extracted_words
        for i in range(len(extracted_words)):

            # extract words of single article
            words = extracted_words[i]

            for v in vocab:
                # for every word in article
                for w in words:
                    # find right position
                    if w == v:
                        if rel_freq:
                            # relative word frequency
                            # (relative to the corpus' total word count)
                            df_matrix.loc[i, v] += 1 / word_count
                        else:
                            # absolute word frequency
                            df_matrix.loc[i, v] += 1
        return df_matrix
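
    # Toy illustration of make_matrix (not executed; the stem lists are made
    # up): with extracted_words = [['bank', 'rate'], ['bank', 'bank']] and
    # vocab = ['bank', 'rate'], word_count is 4, so the returned DataFrame is
    #
    #   rel_freq=True            rel_freq=False
    #      bank  rate               bank  rate
    #   0  0.25  0.25            0   1.0   1.0
    #   1  0.50  0.00            1   2.0   0.0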

    @staticmethod
    def make_vocab(extracted_words, stemming=True):
        '''adds all words to a global vocabulary.
        input: list of lists of all extracted words, returns: list of words
        '''
        print('# BOW: making vocabulary of data set...')
        print()
        vocab = set()
        # for every article's text
        for e_list in extracted_words:
            for word in e_list:
                # add every single word to vocabulary
                vocab.add(word)
        print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
        print()
        # transform set to list
        return list(vocab)

    @staticmethod
    def load_company_names():
        # load pickle object of organizations
        with open('../obj/dict_organizations.pkl', 'rb') as input_file:
            organizations = pickle.load(input_file)
        company_names = []
        for key in organizations.keys():
            company_names.append(key)
        return company_names

    @staticmethod
    def set_stop_words(stemming=True):
        '''creates list of all words that will be ignored:
        stopwords, company names and other disturbing terms
        '''
        # stopwords
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
                      'before', 'being', 'below', 'between', 'both', 'but',
                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
                      'don', 'don\'t', 'down', 'during', 'each', 'few',
                      'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
                      'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
                      'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
                      'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
                      'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
                      'on', 'once', 'only', 'or', 'other', 'our', 'ours',
                      'ourselves', 'out', 'over', 'own', 're', 's', 'same',
                      'shan', 'shan\'t', 'she', 'she\'s', 'should',
                      'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
                      'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
                      'theirs', 'them', 'themselves', 'then', 'there',
                      'these', 'they', 'this', 'those', 'through', 'to',
                      'too', 'under', 'until', 'up', 've', 'very', 'was',
                      'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
                      'what', 'when', 'where', 'which', 'while', 'who',
                      'whom', 'why', 'will', 'with', 'won', 'won\'t',
                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                      'yourselves']

        # add unwanted terms
        stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
                           'file', 'photo', 'min', 'read', 'staff', 'left',
                           'â', 'right', 'updated', 'minutes', 'brief',
                           'editing', 'reporting', 'ago', 'also', 'would',
                           'could', 'bit', 'ly', 'fy', 'economist', 'u',
                           'guardian'])

        stop_words.extend(['monday', 'tuesday', 'wednesday', 'thursday',
                           'friday', 'saturday', 'sunday'])

        stop_words.extend(['january', 'february', 'march', 'april', 'may',
                           'june', 'july', 'august', 'september', 'october',
                           'november', 'december', 'jan', 'feb', 'mar', 'apr',
                           'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov',
                           'dec'])

        if stemming:
            stemmer = PorterStemmer()
            for i in range(len(stop_words)):
                # reduce stop words to stem
                stop_words[i] = stemmer.stem(stop_words[i])
        # transform list to set to eliminate duplicates
        return set(stop_words)

    @staticmethod
    def make_dict_common_words(df_matrix, n=200, rel_freq=True,
                               stemming=True):
        '''params: DataFrame document term matrix of complete data set,
        number of n most common words.
        returns: dict of words with their count.
        '''
        print('# making dictionary of most common words...')
        print()

        # words under that rel_freq limit are not included
        # set limit
        limit = 0.0001
        if not rel_freq:
            limit = len(df_matrix) * 0.0001

        # word => count
        word_counts = {}

        # iterate over words
        for column in df_matrix:
            # count word mentions in total
            if (df_matrix[column].sum() > limit):
                word_counts[column] = df_matrix[column].sum()

        # sort dict by value
        o_dict = OrderedDict(sorted(word_counts.items(), key=lambda t: t[1],
                                    reverse=True))
        print(o_dict)
        # return n highest values as dict (word => count)
        n_dict = {}

        for i in range(n):
            # next highest score
            next_highest = o_dict.popitem(last=False)
            n_dict[next_highest[0]] = next_highest[1]

        # save n_dict object
        with open('../obj/' + 'dict_200_most_common_words' + '.pkl', 'wb') as f:
            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)

        return n_dict
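
    # Example of the limit heuristic above (illustrative numbers only): with
    # rel_freq=False and a data set of 10,000 articles, the limit becomes
    # 10,000 * 0.0001 = 1.0, i.e. a word needs more than one absolute mention
    # in the whole corpus to be considered for the most-common-words dict.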

    @staticmethod
    def count_features(texts, stemming=True):
        ''' count total number of features in textual corpus
        '''
        print('# BOW: counting all features in corpus...')
        print()
        # texts is assumed to be the raw corpus, so the words are extracted
        # first (make_vocab expects lists of extracted words per article)
        extracted_words = BagOfWords.extract_all_words(texts, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        return len(vocab)

    @staticmethod
    def count_all_words(texts):
        print('# counting all words in corpus...')
        print()
        total = 0
        for text in texts:
            total += len(text.split())
        return total

    @staticmethod
    def test():
        file = '..\\data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(file,
                                 delimiter='|',
                                 header=None,
                                 index_col=None,
                                 engine='python',
                                 usecols=[1, 2],
                                 #nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')

        corpus = df_dataset[1] + '. ' + df_dataset[2]
        stemming = True
        rel_freq = True
        #print(BagOfWords.count_features(corpus))
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
        print(len(vocab))


if __name__ == '__main__':

    stemmer = PorterStemmer()
    text = 'German Economy Minister Peter Altmaier said on Tuesday that he did not favor getting ministerial approval for deals such as the proposal to merge Siemens and Alstom’s rail businesses to better compete in Europe and abroad.'
    # replace punctuation marks with spaces
    words = re.sub(r'\W', ' ', text)
    # split str into list of single words
    words = words.split()
    # list of all words to return
    words_cleaned = []
    for word in words:
        word = word.lower()
        # check if alphabetic and not stop word
        if (word.isalpha()):  # and word not in stop_words):
            # reduce word to its stem
            word = stemmer.stem(word)
            # filter out spam chars
            word = word.replace('â', '').replace('œ', '')\
                       .replace('ã', '')
            words_cleaned.append(word)
    print(words_cleaned)