#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Bag Of Words
============

BagOfWords counts word stems in an article
and adds new words to the global vocabulary.

Note:
The multinomial Naive Bayes classifier is suitable
for classification with discrete features (e.g.,
word counts for text classification).
The multinomial distribution normally requires
integer feature counts. However, in practice,
fractional counts such as tf-idf may also work.
=> controlled by the 'relative_word_frequencies' parameter
'''
import re
import pandas as pd
from nltk.stem.porter import PorterStemmer

class BagOfWords:
    @staticmethod
    def fit_transform(X, relative_word_frequencies=True):
        '''similar to CountVectorizer's fit_transform method
        '''
        vocab = BagOfWords.make_vocab(X)
        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)
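
    # Usage sketch (illustrative only; assumes `texts` is a pandas Series
    # of raw article strings, e.g. texts = df['Text']):
    #   X_bow = BagOfWords.fit_transform(texts)
    #   # -> DataFrame of shape (number of articles, size of vocabulary)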

    @staticmethod
    def extract_words(text):
        '''takes an article as argument, removes numbers,
        returns list of single words, recurrences included.
        '''
        stop_words = BagOfWords.set_stop_words()
        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split string into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            # leave out numbers
            if word.isalpha():
                # reduce word to stem
                word = BagOfWords.reduce_word_to_stem(word)
                # check if not stop word
                if word.lower() not in stop_words:
                    # add every word in lowercase
                    words_cleaned.append(word.lower())
        return words_cleaned

    @staticmethod
    def reduce_word_to_stem(word):
        '''takes a normal word as input, returns the word's stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
        word = stemmer.stem(word)
        return word
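
    # Example Porter stems: 'counts' -> 'count', 'worries' -> 'worri',
    # 'because' -> 'becaus'. Stems need not be dictionary words; they only
    # have to map inflected forms of a word to the same token.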

    @staticmethod
    def make_matrix(series, vocab, relative_word_frequencies=True):
        '''calculates word stem frequencies in input articles.
        returns matrix (DataFrame) with relative word frequencies
        (0 <= values <= 1) if relative_word_frequencies=True or absolute
        word frequencies (int) if relative_word_frequencies=False.
        (rows: different articles, columns: different words in vocab)
        '''
        print('# BOW: calculating matrix')
        print('# ...')
        # create list of tuples
        vectors = []
        for i in range(len(series)):
            # extract text of single article
            text = series.iloc[i]
            # extract its words
            words = BagOfWords.extract_words(text)
            # count words in single article
            word_count = len(words)
            vector = [0] * len(vocab)
            for j, v in enumerate(vocab):
                for w in words:
                    if w == v:
                        if relative_word_frequencies:
                            # relative word frequency
                            vector[j] += 1 / word_count
                        else:
                            # absolute word frequency
                            vector[j] += 1
            # add single vector as tuple
            vectors.append(tuple(vector))
        df_vectors = pd.DataFrame.from_records(vectors,
                                               index=None,
                                               columns=vocab)
        return df_vectors
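
    # Note: the nested loop above costs O(len(vocab) * len(words)) per
    # article. A sketch of an equivalent per-article computation (an
    # assumption-based alternative, not the original code) would count
    # once and look up each vocab word:
    #   from collections import Counter
    #   counts = Counter(words)
    #   vector = [counts[v] / word_count if relative_word_frequencies
    #             and word_count else counts[v] for v in vocab]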

    @staticmethod
    def make_vocab(series):
        '''adds words of input articles to a global vocabulary.
        input: series of all articles, return value: sorted list of words
        '''
        print('# BOW: making vocabulary of data set')
        print('# ...')
        vocab = set()
        # for every article's text
        for text in series:
            # add single article's words to total vocabulary
            vocab |= set(BagOfWords.extract_words(text))
        # transform to sorted list
        vocab = sorted(vocab)
        return vocab
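
    # e.g. make_vocab(pd.Series(['Stocks fell.', 'Stocks rose!'])) would
    # yield a sorted list of stems like ['fell', 'rose', 'stock'].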

    @staticmethod
    def set_stop_words():
        '''creates a set of all words that will be ignored
        '''
        # stopwords
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
                      'before', 'being', 'below', 'between', 'both', 'but',
                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
                      'don', 'don\'t', 'down', 'during', 'each', 'few',
                      'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
                      'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
                      'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
                      'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
                      'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
                      'on', 'once', 'only', 'or', 'other', 'our', 'ours',
                      'ourselves', 'out', 'over', 'own', 're', 's', 'same',
                      'shan', 'shan\'t', 'she', 'she\'s', 'should',
                      'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
                      'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
                      'theirs', 'them', 'themselves', 'then', 'there',
                      'these', 'they', 'this', 'those', 'through', 'to',
                      'too', 'under', 'until', 'up', 've', 'very', 'was',
                      'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
                      'what', 'when', 'where', 'which', 'while', 'who',
                      'whom', 'why', 'will', 'with', 'won', 'won\'t',
                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                      'yourselves']
        # add unwanted terms
        stop_words.extend(['reuters', 'bloomberg', 'cnn', 'economist'])
        # remove the word 'not' from stop words
        #stop_words.remove('not')
        for i in range(len(stop_words)):
            # remove punctuation marks and strip endings from abbreviations
            #stop_words[i] = re.split(r'\W', stop_words[i])[0]
            # reduce word to stem
            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
        # transform list to set to eliminate duplicates
        stop_words = set(stop_words)
        return stop_words
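
    # Stemming the stop words keeps them comparable to the stemmed article
    # words, e.g. 'because' -> 'becaus' and 'very' -> 'veri' on both sides.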

if __name__ == '__main__':
    test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
                   EU approval - sources. BRUSSELS (Reuters) - U.S. software
                   giant Microsoft (MSFT.O) is set to win unconditional EU
                   antitrust approval for its $7.5 billion purchase of
                   privately held coding website GitHub, two people familiar
                   with the matter said on Monday. Microsoft announced the
                   deal in June, its largest acquisition since it bought
                   LinkedIn for $26 billion in 2016. The GitHub deal is
                   expected to boost the U.S. software giant's cloud
                   computing business and challenge market leader Amazon
                   (AMZN.O). GitHub, the world's largest code host, has
                   more than 28 million developers using its platform. It
                   will become a part of Microsoft's Intelligent Cloud unit
                   once the acquisition is completed. Microsoft Chief
                   Executive Satya Nadella has tried to assuage users'
                   worries that GitHub might favor Microsoft products
                   over competitors after the deal, saying GitHub would
                   continue to be an open platform that works with all
                   public clouds. The European Commission, which is set to
                   decide on the deal by Oct. 19, did not respond to a
                   request for immediate comment. Microsoft declined to
                   comment. Reporting by Foo Yun Chee; editing by Jason
                   Neely'''
    print(BagOfWords.extract_words(test_article))
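
    # Illustrative follow-up (a hedged sketch, not part of the original
    # script): feed the BOW matrix into scikit-learn's MultinomialNB, as
    # the module docstring suggests. Assumes scikit-learn is installed;
    # the second article and both labels are made up for demonstration.
    from sklearn.naive_bayes import MultinomialNB
    texts = pd.Series([test_article,
                       'Stocks fell sharply on Monday amid trade worries.'])
    labels = [1, 0]  # hypothetical labels: 1 = merger news, 0 = other
    X_bow = BagOfWords.fit_transform(texts, relative_word_frequencies=False)
    clf = MultinomialNB()
    clf.fit(X_bow, labels)
    print(clf.predict(X_bow))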