'''
Bag Of Words
============

BagOfWords counts word stems in an article
and adds new words to the global vocabulary.
'''

import re

import pandas as pd
from nltk.stem.porter import PorterStemmer


class BagOfWords:

    @staticmethod
    def extract_words(text):
        '''takes an article as argument, removes numbers,
        returns list of single word stems, recurrences included.
        '''
        stop_words = BagOfWords.set_stop_words()
        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split string into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            # ignore numbers and other non-alphabetic tokens
            if word.isalpha():
                # reduce word to its stem
                word = BagOfWords.reduce_word_to_stem(word)
                # keep the word only if it is not a stop word
                if word.lower() not in stop_words:
                    # add every word in lowercase
                    words_cleaned.append(word.lower())
        return words_cleaned

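    # Illustrative example for extract_words (added for clarity, not part of
    # the original code; exact stems come from NLTK's Porter stemmer): the text
    #   'Oil prices are rising in 2018, Reuters said.'
    # loses the number '2018', the stop words 'are' and 'in', and the added
    # project-specific stop word 'Reuters', leaving roughly
    #   ['oil', 'price', 'rise', 'said']
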
    @staticmethod
    def reduce_word_to_stem(word):
        '''takes a normal word as input, returns the word's stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
        word = stemmer.stem(word)
        return word

    @staticmethod
    def make_matrix(series, vocab):
        '''calculates word stem frequencies in the input articles.
        returns matrix (DataFrame) with relative word frequencies (0 <= values < 1)
        (rows: different articles, columns: different words in vocab)
        '''
        # create list of tuples
        vectors = []
        for i in range(len(series)):
            # extract text of a single article
            text = series.iloc[i]
            # extract its words
            words = BagOfWords.extract_words(text)
            # count words in the single article
            word_count = len(words)
            vector = []
            # use j as the vocab index so the article index i is not shadowed
            for j, v in enumerate(vocab):
                vector.append(0)
                for w in words:
                    if w == v:
                        # add relative word frequency
                        vector[j] += 1 / word_count
            # add single vector as tuple
            vectors.append(tuple(vector))
        df_vectors = pd.DataFrame.from_records(vectors, index=None, columns=vocab)
        return df_vectors

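    # Worked example for make_matrix above (added for clarity, not part of the
    # original code): if the stem 'price' occurs 3 times among the 50 stems
    # extracted from one article, the cell in that article's row and the
    # 'price' column holds 3 * (1/50) = 0.06.
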
    @staticmethod
    def make_vocab(series):
        '''adds words of the input articles to a global vocabulary.
        input: pandas Series of all articles, return value: sorted list of words
        '''
        vocab = set()
        for text in series:
            vocab |= set(BagOfWords.extract_words(text))
        # transform to list
        vocab = list(vocab)
        # sort list
        vocab.sort()
        return vocab

    @staticmethod
    def set_stop_words():
        '''creates a set of all words that will be ignored
        '''
        # standard stopwords from nltk.corpus stopwords('english')
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'ain',
                      'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'aren\'t',
                      'as', 'at', 'be', 'because', 'been', 'before', 'being',
                      'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
                      'couldn\'t', 'd', 'did', 'didn', 'didn\'t', 'do', 'does',
                      'doesn', 'doesn\'t', 'doing', 'don', 'don\'t', 'down',
                      'during', 'each', 'few', 'for', 'from', 'further', 'had',
                      'hadn', 'hadn\'t', 'has', 'hasn', 'hasn\'t', 'have', 'haven',
                      'haven\'t', 'having', 'he', 'her', 'here', 'hers', 'herself',
                      'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just', 'll',
                      'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more', 'most',
                      'mustn', 'mustn\'t', 'my', 'myself', 'needn', 'needn\'t',
                      'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once',
                      'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
                      'over', 'own', 're', 's', 'same', 'shan', 'shan\'t', 'she',
                      'she\'s', 'should', 'should\'ve', 'shouldn', 'shouldn\'t',
                      'so', 'some', 'such', 't', 'than', 'that', 'that\'ll', 'the',
                      'their', 'theirs', 'them', 'themselves', 'then', 'there',
                      'these', 'they', 'this', 'those', 'through', 'to', 'too',
                      'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'wasn\'t',
                      'we', 'were', 'weren', 'weren\'t', 'what', 'when', 'where',
                      'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
                      'won\'t', 'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves']

        # add specific words
        stop_words.extend(['reuters', 'also', 'monday', 'tuesday', 'wednesday',
                           'thursday', 'friday'])

        # remove the word 'not' from stop words
        stop_words.remove('not')

        for i in range(len(stop_words)):
            # remove punctuation marks and strip endings from abbreviations
            #stop_words[i] = re.split(r'\W', stop_words[i])[0]
            # reduce each stop word to its stem
            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
        # transform list to set to eliminate duplicates
        stop_words = set(stop_words)

        return stop_words
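
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative addition, not part of the original
# module). The two sample texts below are made up; the sketch only shows how
# the class methods fit together.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # a small pandas Series of articles stands in for the real corpus
    sample_articles = pd.Series([
        'Oil prices are rising again, analysts said on Monday.',
        'The merger was approved by the regulators in 2018.'])

    # build the global vocabulary from all articles ...
    vocab = BagOfWords.make_vocab(sample_articles)
    # ... and the matrix of relative word stem frequencies
    matrix = BagOfWords.make_matrix(sample_articles, vocab)

    print(vocab)
    print(matrix)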