#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Bag Of Words
============
BagOfWords counts word stems in an article
and adds new words to the global vocabulary.

Note:
The multinomial Naive Bayes classifier is suitable
for classification with discrete features (e.g.,
word counts for text classification).
The multinomial distribution normally requires
integer feature counts. However, in practice,
fractional counts such as tf-idf may also work.
=> taken into account via the 'relative_word_frequencies' parameter
'''
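# Usage sketch (illustrative only): the matrix returned by
# BagOfWords.fit_transform could be fed to a multinomial Naive Bayes
# classifier, e.g. with scikit-learn; 'articles' and 'labels' are
# hypothetical pandas Series, not part of this module:
#
#   from sklearn.naive_bayes import MultinomialNB
#   matrix = BagOfWords.fit_transform(articles, relative_word_frequencies=False)
#   clf = MultinomialNB().fit(matrix, labels)
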
import re
import pandas as pd
from nltk.stem.porter import PorterStemmer
class BagOfWords:
    @staticmethod
    def fit_transform(X, relative_word_frequencies=True):
        '''similar to CountVectorizer's fit_transform method
        '''
        vocab = BagOfWords.make_vocab(X)
        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)
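
    # Illustrative call (hypothetical data, not from the thesis corpus):
    #   X = pd.Series(['Apple buys startup', 'Startup bought by Apple'])
    #   df = BagOfWords.fit_transform(X)
    #   -> DataFrame with one row per article and one column per vocabulary stem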

    @staticmethod
    def extract_words(text):
        '''takes article as argument, removes numbers,
        returns list of single words, recurrences included.
        '''
        stop_words = BagOfWords.set_stop_words()
        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split str into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            # remove numbers
            if word.isalpha():
                # reduce word to stem
                word = BagOfWords.reduce_word_to_stem(word)
                # check if not stop word
                if word.lower() not in stop_words:
                    # add every word in lowercase
                    words_cleaned.append(word.lower())
        return words_cleaned
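
    # Example (illustrative, assuming NLTK's Porter stemmer behaves as usual):
    #   BagOfWords.extract_words('Apple has bought 3 startups')
    #   -> roughly ['appl', 'bought', 'startup']
    #   (number dropped, stop word 'has' filtered out, remaining words stemmed)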

    @staticmethod
    def reduce_word_to_stem(word):
        '''takes normal word as input, returns the word's stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
        word = stemmer.stem(word)
        return word
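
    # Example (Porter stemming, illustrative):
    #   BagOfWords.reduce_word_to_stem('running')   -> 'run'
    #   BagOfWords.reduce_word_to_stem('companies') -> 'compani'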

    @staticmethod
    def make_matrix(series, vocab, relative_word_frequencies=True):
        '''calculates word stem frequencies in input articles.
        returns matrix (DataFrame) with relative word frequencies
        (0 <= values <= 1) if relative_word_frequencies=True or absolute
        word frequencies (int) if relative_word_frequencies=False.
        (rows: different articles, columns: different words in vocab)
        '''
        print('# BOW: calculating matrix')
        print('# ...')
        # create list of tuples
        vectors = []
        for i in range(len(series)):
            # extract text of single article
            text = series.iloc[i]
            # extract its words
            words = BagOfWords.extract_words(text)
            # count words in single article
            word_count = len(words)
            vector = []
            # build the frequency vector for this article
            for j, v in enumerate(vocab):
                vector.append(0)
                for w in words:
                    if w == v:
                        if relative_word_frequencies:
                            # relative word frequency
                            vector[j] += 1/word_count
                        else:
                            # absolute word frequency
                            vector[j] += 1
            # add single vector as tuple
            vectors.append(tuple(vector))
        df_vectors = pd.DataFrame.from_records(vectors,
                                               index=None,
                                               columns=vocab)
        return df_vectors
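
    # Worked mini-example (illustrative numbers, not from the data set):
    # for the cleaned words ['bank', 'bank', 'crisi'] and
    # vocab = ['bank', 'crisi'], word_count is 3, so the row becomes
    #   relative_word_frequencies=True  -> (2/3, 1/3)
    #   relative_word_frequencies=False -> (2, 1)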

    @staticmethod
    def make_vocab(series):
        '''adds words of the input articles to a global vocabulary.
        input: pandas Series of all articles,
        return value: sorted list of word stems
        '''
        print('# BOW: making vocabulary of data set')
        print('# ...')
        vocab = set()
        for text in series:
            vocab |= set(BagOfWords.extract_words(text))
        # transform to list
        vocab = list(vocab)
        # sort list
        vocab.sort()
        return vocab
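
    # Example (illustrative): for the two texts 'Banks fear a crisis' and
    # 'The crisis hits banks', the vocabulary would be the sorted list of
    # stems, roughly ['bank', 'crisi', 'fear', 'hit'].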

    @staticmethod
    def set_stop_words():
        '''creates a set of all stop words (stemmed) that will be ignored
        '''
        # stopwords
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
'aren\'t', 'as', 'at', 'be', 'because', 'been',
'before', 'being', 'below', 'between', 'both', 'but',
'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
'don', 'don\'t', 'down', 'during', 'each', 'few',
'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
'on', 'once', 'only', 'or', 'other', 'our', 'ours',
'ourselves', 'out', 'over', 'own', 're', 's', 'same',
'shan', 'shan\'t', 'she', 'she\'s', 'should',
'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
'theirs', 'them', 'themselves', 'then', 'there',
'these', 'they', 'this', 'those', 'through', 'to',
'too', 'under', 'until', 'up', 've', 'very', 'was',
'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
'what', 'when', 'where', 'which', 'while', 'who',
'whom', 'why', 'will', 'with', 'won', 'won\'t',
'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
'yourselves']
        ##=> does this make sense?:
        #add specific words
        #stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
        #                   'wednesday', 'thursday', 'friday'])
        #remove the word 'not' from stop words
        #stop_words.remove('not')

        for i in range(len(stop_words)):
            # remove punctuation marks and strip endings from abbreviations
            #stop_words[i] = re.split(r'\W', stop_words[i])[0]

            # reduce word to stem
            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
        # transform list to set to eliminate duplicates
        stop_words = set(stop_words)
        return stop_words
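

if __name__ == '__main__':
    # Minimal self-test sketch (illustrative only; the two example texts are
    # made up and not part of the thesis data set).
    articles = pd.Series(['Apple is said to buy a startup.',
                          'The startup was bought by Apple.'])
    bow_matrix = BagOfWords.fit_transform(articles,
                                          relative_word_frequencies=True)
    print(bow_matrix)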