Something

parent c2066d6adb
commit fd467620a0

BagOfWords.py  120
@@ -1,16 +1,18 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
 '''
 Bag Of Words
 ============

 BagOfWords counts word stems in an article
 and adds new words to the global vocabulary.

 Note:
 The multinomial Naive Bayes classifier is suitable
 for classification with discrete features (e.g.,
 word counts for text classification).
 The multinomial distribution normally requires
 integer feature counts. However, in practice,
 fractional counts such as tf-idf may also work.
 => taken into account via the 'relative_word_frequencies' parameter
 '''
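A minimal sketch (not part of this commit; assumes scikit-learn and numpy are installed) of the point the docstring makes: MultinomialNB accepts non-negative fractional features such as relative word frequencies, not only integer counts.

import numpy as np
from sklearn.naive_bayes import MultinomialNB

# two toy articles encoded as relative word frequencies (rows sum to 1)
X = np.array([[0.5, 0.25, 0.25, 0.0],
              [0.0, 0.2, 0.4, 0.4]])
y = np.array([0, 1])  # hypothetical class labels

clf = MultinomialNB()
clf.fit(X, y)                          # fractional counts are accepted
print(clf.predict([[0.4, 0.3, 0.3, 0.0]]))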
@@ -32,14 +34,14 @@ class BagOfWords:
     def extract_words(text):
         '''takes article as argument, removes numbers,
         returns list of single words, recurrences included.
         '''
         stop_words = BagOfWords.set_stop_words()
         # replace punctuation marks with spaces
         words = re.sub(r'\W', ' ', text)
         # split str into list of single words
         words = words.split()
         # list of all words to return
         words_cleaned = []
         for word in words:
             # remove numbers
             if word.isalpha():
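A standalone sketch of the cleaning steps above, on a hypothetical input text (the stop-word filtering done in the elided lines is omitted here):

import re

text = 'Stocks rose 3 percent on Monday, traders said.'
words = re.sub(r'\W', ' ', text)   # punctuation marks -> spaces
words = words.split()              # split str into list of single words
# word.isalpha() drops pure numbers such as '3'
cleaned = [w.lower() for w in words if w.isalpha()]
print(cleaned)  # ['stocks', 'rose', 'percent', 'on', 'monday', 'traders', 'said']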
@@ -50,18 +52,18 @@ class BagOfWords:
                 # add every word in lowercase
                 words_cleaned.append(word.lower())
         return words_cleaned

     def reduce_word_to_stem(word):
         '''takes normal word as input, returns the word's stem
         '''
         stemmer = PorterStemmer()
         # replace word by its stem
         word = stemmer.stem(word)
         return word

     def make_matrix(series, vocab, relative_word_frequencies=True):
         '''calculates word stem frequencies in input articles.
         returns matrix (DataFrame) with relative word frequencies
         (0 <= values < 1) if relative_word_frequencies=True or absolute
         word frequencies (int) if relative_word_frequencies=False.
         (rows: different articles, columns: different words in vocab)
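reduce_word_to_stem in action, assuming PorterStemmer is NLTK's (the import block is not part of this diff):

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
for w in ['counting', 'counted', 'counts']:
    print(stemmer.stem(w))   # prints 'count' each time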
@@ -69,14 +71,14 @@ class BagOfWords:
         print('# BOW: calculating matrix')
         print('#')
         # create list of tuples
         vectors = []
         for i in range(len(series)):
             # extract text of single article
             text = series.iloc[i]
             # extract its words
             words = BagOfWords.extract_words(text)
             # count words in single article
             word_count = len(words)
             vector = []
             for i, v in enumerate(vocab):
                 vector.append(0)
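The per-word counting itself sits in the lines elided between these two hunks; it presumably matches each article word against the vocab entry and, for relative frequencies, increments by 1/word_count. A hypothetical standalone sketch of that branch (all names and values are illustrative):

# stems of one toy article and a toy vocabulary
words = ['stock', 'rise', 'stock']
vocab = ['market', 'rise', 'stock']
word_count = len(words)

vector = [0] * len(vocab)
for i, v in enumerate(vocab):
    for w in words:
        if w == v:
            # relative word frequency (0 <= value < 1)
            vector[i] += 1.0 / word_count
print(vector)  # [0, 0.333..., 0.666...]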
@@ -88,14 +90,14 @@ class BagOfWords:
                     else:
                         # absolute word frequency
                         vector[i] += 1

             # add single vector as tuple
             vectors.append(tuple(vector))
         df_vectors = pd.DataFrame.from_records(vectors,
                                                index=None,
                                                columns=vocab)
         return df_vectors

     def make_vocab(series):
         '''adds words of input articles to a global vocabulary.
         input: dataframe of all articles, return value: list of words
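The final matrix assembly with pandas, sketched on toy values (hypothetical vocab and frequencies):

import pandas as pd

vocab = ['market', 'rise', 'stock']
vectors = [(0.0, 0.25, 0.5),    # one tuple per article
           (0.2, 0.00, 0.0)]
df = pd.DataFrame.from_records(vectors, index=None, columns=vocab)
print(df)
#    market  rise  stock
# 0     0.0  0.25    0.5
# 1     0.2  0.00    0.0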
@@ -110,56 +112,56 @@ class BagOfWords:
         # sort list
         vocab.sort()
         return vocab

     def set_stop_words():
         '''creates list of all words that will be ignored
         '''
         # stopwords
         stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                       'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                       'aren\'t', 'as', 'at', 'be', 'because', 'been',
                       'before', 'being', 'below', 'between', 'both', 'but',
                       'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
                       'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
                       'don', 'don\'t', 'down', 'during', 'each', 'few',
                       'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
                       'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
                       'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                       'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                       'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
                       'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
                       'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
                       'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
                       'on', 'once', 'only', 'or', 'other', 'our', 'ours',
                       'ourselves', 'out', 'over', 'own', 're', 's', 'same',
                       'shan', 'shan\'t', 'she', 'she\'s', 'should',
                       'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
                       'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
                       'theirs', 'them', 'themselves', 'then', 'there',
                       'these', 'they', 'this', 'those', 'through', 'to',
                       'too', 'under', 'until', 'up', 've', 'very', 'was',
                       'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
                       'what', 'when', 'where', 'which', 'while', 'who',
                       'whom', 'why', 'will', 'with', 'won', 'won\'t',
                       'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                       'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                       'yourselves']

         ##=> is this sensible?:
         #add specific words
         #stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
         #                   'wednesday', 'thursday', 'friday'])
         #remove the word 'not' from stop words
         #stop_words.remove('not')

         for i in range(len(stop_words)):

             # remove punctuation marks and strip endings from abbreviations
             #stop_words[i] = re.split(r'\W', stop_words[i])[0]

             # reduce word to stem
             stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
         # transform list to set to eliminate duplicates
         stop_words = set(stop_words)

         return stop_words
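Why the stop words are stemmed too: they must match the already-stemmed article words, and inflected variants collapse to a single entry once the list is turned into a set. A small sketch (again assuming NLTK's Porter stemmer):

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
sample = ['being', 'doing', 'having']
stems = [stemmer.stem(w) for w in sample]
print(set(stems))  # e.g. {'be', 'do', 'have'}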
@@ -1,3 +1,9 @@
 # thesis-anne

 my python classes for text mining, machine learning models, …
+
+# Requirements
+
+## Installation under (UBUNTU?)
+
+apt-get install XX
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 '''
 Starter
 =============
@@ -29,4 +31,4 @@ dataset = CsvHandler.read_csv(file)
 NaiveBayes.make_naive_bayes(dataset)
 # SVM.make_svm(dataset)

 print('# ending program')
Loading…
Reference in New Issue