Something

Julian M. Kunkel 2018-09-14 17:44:10 +01:00
parent c2066d6adb
commit fd467620a0
3 changed files with 71 additions and 61 deletions


@@ -1,16 +1,18 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Bag Of Words
============

BagOfWords counts word stems in an article
and adds new words to the global vocabulary.

Note:
The multinomial Naive Bayes classifier is suitable
for classification with discrete features (e.g.,
word counts for text classification).
The multinomial distribution normally requires
integer feature counts. However, in practice,
fractional counts such as tf-idf may also work.
=> accounted for via the 'relative_word_frequencies' parameter
'''
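The note above is what motivates the `relative_word_frequencies` switch: scikit-learn's `MultinomialNB` formally expects integer counts but accepts non-negative fractional features. A minimal sketch of that point (not part of this commit; assumes NumPy and scikit-learn):

```python
# Sketch only: fractional counts fed to MultinomialNB, as the note describes.
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# two toy articles over a three-word vocabulary
X_absolute = np.array([[3, 0, 1],
                       [0, 2, 2]])                               # integer counts
X_relative = X_absolute / X_absolute.sum(axis=1, keepdims=True)  # fractions

y = np.array([0, 1])
clf = MultinomialNB()
clf.fit(X_relative, y)        # fractional features are accepted in practice
print(clf.predict(X_relative))
```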
@@ -32,14 +34,14 @@ class BagOfWords:
    def extract_words(text):
        '''takes article as argument, removes numbers,
        returns list of single words, recurrences included.
        '''
        stop_words = BagOfWords.set_stop_words()
        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split str into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            # remove numbers
            if word.isalpha():
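For review, the cleaning steps in this hunk in isolation (the stop-word check happens in the elided lines that follow); a standalone sketch:

```python
# Standalone sketch of the cleaning steps above: punctuation to spaces,
# split into tokens, keep alphabetic tokens only (drops numbers).
import re

text = 'Stocks rose 3% on Monday, analysts said.'
words = re.sub(r'\W', ' ', text)   # 'Stocks rose 3  on Monday  analysts said '
words = words.split()
cleaned = [w.lower() for w in words if w.isalpha()]   # '3' is dropped
print(cleaned)  # ['stocks', 'rose', 'on', 'monday', 'analysts', 'said']
```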
@@ -50,18 +52,18 @@ class BagOfWords:
                # add every word in lowercase
                words_cleaned.append(word.lower())
        return words_cleaned

    def reduce_word_to_stem(word):
        '''takes normal word as input, returns the word's stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
        word = stemmer.stem(word)
        return word

    def make_matrix(series, vocab, relative_word_frequencies=True):
        '''calculates word stem frequencies in input articles.
        returns matrix (DataFrame) with relative word frequencies
        (0 <= values < 1) if relative_word_frequencies=True or absolute
        word frequencies (int) if relative_word_frequencies=False.
        (rows: different articles, columns: different words in vocab)
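`reduce_word_to_stem` wraps NLTK's `PorterStemmer`; a quick illustration of its behavior, worth keeping in mind since stems are not always dictionary words:

```python
# Illustration of the stemming used by reduce_word_to_stem (NLTK).
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem('counting'))   # 'count'
print(stemmer.stem('counts'))     # 'count'  -> recurrences share one stem
print(stemmer.stem('doing'))      # 'do'
```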
@@ -69,14 +71,14 @@ class BagOfWords:
        print('# BOW: calculating matrix')
        print('#')
        # create list of tuples
        vectors = []
        for i in range(len(series)):
            # extract text of single article
            text = series.iloc[i]
            # extract its words
            words = BagOfWords.extract_words(text)
            # count words in single article
            word_count = len(words)
            vector = []
            for i, v in enumerate(vocab):
                vector.append(0)
@@ -88,14 +90,14 @@ class BagOfWords:
                else:
                    # absolute word frequency
                    vector[i] += 1
            # add single vector as tuple
            vectors.append(tuple(vector))

        df_vectors = pd.DataFrame.from_records(vectors,
                                               index=None,
                                               columns=vocab)
        return df_vectors

    def make_vocab(series):
        '''adds words of input articles to a global vocabulary.
        input: dataframe of all articles, return value: list of words
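The returned DataFrame has one row per article and one column per vocabulary stem. A toy sketch of the output shape, mirroring the `from_records` call above (values shown are absolute counts; with `relative_word_frequencies=True` each entry would be count / word_count):

```python
# Toy sketch of the make_matrix output shape.
import pandas as pd

vocab = ['market', 'rise', 'stock']
vectors = [(1, 2, 0),    # article 0
           (0, 1, 3)]    # article 1
df_vectors = pd.DataFrame.from_records(vectors, index=None, columns=vocab)
print(df_vectors)
#    market  rise  stock
# 0       1     2      0
# 1       0     1      3
```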
@@ -110,56 +112,56 @@ class BagOfWords:
        # sort list
        vocab.sort()
        return vocab

    def set_stop_words():
        '''creates list of all words that will be ignored
        '''
        # stopwords
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
                      'before', 'being', 'below', 'between', 'both', 'but',
                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
                      'don', 'don\'t', 'down', 'during', 'each', 'few',
                      'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
                      'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
                      'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
                      'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
                      'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
                      'on', 'once', 'only', 'or', 'other', 'our', 'ours',
                      'ourselves', 'out', 'over', 'own', 're', 's', 'same',
                      'shan', 'shan\'t', 'she', 'she\'s', 'should',
                      'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
                      'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
                      'theirs', 'them', 'themselves', 'then', 'there',
                      'these', 'they', 'this', 'those', 'through', 'to',
                      'too', 'under', 'until', 'up', 've', 'very', 'was',
                      'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
                      'what', 'when', 'where', 'which', 'while', 'who',
                      'whom', 'why', 'will', 'with', 'won', 'won\'t',
                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                      'yourselves']
        ##=> does this make sense?:
        # add specific words
        #stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
        #                   'wednesday', 'thursday', 'friday'])
        # remove the word 'not' from stop words
        #stop_words.remove('not')

        for i in range(len(stop_words)):
            # remove punctuation marks and strip endings from abbreviations
            #stop_words[i] = re.split(r'\W', stop_words[i])[0]
            # reduce word to stem
            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])

        # transform list to set to eliminate duplicates
        stop_words = set(stop_words)
        return stop_words
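The stop words are stemmed so that membership tests presumably match the already-stemmed article words later on (the filtering line in extract_words is elided in this diff); a minimal illustration of that assumption:

```python
# Why stemming the stop list matters: lookups then agree with stemmed
# article words. Minimal illustration; the actual filtering code is
# not shown in this diff.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
stop_words = {stemmer.stem(w) for w in ['doing', 'very', 'these']}
word = 'doing'
print(stemmer.stem(word) in stop_words)   # True
```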


@@ -1,3 +1,9 @@
# thesis-anne
my python classes for text mining, machine learning models, … my python classes for text mining, machine learning models, …
# Requirements
## Installation under (UBUNTU?)
apt-get install XX

Starter.py Normal file → Executable file

@@ -1,3 +1,5 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Starter
=============
@@ -29,4 +31,4 @@ dataset = CsvHandler.read_csv(file)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)
print('# ending program')
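Putting the pieces together: a hedged sketch of the pipeline Starter drives. The internals of `make_naive_bayes` are not shown in this diff, so the direct sklearn call, the `BagOfWords` module/file name, the CSV path, and the column names below are all hypothetical stand-ins:

```python
# Hedged end-to-end sketch; names marked hypothetical are assumptions.
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

from BagOfWords import BagOfWords                     # assumed module name

dataset = pd.read_csv('articles.csv')                 # hypothetical file
texts, labels = dataset['Text'], dataset['Label']     # hypothetical columns

vocab = BagOfWords.make_vocab(texts)
X = BagOfWords.make_matrix(texts, vocab, relative_word_frequencies=True)

clf = MultinomialNB().fit(X, labels)
print('training accuracy:', clf.score(X, labels))
```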