'''
Bag Of Words
============

BagOfWords counts word stems in an article
and adds new words to the global vocabulary.
'''
import re
from collections import Counter
from functools import lru_cache

import pandas as pd
from nltk.stem.porter import PorterStemmer


class BagOfWords:
    '''Namespace of helpers that turn raw article texts into a
    word-stem frequency matrix.

    All methods are static: they can be called on the class itself
    (``BagOfWords.fit_transform(X)``) or on an instance.
    '''

    # One stemmer shared by all calls, instead of constructing a new
    # PorterStemmer for every single word.
    _STEMMER = PorterStemmer()

    # Every non-word character is replaced by a space before splitting.
    _NON_WORD = re.compile(r'\W')

    # Unstemmed stop words; they are stemmed once in _stemmed_stop_words()
    # so that they can be compared against stemmed article words.
    _RAW_STOP_WORDS = (
        'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all',
        'am', 'an', 'and', 'any', 'are', 'aren', 'aren\'t', 'as', 'at',
        'be', 'because', 'been', 'before', 'being', 'below', 'between',
        'both', 'but', 'by', 'can', 'couldn', 'couldn\'t', 'd', 'did',
        'didn', 'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
        'don', 'don\'t', 'down', 'during', 'each', 'few', 'for', 'from',
        'further', 'had', 'hadn', 'hadn\'t', 'has', 'hasn', 'hasn\'t',
        'have', 'haven', 'haven\'t', 'having', 'he', 'her', 'here',
        'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if',
        'in', 'into', 'is', 'isn', 'isn\'t', 'it', 'it\'s', 'its',
        'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t',
        'more', 'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
        'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on',
        'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
        'over', 'own', 're', 's', 'same', 'shan', 'shan\'t', 'she',
        'she\'s', 'should', 'should\'ve', 'shouldn', 'shouldn\'t', 'so',
        'some', 'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
        'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they',
        'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up',
        've', 'very', 'was', 'wasn', 'wasn\'t', 'we', 'were', 'weren',
        'weren\'t', 'what', 'when', 'where', 'which', 'while', 'who',
        'whom', 'why', 'will', 'with', 'won', 'won\'t', 'wouldn',
        'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll', 'you\'re',
        'you\'ve', 'your', 'yours', 'yourself', 'yourselves',
    )

    @staticmethod
    def fit_transform(X, relative_word_frequencies=True):
        '''similar to CountVectorizer's fit_transform method

        X: iterable of article texts (e.g. a pandas Series of str).
        relative_word_frequencies: if True, every count is divided by
        the number of kept words of its article; if False, raw counts
        are returned.

        returns a DataFrame with one row per article and one column
        per word stem of the vocabulary built from X.
        '''
        vocab = BagOfWords.make_vocab(X)
        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)

    @staticmethod
    def extract_words(text):
        '''takes article as argument, removes numbers,
        returns list of single word stems in lowercase,
        recurrences included. stop words are dropped.
        '''
        stop_words = BagOfWords.set_stop_words()
        # replace punctuation marks with spaces, then split into tokens
        tokens = BagOfWords._NON_WORD.sub(' ', text).split()
        words_cleaned = []
        for token in tokens:
            # skip numbers and any token that is not purely alphabetic
            if not token.isalpha():
                continue
            # reduce word to stem and normalise to lowercase
            stem = BagOfWords.reduce_word_to_stem(token).lower()
            # stop words are stored stemmed, so compare stem against stem
            if stem not in stop_words:
                words_cleaned.append(stem)
        return words_cleaned

    @staticmethod
    def reduce_word_to_stem(word):
        '''takes normal word as input, returns the word's stem
        as produced by the shared PorterStemmer
        '''
        return BagOfWords._STEMMER.stem(word)

    @staticmethod
    def make_matrix(series, vocab, relative_word_frequencies):
        '''calculates word stem frequencies in input articles.
        returns matrix (DataFrame) with relative word frequencies
        (0 <= values <= 1) or absolute word frequencies (int).
        (rows: different articles, columns: different words in vocab)

        series: iterable of article texts, iterated in order.
        vocab: list of word stems; defines the columns and their order.
        relative_word_frequencies: True for count / number of kept
        words in the article, False for raw counts.

        Words of an article that are not in vocab get no column, but
        they still count towards the article's word total.
        '''
        vectors = []
        for text in series:
            words = BagOfWords.extract_words(text)
            # count every stem once instead of rescanning the article
            # for each vocabulary entry
            counts = Counter(words)
            if relative_word_frequencies:
                word_count = len(words)
                # Absent words stay the int 0. This also avoids dividing
                # by zero for articles without any kept word, because
                # then no count is non-zero.
                # Dividing once (count / total) avoids the rounding error
                # of adding 1 / total repeatedly.
                vector = tuple(
                    counts[v] / word_count if counts[v] else 0
                    for v in vocab)
            else:
                # Counter returns 0 for stems that do not occur
                vector = tuple(counts[v] for v in vocab)
            # one tuple per article
            vectors.append(vector)
        df_vectors = pd.DataFrame.from_records(vectors,
                                               index=None,
                                               columns=vocab)
        return df_vectors

    @staticmethod
    def make_vocab(series):
        '''adds words of input articles to a global vocabulary.
        input: iterable of all article texts,
        return value: alphabetically sorted list of unique word stems
        '''
        vocab = set()
        for text in series:
            vocab.update(BagOfWords.extract_words(text))
        return sorted(vocab)

    @staticmethod
    def set_stop_words():
        '''creates set of all (stemmed) words that will be ignored.

        A new set is returned on every call, so callers may modify it
        without affecting later calls; the stemming itself is only
        done once.
        '''
        # NOTE(open question from the original author: does this make
        # sense?) -- ideas that were considered but left disabled:
        #   * add specific words:
        #       ['reuters', 'also', 'monday', 'tuesday',
        #        'wednesday', 'thursday', 'friday']
        #   * remove the word 'not' from the stop words
        #   * strip punctuation / abbreviation endings before stemming:
        #       re.split(r'\W', word)[0]
        return set(BagOfWords._stemmed_stop_words())

    @staticmethod
    @lru_cache(maxsize=None)
    def _stemmed_stop_words():
        '''stems the raw stop words once and caches the result.
        duplicates produced by stemming collapse in the frozenset.
        '''
        return frozenset(BagOfWords.reduce_word_to_stem(word)
                         for word in BagOfWords._RAW_STOP_WORDS)