#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Bag Of Words
============

BagOfWords counts word stems in an article and adds new words to the
global vocabulary.

note:
The multinomial Naive Bayes classifier is suitable for classification
with discrete features (e.g., word counts for text classification).
The multinomial distribution normally requires integer feature counts.
However, in practice, fractional counts such as tf-idf may also work.
=> considered by 'relative_word_frequencies' as parameter
'''
import re
from collections import Counter

import pandas as pd
from nltk.stem.porter import PorterStemmer


class BagOfWords:
    '''Namespace of static helpers that turn article texts into a
    bag-of-words matrix of word-stem frequencies.

    All methods are static; the class is never instantiated by this
    module and is called as ``BagOfWords.method(...)``.
    '''

    # One shared stemmer instead of a new instance for every single word.
    _STEMMER = PorterStemmer()

    # Every non-word character (punctuation etc.) is replaced by a space.
    _NON_WORD = re.compile(r'\W')

    # Unstemmed stop words. NOTE: 'not' is treated as a stop word; remove
    # it from this tuple if negations should be kept.
    _RAW_STOP_WORDS = (
        'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all',
        'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at',
        'be', 'because', 'been', 'before', 'being', 'below', 'between',
        'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did',
        'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing',
        'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from',
        'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't",
        'have', 'haven', "haven't", 'having', 'he', 'her', 'here',
        'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if',
        'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its',
        'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't",
        'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn',
        "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on',
        'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
        'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she',
        "she's", 'should', "should've", 'shouldn', "shouldn't", 'so',
        'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their',
        'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they',
        'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up',
        've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren',
        "weren't", 'what', 'when', 'where', 'which', 'while', 'who',
        'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn',
        "wouldn't", 'y', 'you', "you'd", "you'll", "you're", "you've",
        'your', 'yours', 'yourself', 'yourselves',
        # unwanted terms
        'reuters', 'bloomberg', 'cnn', 'economist',
    )

    # Lazily filled frozenset of stemmed stop words (see _stop_word_stems).
    _stop_word_cache = None

    @staticmethod
    def fit_transform(X, relative_word_frequencies=True):
        '''similar to CountVectorizer's fit_transform method.

        Builds the vocabulary from the texts in X and returns the
        frequency matrix (DataFrame) of X over that vocabulary.
        '''
        vocab = BagOfWords.make_vocab(X)
        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)

    @staticmethod
    def extract_words(text):
        '''takes article as argument, removes numbers,
        returns list of single words (lowercase stems), recurrences
        included. Stop words are left out.
        '''
        stop_words = BagOfWords._stop_word_stems()
        # replace punctuation marks with spaces, then split into tokens
        tokens = BagOfWords._NON_WORD.sub(' ', text).split()
        words_cleaned = []
        for token in tokens:
            # leave out numbers and any token that is not purely letters
            if not token.isalpha():
                continue
            # reduce word to stem, always stored in lowercase
            stem = BagOfWords.reduce_word_to_stem(token).lower()
            if stem not in stop_words:
                words_cleaned.append(stem)
        return words_cleaned

    @staticmethod
    def reduce_word_to_stem(word):
        '''takes normal word as input, returns the word's stem
        (Porter stemmer).
        '''
        return BagOfWords._STEMMER.stem(word)

    @staticmethod
    def make_matrix(series, vocab, relative_word_frequencies=True):
        '''calculates word stem frequencies in input articles.

        series: iterable of article texts (e.g. a pandas Series),
        vocab: list of word stems that become the columns.

        returns matrix (DataFrame) with relative word frequencies
        (0 <= values <= 1) if relative_word_frequencies=True or absolute
        word frequencies (int) if relative_word_frequencies=False.
        (rows: different articles, columns: different words in vocab)
        '''
        print('# BOW: calculating matrix')
        print('# ...')
        # one tuple per article
        vectors = []
        for text in series:
            words = BagOfWords.extract_words(text)
            # total number of counted words in this article
            word_count = len(words)
            # count each stem once instead of rescanning the article
            # for every vocabulary entry
            occurrences = Counter(words)
            vector = []
            for term in vocab:
                hits = occurrences.get(term, 0)
                if hits and relative_word_frequencies:
                    # hits > 0 implies word_count > 0, so this is safe
                    vector.append(hits / word_count)
                else:
                    # absolute count, or plain 0 for an absent word
                    vector.append(hits)
            vectors.append(tuple(vector))
        df_vectors = pd.DataFrame.from_records(vectors,
                                               index=None,
                                               columns=vocab)
        return df_vectors

    @staticmethod
    def make_vocab(series):
        '''adds words of input articles to a global vocabulary.
        input: iterable of all article texts,
        return value: alphabetically sorted list of word stems
        '''
        print('# BOW: making vocabulary of data set')
        print('# ...')
        vocab = set()
        for text in series:
            # add single article's stems to total vocabulary
            vocab.update(BagOfWords.extract_words(text))
        return sorted(vocab)

    @staticmethod
    def set_stop_words():
        '''creates set of all (stemmed) words that will be ignored.

        A fresh set is returned on every call, so callers may modify
        the result without affecting later calls.
        '''
        return set(BagOfWords._stop_word_stems())

    @staticmethod
    def _stop_word_stems():
        '''returns the stemmed stop words as a frozenset; the stemming
        is done only once and the result is reused afterwards.
        '''
        if BagOfWords._stop_word_cache is None:
            # stop words are stemmed because article words are compared
            # against them after stemming
            BagOfWords._stop_word_cache = frozenset(
                BagOfWords.reduce_word_to_stem(word)
                for word in BagOfWords._RAW_STOP_WORDS)
        return BagOfWords._stop_word_cache


if __name__ == '__main__':
    # small demo: print the cleaned word stems of one sample article
    test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set
    for EU approval - sources. BRUSSELS (Reuters) - U.S. software giant
    Microsoft (MSFT.O) is set to win unconditional EU antitrust approval
    for its $7.5 billion purchase of privately held coding website
    GitHub, two people familiar with the matter said on Monday.
    Microsoft announced the deal in June, its largest acquisition since
    it bought LinkedIn for $26 billion in 2016. The GitHub deal is
    expected to boost the U.S. software giant’s cloud computing business
    and challenge market leader Amazon (AMZN.O). GitHub, the world’s
    largest code host, has more than 28 million developers using its
    platform. It will become a part of Microsoft’s Intelligent Cloud
    unit once the acquisition is completed. Microsoft Chief Executive
    Satya Nadella has tried to assuage users’ worries that GitHub might
    favor Microsoft products over competitors after the deal, saying
    GitHub would continue to be an open platform that works with all
    public clouds. The European Commission, which is set to decide on
    the deal by Oct. 19, did not respond to a request for immediate
    comment. Microsoft declined to comment. Reporting by Foo Yun Chee;
    editing by Jason Neely'''

    print(BagOfWords.extract_words(test_article))