'''
Filter Keywords
===============

FilterKeywords searches for merger-specific keywords in an article and
counts them.
'''

# TODO: replace dict by vector/matrix

from collections import defaultdict
import re

from nltk.stem.porter import PorterStemmer


# Surface forms of the merger vocabulary. They are reduced to their Porter
# stems before matching, so several entries collapse into one stem.
_MERGER_KEYWORDS = (
    'merge', 'merges', 'merged', 'merger', 'mergers',
    'acquisition', 'acquire', 'acquisitions', 'acquires',
    'combine', 'combines', 'combination', 'combined',
    'joint', 'venture', 'JV',
    'takeover', 'take-over', 'tie-up',
    'deal', 'deals',
    'transaction', 'transactions',
    'approve', 'approves', 'approved', 'approving',
    'approval', 'approvals',
    'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',
    'purchase',
    'sell', 'sells', 'selling', 'sold', 'seller', 'buyer',
)


class FilterKeywords:
    '''Counts merger-related keywords in an article's word-count dictionary.'''

    @staticmethod
    def search_keywords(dict_input):
        '''Extract the merger keywords contained in an article.

        Every key of ``dict_input`` is compared against the Porter stems of
        the merger vocabulary. A key counts as a hit for a stem when the key
        starts with that stem; a key may hit more than one stem, in which
        case its value is added to each of them.

        Args:
            dict_input: mapping of a word to its value; the values of
                matching words are summed per stem (presumably the word's
                number of occurrences in the article -- confirm against
                the caller).

        Returns:
            A ``defaultdict(int)`` mapping each matched keyword stem to the
            summed values of all keys that start with it. Stems without any
            match are not contained.
        '''
        # Reduce the keywords to their stems; the set drops the duplicates
        # that arise when several surface forms share one stem.
        stemmer = PorterStemmer()
        stems = {stemmer.stem(word) for word in _MERGER_KEYWORDS}

        # Counts keywords in the article (default value: 0).
        dict_keywords = defaultdict(int)
        for key, count in dict_input.items():
            for stem in stems:
                # re.match anchors at the start of the key, i.e. this is a
                # prefix test. The stem is escaped so that it is always
                # treated as literal text and never as a regex pattern.
                if re.match(re.escape(stem), key):
                    dict_keywords[stem] += count
        return dict_keywords


if __name__ == '__main__':
    dict_test = {'example': 2, 'combined': 5, 'sells': 3}
    print(FilterKeywords.search_keywords(dict_test))