'''
Filter Keywords
===============

FilterKeywords searches for merger-specific keywords in an article and
counts them.
'''

import re

from nltk.stem.porter import PorterStemmer

# Merger-specific vocabulary. Every entry is reduced to its Porter stem
# before matching, and duplicate stems are dropped, so several inflected
# forms may end up as a single keyword.
_MERGER_KEYWORDS = (
    'merge', 'merges', 'merged', 'merger', 'mergers',
    'acquisition', 'acquire', 'acquisitions', 'acquires',
    'combine', 'combines', 'combination', 'combined',
    'joint', 'venture', 'JV',
    'takeover', 'take-over', 'tie-up',
    'deal', 'deals',
    'transaction', 'transactions',
    'approve', 'approves', 'approved', 'approving',
    'approval', 'approvals',
    'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',
    'purchase',
    'sell', 'sells', 'selling', 'sold', 'seller', 'buyer',
)


class FilterKeywords:
    '''Groups the keyword search and count helpers; holds no state.

    Both methods are static, so they can be called on the class
    (``FilterKeywords.search_keywords(d)``) or on an instance.
    '''

    @staticmethod
    def search_keywords(dict_input):
        '''Collect the merger keywords contained in an article.

        Args:
            dict_input: dictionary of the article, mapping a word (str)
                to its count. NOTE(review): presumably the words are
                already stemmed tokens -- confirm against the caller.

        Returns:
            dict mapping each keyword stem that matched at least one
            word to the accumulated count of all words it matched.
            Stems without a match are not included.

        A stem matches a word when the word starts with it (prefix
        match). A single word can therefore match several stems; its
        count is then added to every one of them.
        '''
        stemmer = PorterStemmer()
        # Stem and de-duplicate the vocabulary in one pass.
        stems = {stemmer.stem(word) for word in _MERGER_KEYWORDS}
        # Build the patterns once, outside the per-word loop. re.escape
        # guarantees that every stem is matched literally.
        patterns = [(stem, re.compile(re.escape(stem))) for stem in stems]

        dict_keywords = {}
        for word, count in dict_input.items():
            for stem, pattern in patterns:
                # match() anchors at the start of the word only, so this
                # is a prefix test, not an exact comparison.
                if not pattern.match(word):
                    continue
                if stem in dict_keywords:
                    dict_keywords[stem] += count
                else:
                    dict_keywords[stem] = count

        return dict_keywords

    @staticmethod
    def count_keywords(dict_keywords):
        '''Return the total of all counts in a keyword dictionary.

        Args:
            dict_keywords: dict with an article's keywords as keys and
                their counts as values, e.g. the result of
                ``search_keywords``.

        Returns:
            The sum of all values, i.e. the total number of counted
            keyword occurrences (not the number of distinct keywords).
            An empty dictionary yields 0.
        '''
        return sum(dict_keywords.values())