59 lines
2.2 KiB
Python
59 lines
2.2 KiB
Python
|
'''
|
||
|
Filter Keywords
|
||
|
===============
|
||
|
|
||
|
FilterKeywords searches for merger-specific keywords
|
||
|
in an article and counts them.
|
||
|
'''
|
||
|
|
||
|
import re
|
||
|
|
||
|
from nltk.stem.porter import PorterStemmer
|
||
|
|
||
|
class FilterKeywords():
    '''Finds merger-related keywords in an article's word counts.

    Both methods are static: they use no instance state and can be
    called on the class itself or on an instance.
    '''

    @staticmethod
    def search_keywords(dict_input):
        '''Collect the counts of merger-related keywords from an article.

        dict_input maps words of the article (str) to their counts
        (assumed numeric, since count_keywords sums them).

        A word is attributed to a keyword stem when the word starts with
        that stem, so one word may contribute to more than one stem when
        one stem is a prefix of another.

        Returns a dict mapping each matched keyword stem to the summed
        counts of all words starting with it. Stems without any match
        are not included.
        '''
        keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers', 'acquisition',
                        'acquire', 'acquisitions', 'acquires', 'combine', 'combines',
                        'combination', 'combined', 'joint', 'venture', 'JV', 'takeover',
                        'take-over', 'tie-up', 'deal', 'deals', 'transaction', 'transactions',
                        'approve', 'approves', 'approved', 'approving', 'approval',
                        'approvals', 'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',
                        'purchase', 'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']

        # Reduce the keywords to their stems; building a set drops the
        # duplicates that stemming produces.
        stemmer = PorterStemmer()
        stems = {stemmer.stem(word) for word in keyword_list}

        # Compile each stem once, outside the article loop. re.escape makes
        # sure a stem is always matched literally, never as regex syntax.
        patterns = [(stem, re.compile(re.escape(stem))) for stem in stems]

        # Sum the counts of all article words that start with a stem.
        dict_keywords = {}
        for key, count in dict_input.items():
            for stem, pattern in patterns:
                # match() anchors at the start of the word: prefix match.
                if pattern.match(key):
                    dict_keywords[stem] = dict_keywords.get(stem, 0) + count

        return dict_keywords

    @staticmethod
    def count_keywords(dict_keywords):
        '''Return the total number of keyword occurrences.

        dict_keywords maps an article's keywords (key) to their
        counts (value); the result is the sum of all values, which is
        0 for an empty dict.
        '''
        return sum(dict_keywords.values())
|
||
|
|
||
|
|
||
|
|