thesis-anne/FilterKeywords.py

59 lines
2.2 KiB
Python
Raw Normal View History

2018-09-05 12:08:13 +00:00
'''
Filter Keywords
===============
FilterKeywords searches an article for merger-specific keywords
and counts them.
'''
import re
from nltk.stem.porter import PorterStemmer
class FilterKeywords():
    '''Namespace for the merger-keyword search helpers.

    Both helpers are static methods: they keep no state, so they can be
    called on the class itself (``FilterKeywords.search_keywords(d)``)
    or on an instance.
    '''

    @staticmethod
    def search_keywords(dict_input):
        '''Collect the merger-related keywords contained in an article.

        dict_input: dictionary describing one article; its keys are
            matched against the keyword stems and its values are summed
            (presumably word -> number of occurrences; verify against
            the caller).

        Returns a dictionary that maps each keyword stem that was found
        to the summed values of all input keys starting with that stem.
        Stems without any match are left out.

        NOTE: matching is by prefix, so one input key can be counted
        under several stems when one stem is a prefix of another.
        '''
        keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
                        'acquisition', 'acquire', 'acquisitions', 'acquires',
                        'combine', 'combines', 'combination', 'combined',
                        'joint', 'venture', 'JV', 'takeover', 'take-over',
                        'tie-up', 'deal', 'deals', 'transaction',
                        'transactions', 'approve', 'approves', 'approved',
                        'approving', 'approval', 'approvals', 'buy', 'buys',
                        'buying', 'bought', 'buyout', 'buy-out', 'purchase',
                        'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']

        # reduce the words to their stems; the set removes the duplicates
        # that appear once several word forms collapse to the same stem
        stemmer = PorterStemmer()
        keywords = {stemmer.stem(word) for word in keyword_list}

        # stem -> accumulated count over all matching keys of the article
        dict_keywords = {}
        for key, count in dict_input.items():
            for kword in keywords:
                # literal prefix test: the stems are plain text, not
                # regular expressions, so no pattern matching is needed
                if key.startswith(kword):
                    dict_keywords[kword] = dict_keywords.get(kword, 0) + count
        return dict_keywords

    @staticmethod
    def count_keywords(dict_keywords):
        '''Return the total number of keyword hits.

        dict_keywords: dictionary with an article's keywords (key) and
            their count (value), as returned by search_keywords.

        Returns the sum of all values; 0 for an empty dictionary.
        '''
        return sum(dict_keywords.values())