2018-09-05 12:08:13 +00:00
|
|
|
'''
|
|
|
|
Filter Keywords
|
|
|
|
===============
|
|
|
|
|
2018-09-17 19:16:19 +00:00
|
|
|
FilterKeywords searches for merger-specific keywords
|
2018-09-05 12:08:13 +00:00
|
|
|
in an article and counts them.
|
|
|
|
'''
|
|
|
|
|
2018-09-17 19:16:19 +00:00
|
|
|
# TODO: change the dict!
|
|
|
|
|
2018-09-05 12:08:13 +00:00
|
|
|
import re
|
|
|
|
|
|
|
|
from nltk.stem.porter import PorterStemmer
|
|
|
|
|
2018-09-10 08:38:24 +00:00
|
|
|
class FilterKeywords:
    '''Find and count merger-specific keywords in an article's word counts.

    Both methods are static: they can be called on the class itself
    (``FilterKeywords.search_keywords(d)``) or on an instance.
    '''

    # Merger-related vocabulary. Every entry is reduced to its Porter stem
    # before matching, and duplicate stems are dropped.
    _KEYWORDS = (
        'merge', 'merges', 'merged', 'merger', 'mergers',
        'acquisition', 'acquire', 'acquisitions', 'acquires',
        'combine', 'combines', 'combination', 'combined',
        'joint', 'venture', 'JV', 'takeover', 'take-over',
        'tie-up', 'deal', 'deals', 'transaction',
        'transactions', 'approve', 'approves', 'approved',
        'approving', 'approval', 'approvals', 'buy', 'buys',
        'buying', 'bought', 'buyout', 'buy-out', 'purchase',
        'sell', 'sells', 'selling', 'sold', 'seller', 'buyer',
    )

    @staticmethod
    def search_keywords(dict_input):
        '''Extract the merger keywords contained in an article's dictionary.

        dict_input: dict whose keys are strings and whose values are the
            counts added up per keyword (presumably word -> number of
            occurrences in the article; verify against the caller).

        Returns a dict that maps each keyword stem to the summed values of
        all keys of ``dict_input`` that start with that stem. Stems without
        any matching key are not contained in the result.
        '''
        stemmer = PorterStemmer()

        # reduce words to stem; dict.fromkeys removes duplicates while
        # keeping a stable, reproducible order (a set would not)
        stems = dict.fromkeys(
            stemmer.stem(word) for word in FilterKeywords._KEYWORDS)

        # The stems are literal text, not regular expressions, so they are
        # escaped. Compiling once keeps that work out of the nested loop.
        patterns = [(stem, re.compile(re.escape(stem))) for stem in stems]

        # counts keywords in article
        dict_keywords = {}

        # search for matchings in dictionary of input article
        for key, count in dict_input.items():
            for stem, pattern in patterns:
                # match() anchors at the start of the key only, so this is
                # a prefix test.
                # NOTE(review): a key that starts with more than one stem
                # is counted once per stem - confirm this is intended.
                if pattern.match(key):
                    # if match, increase value of matching stem
                    if stem in dict_keywords:
                        dict_keywords[stem] += count
                    else:
                        dict_keywords[stem] = count

        return dict_keywords

    @staticmethod
    def count_keywords(dict_keywords):
        '''Return the total number of keywords that were found.

        dict_keywords: dict with an article's keywords (key) and their
            count (value), e.g. the result of ``search_keywords``.

        Returns the sum of all values; 0 for an empty dict.
        '''
        return sum(dict_keywords.values())