thesis-anne/src/FilterKeywords.py

'''
Filter Keywords
===============

FilterKeywords searches for merger-specific keywords
in an article and counts them.
'''

# TODO: replace dict with vector/matrix

from collections import defaultdict
import re

from nltk.stem.porter import PorterStemmer

class FilterKeywords:
    '''Namespace for the merger keyword search; holds no instance state.'''

    @staticmethod
    def search_keywords(dict_input):
        '''Count the merger-specific keywords contained in an article.

        Every entry of the keyword list below is reduced to its Porter stem
        and duplicate stems are dropped. A key of ``dict_input`` counts for
        a stem when the key starts with that stem; the key's value is then
        added to that stem's total. Keys are compared as-is: no stemming or
        case folding is applied to them here.

        Args:
            dict_input: mapping of word (str) to its count in one article.

        Returns:
            A ``defaultdict(int)`` mapping each matched keyword stem to the
            summed values of all matching keys. Stems without any match are
            not present; an empty input yields an empty mapping.
        '''

        keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
                        'acquisition', 'acquire', 'acquisitions', 'acquires',
                        'combine', 'combines', 'combination', 'combined',
                        'joint', 'venture', 'JV', 'takeover', 'take-over',
                        'tie-up', 'deal', 'deals', 'transaction',
                        'transactions', 'approve', 'approves', 'approved',
                        'approving', 'approval', 'approvals', 'buy', 'buys',
                        'buying', 'bought', 'buyout', 'buy-out', 'purchase',
                        'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']

        # reduce words to their stem; the set removes duplicate stems
        stemmer = PorterStemmer()
        keywords = {stemmer.stem(word) for word in keyword_list}

        # counts keywords in article (default value: 0)
        dict_keywords = defaultdict(int)

        # search for matches in the dictionary of the input article
        for key, count in dict_input.items():
            for kword in keywords:
                # literal prefix match: a stem is a prefix of its inflected
                # forms, so no regular expression is needed
                if key.startswith(kword):
                    dict_keywords[kword] += count

        return dict_keywords

if __name__ == '__main__':
    # Manual smoke run: feed a tiny hand-made word-count dictionary to the
    # keyword search and print whatever it returns.
    dict_test = {'example': 2, 'combined': 5, 'sells': 3}
    print(FilterKeywords.search_keywords(dict_test))