thesis-anne/FilterKeywords.py

'''
Filter Keywords
===============

FilterKeywords searches an article's word counts for merger-specific
keywords and counts their occurrences.
'''

# TODO: change the dict!

import re

from nltk.stem.porter import PorterStemmer

class FilterKeywords:
    '''Find and count merger-specific keywords in an article's word counts.

    Both methods are static, so they can be called on the class itself
    (``FilterKeywords.search_keywords(d)``) as well as on an instance.
    '''

    # Raw keywords; search_keywords reduces them to their stems and removes
    # duplicates before matching, so inflected forms may be listed freely.
    KEYWORDS = (
        'merge', 'merges', 'merged', 'merger', 'mergers',
        'acquisition', 'acquire', 'acquisitions', 'acquires',
        'combine', 'combines', 'combination', 'combined',
        'joint', 'venture', 'JV', 'takeover', 'take-over',
        'tie-up', 'deal', 'deals', 'transaction',
        'transactions', 'approve', 'approves', 'approved',
        'approving', 'approval', 'approvals', 'buy', 'buys',
        'buying', 'bought', 'buyout', 'buy-out', 'purchase',
        'sell', 'sells', 'selling', 'sold', 'seller', 'buyer',
    )

    @staticmethod
    def search_keywords(dict_input):
        '''Extract the merger-specific keywords contained in an article.

        input: dict that maps each word of the article (key) to its
        count (value).
        returns: dict that maps every keyword stem found (key) to the
        summed count of all input words beginning with that stem (value).

        NOTE: matching is a prefix test, so an input word that begins
        with several stems is counted once under each of them.
        '''
        # reduce words to stem; the set removes duplicate stems
        stemmer = PorterStemmer()
        stems = {stemmer.stem(word) for word in FilterKeywords.KEYWORDS}

        # counts keywords in article
        dict_keywords = {}

        # search for matchings in dictionary of input article
        for key, count in dict_input.items():
            for stem in stems:
                # Literal prefix test: stems are plain text, not regular
                # expressions, so no pattern syntax is interpreted here.
                if key.startswith(stem):
                    dict_keywords[stem] = dict_keywords.get(stem, 0) + count

        return dict_keywords

    @staticmethod
    def count_keywords(dict_keywords):
        '''Return the total number of keyword occurrences.

        input: dict with article's keywords (key) and their count (value).
        returns: sum of all counts; 0 for an empty dict.
        '''
        return sum(dict_keywords.values())
initial project version 2018-09-05 12:08:13 +00:00			`'''`
			`Filter Keywords`
			`===============`

callable scripts 2018-09-17 19:16:19 +00:00			`FilterKeywords searches for merger specific keywords`
initial project version 2018-09-05 12:08:13 +00:00			`in an article and counts them.`
			`'''`

callable scripts 2018-09-17 19:16:19 +00:00			`# toDo: dict ändern!`

initial project version 2018-09-05 12:08:13 +00:00			`import re`

			`from nltk.stem.porter import PorterStemmer`

added .gitignore file 2018-09-10 08:38:24 +00:00			`class FilterKeywords:`
callable scripts 2018-09-17 19:16:19 +00:00
initial project version 2018-09-05 12:08:13 +00:00			`def search_keywords(dict_input):`
added .gitignore file 2018-09-10 08:38:24 +00:00			`'''extracts relevant key-value pairs of in article's input dictionary,`
initial project version 2018-09-05 12:08:13 +00:00			`output are the contained keywords and their count.`
callable scripts 2018-09-17 19:16:19 +00:00			`'''`

added .gitignore file 2018-09-10 08:38:24 +00:00			`# # list of regular expressions that match merger specific keywords`
callable scripts 2018-09-17 19:16:19 +00:00			`# regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',`
			`# r'business combinations?', r'combined compan(y\|ies)',`
added .gitignore file 2018-09-10 08:38:24 +00:00			`# r'(joint venture\|JV)s?', r'take[ -]?overs?', r'tie-up',`
			`# r'deals?', r'transactions?', r'approv(e\|ing\|al\|ed)s?',`
callable scripts 2018-09-17 19:16:19 +00:00			`# r'(buy(s\|ers?\|ing)?\|bought)', r'buy[ -]?outs?',`
			`# r'purchase', r'(sell(s\|ers?\|ing)?\|sold)']`

			`keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',`
			`'acquisition', 'acquire', 'acquisitions', 'acquires',`
			`'combine', 'combines', 'combination', 'combined',`
			`'joint', 'venture', 'JV', 'takeover', 'take-over',`
			`'tie-up', 'deal', 'deals', 'transaction',`
			`'transactions', 'approve', 'approves', 'approved',`
			`'approving', 'approval', 'approvals', 'buy', 'buys',`
			`'buying', 'bought', 'buyout', 'buy-out', 'purchase',`
added .gitignore file 2018-09-10 08:38:24 +00:00			`'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']`
callable scripts 2018-09-17 19:16:19 +00:00
initial project version 2018-09-05 12:08:13 +00:00			`# reduce words to stem`
			`stemmer = PorterStemmer()`
			`for i in range(len(keyword_list)):`
callable scripts 2018-09-17 19:16:19 +00:00			`keyword_list[i] = stemmer.stem(keyword_list[i])`

initial project version 2018-09-05 12:08:13 +00:00			`# remove duplicates`
			`keywords = set(keyword_list)`
callable scripts 2018-09-17 19:16:19 +00:00
initial project version 2018-09-05 12:08:13 +00:00			`# counts keywords in article`
			`dict_keywords = {}`
callable scripts 2018-09-17 19:16:19 +00:00
initial project version 2018-09-05 12:08:13 +00:00			`# search for matchings in dictionary of input article`
			`for key in dict_input.keys():`
			`# iterate over all regular expressions`
			`for kword in keywords:`
callable scripts 2018-09-17 19:16:19 +00:00			`if re.match(kword, key):`
initial project version 2018-09-05 12:08:13 +00:00			`# if match, increase value of matching key`
			`if str(kword) in dict_keywords:`
			`dict_keywords[str(kword)] += dict_input[key]`
			`else:`
			`dict_keywords[str(kword)] = dict_input[key]`
callable scripts 2018-09-17 19:16:19 +00:00
initial project version 2018-09-05 12:08:13 +00:00			`return dict_keywords`
callable scripts 2018-09-17 19:16:19 +00:00
initial project version 2018-09-05 12:08:13 +00:00			`def count_keywords(dict_keywords):`
added .gitignore file 2018-09-10 08:38:24 +00:00			`'''input: dict with article's keywords (key) and their count (value),`
initial project version 2018-09-05 12:08:13 +00:00			`returns number of keywords that are found.`
			`'''`
callable scripts 2018-09-17 19:16:19 +00:00			`return sum(dict_keywords.values())`