thesis-anne/FilterKeywords.py

'''
Filter Keywords
===============

FilterKeywords searches an article for merger-specific keywords
and counts them.
'''

import re

from nltk.stem.porter import PorterStemmer

class FilterKeywords():
    '''Finds and counts merger-related keywords in an article's word counts.

    Both methods are static, so they can be called either on the class
    (``FilterKeywords.search_keywords(d)``) or on an instance.
    '''

    # Merger / acquisition vocabulary. The words are reduced to their
    # stems before matching, so several entries collapse to one stem.
    _KEYWORDS = (
        'merge', 'merges', 'merged', 'merger', 'mergers', 'acquisition',
        'acquire', 'acquisitions', 'acquires', 'combine', 'combines',
        'combination', 'combined', 'joint', 'venture', 'JV', 'takeover',
        'take-over', 'tie-up', 'deal', 'deals', 'transaction', 'transactions',
        'approve', 'approves', 'approved', 'approving', 'approval',
        'approvals', 'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',
        'purchase', 'sell', 'sells', 'selling', 'sold', 'seller', 'buyer',
    )

    @staticmethod
    def search_keywords(dict_input):
        '''Collects the counts of all keys that start with a keyword stem.

        input: dict with an article's words (key) and their count (value).
        The keys are presumably already stemmed/normalised tokens -
        TODO confirm against the caller.

        returns a dict mapping each keyword stem that matched at least one
        key to the summed counts of all keys beginning with that stem.
        Stems without a match are not contained in the result.

        NOTE(review): a key is counted once for every stem it starts with,
        so a key that begins with more than one stem contributes to
        several entries of the result.
        '''
        # reduce words to stem; the set removes duplicate stems
        stemmer = PorterStemmer()
        stems = {stemmer.stem(word) for word in FilterKeywords._KEYWORDS}

        # counts keywords in article
        dict_keywords = {}

        # search for matchings in dictionary of input article
        for key, count in dict_input.items():
            for stem in stems:
                # the stems are literal text, not patterns, so test the
                # prefix directly instead of interpreting them as regexes
                if key.startswith(stem):
                    dict_keywords[stem] = dict_keywords.get(stem, 0) + count

        return dict_keywords

    @staticmethod
    def count_keywords(dict_keywords):
        '''input: dict with article's keywords (key) and their count (value).
        returns the sum of all counts (0 for an empty dict).
        '''
        return sum(dict_keywords.values())
initial project version 2018-09-05 12:08:13 +00:00			`'''`
			`Filter Keywords`
			`===============`

			`FilterKeywords searches for merger specific keywords`
			`in an article and counts them.`
			`'''`

			`import re`

			`from nltk.stem.porter import PorterStemmer`

			`class FilterKeywords():`

			`def search_keywords(dict_input):`
			`'''extracts relevant key-value pairs of in article's input dictionary.`
			`output are the contained keywords and their count.`
			`'''`

			`keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers', 'acquisition',`
			`'acquire', 'acquisitions', 'acquires', 'combine', 'combines',`
			`'combination', 'combined', 'joint', 'venture', 'JV', 'takeover',`
			`'take-over', 'tie-up', 'deal', 'deals', 'transaction', 'transactions',`
			`'approve', 'approves', 'approved', 'approving', 'approval',`
			`'approvals', 'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',`
			`'purchase', 'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']`

			`# reduce words to stem`
			`stemmer = PorterStemmer()`
			`for i in range(len(keyword_list)):`
			`keyword_list[i] = stemmer.stem(keyword_list[i])`

			`# remove duplicates`
			`keywords = set(keyword_list)`

			`# counts keywords in article`
			`dict_keywords = {}`

			`# search for matchings in dictionary of input article`
			`for key in dict_input.keys():`
			`# iterate over all regular expressions`
			`for kword in keywords:`
			`if re.match(kword, key):`
			`# if match, increase value of matching key`
			`if str(kword) in dict_keywords:`
			`dict_keywords[str(kword)] += dict_input[key]`
			`else:`
			`dict_keywords[str(kword)] = dict_input[key]`

			`return dict_keywords`

			`def count_keywords(dict_keywords):`
			`'''input: dict with article's keywords (key) and their count (value).`
			`returns number of keywords that are found.`
			`'''`
			`return sum(dict_keywords.values())`