# thesis-anne/CosineSimilarity.py

'''
Cosine Similarity
=================

CosineSimilarity measures the similarity between two articles.
It calculates c: the cosine of the angle between the articles'
vectors dict_1 and dict_2.
c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
c = 1, if articles are equal => identicalness is 100%
0 <= c < 1, else => identicalness is (c*100)%
(The greater c, the more similar two articles are.)
'''

# TODO: this still uses the dictionaries of each article;
#       it has to be changed, as we are now using vectors.

import math

from BagOfWords import BagOfWords

class CosineSimilarity:
    '''Cosine similarity of two articles given as word-count dictionaries.

    The class holds no state; both methods are static helpers and are
    called through the class, e.g. ``CosineSimilarity.cos_sim(d1, d2)``.
    '''

    @staticmethod
    def cos_sim(dict_1, dict_2):
        '''Return the cosine of the angle between two articles' vectors.

        Parameters
        ----------
        dict_1, dict_2
            Mappings from word to a numeric count. The first entry of the
            combined vocabulary (i.e. the first key of dict_1, or of
            dict_2 when dict_1 is empty) is treated as a bookkeeping
            entry and is excluded from the comparison.
            NOTE(review): the original comments name this entry
            'sum_words' -- confirm against the code that builds the
            dictionaries.

        Returns
        -------
        float
            dot(v1, v2) / (|v1| * |v2|). Returns 0.0 when either vector
            has zero length (no words, or all counts zero), because the
            cosine is undefined there.
        '''
        # Ordered, de-duplicated vocabulary: keys of dict_1 first, then
        # the keys that only occur in dict_2. dict.fromkeys keeps the
        # insertion order and avoids a quadratic "not in list" scan.
        vocab = list(dict.fromkeys(list(dict_1) + list(dict_2)))

        # create_vector already skips the first vocabulary entry (the
        # bookkeeping key), so the full vocabulary is passed on here.
        # Removing that entry in both places dropped a real word.
        vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
        vector_2 = CosineSimilarity.create_vector(dict_2, vocab)

        # numerator of the formula: dot product
        dot_product = sum(a * b for a, b in zip(vector_1, vector_2))

        # denominator of the formula: product of the Euclidean norms
        norm_1 = math.sqrt(sum(entry ** 2 for entry in vector_1))
        norm_2 = math.sqrt(sum(entry ** 2 for entry in vector_2))

        # Guard against division by zero for empty / all-zero articles.
        if norm_1 == 0 or norm_2 == 0:
            return 0.0

        return dot_product / (norm_1 * norm_2)

    @staticmethod
    def create_vector(dict, vocab):
        '''Return the word frequency vector of one article.

        Parameters
        ----------
        dict
            Mapping from word to count for a single article. (The name
            shadows the builtin; it is kept so keyword callers still work.)
        vocab
            Ordered list of words. Its first entry is skipped -- per the
            original comments it is the 'sum_words' bookkeeping key.

        Returns
        -------
        list
            One count per word in ``vocab[1:]``, with 0 for words that do
            not occur in the article. An empty vocab yields an empty list.
        '''
        # insert the word count, or zero if the word does not occur
        vector = [dict.get(word, 0) for word in vocab]
        # Drop the first entry; slicing (unlike pop) tolerates an empty
        # vocabulary.
        return vector[1:]