"""
Cosine Similarity
=================

CosineSimilarity measures the similarity between two articles.
It calculates c: the cosine of the angle between the articles'
word-count vectors dict_1 and dict_2.

    c = (dict_1 * dict_2) / (|dict_1| * |dict_2|)

c = 1, if articles are equal => identicalness is 100%
0 <= c < 1, else             => identicalness is (c*100)%
(The greater c, the more similar two articles are.)
"""

# TODO: uses dictionaries of each article
# => TODO: has to be changed as we are now using vectors

import math

try:
    # Not used by this module itself; kept so that the name stays importable
    # from here, without making the similarity code depend on it.
    from BagOfWords import BagOfWords
except ImportError:
    BagOfWords = None


class CosineSimilarity:
    """Cosine similarity between two bag-of-words dictionaries."""

    # Bookkeeping key (total word count) that is not a real word and must
    # therefore not contribute to the word-frequency vectors.
    SUM_WORDS_KEY = 'sum_words'

    @staticmethod
    def cos_sim(dict_1, dict_2):
        """Return the cosine similarity of two word-count dictionaries.

        Args:
            dict_1: mapping word -> count for the first article.
            dict_2: mapping word -> count for the second article.

        An entry stored under ``'sum_words'`` is ignored in both mappings,
        wherever it appears.

        Returns:
            The cosine of the angle between the two count vectors, or
            ``0.0`` when at least one article has no (non-zero) word counts,
            since the angle is undefined for a zero vector.
        """
        skip = CosineSimilarity.SUM_WORDS_KEY
        counts_1 = {word: n for word, n in dict_1.items() if word != skip}
        counts_2 = {word: n for word, n in dict_2.items() if word != skip}

        # Numerator: only words present in both articles contribute, because
        # a missing word has count 0 in the other vector.
        dot_product = sum(
            n * counts_2[word] for word, n in counts_1.items()
            if word in counts_2
        )

        # Denominator: product of the Euclidean norms of both vectors.
        norm_1 = math.sqrt(sum(n ** 2 for n in counts_1.values()))
        norm_2 = math.sqrt(sum(n ** 2 for n in counts_2.values()))
        denominator = norm_1 * norm_2

        if denominator == 0:
            return 0.0
        return dot_product / denominator

    @staticmethod
    def create_vector(dict, vocab):
        """Build the word-frequency vector of an article for ``vocab``.

        Args:
            dict: mapping word -> count for the article. (The name shadows
                the builtin; it is kept so keyword callers keep working.)
            vocab: sequence of words; its first entry is treated as the
                ``'sum_words'`` bookkeeping slot and is left out.

        Returns:
            A list with one count per word in ``vocab[1:]``; words that do
            not occur in the article get 0. An empty ``vocab`` yields ``[]``.
        """
        # Skip the leading 'sum_words' slot instead of popping it afterwards,
        # so an empty vocabulary does not raise.
        return [dict[word] if word in dict else 0 for word in vocab[1:]]