thesis-anne/CosineSimilarity.py

77 lines
2.0 KiB
Python

'''
Cosine Similarity
=================
CosineSimilarity measures the similarity between two articles.
It calculates c: the cosine of the angle between the articles'
vectors dict_1 and dict_2.
c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
c = 1, if articles are equal => identicalness is 100%
0 <= c < 1, else => identicalness is (c*100)%
(The greater c, the more similar two articles are.)
'''
# TODO: this module still works on per-article word dictionaries;
# it has to be changed because we are now using vectors.
import math
from BagOfWords import BagOfWords
class CosineSimilarity:
    """Cosine similarity between two articles given as word-count mappings.

    Each article is a mapping ``word -> count``. It may additionally carry
    a bookkeeping entry (see ``_SUM_KEY``) that is not a word and is
    excluded from the calculation.

    All methods are static; call them on the class, e.g.
    ``CosineSimilarity.cos_sim(dict_1, dict_2)``.
    """

    # Key of the bookkeeping entry that must not be treated as a word.
    # NOTE(review): the name is taken from the original inline comments
    # ('sum_words') -- confirm against the code that builds the dicts.
    _SUM_KEY = 'sum_words'

    @staticmethod
    def cos_sim(dict_1, dict_2):
        """Return the cosine of the angle between two word-count mappings.

        Args:
            dict_1: word -> count mapping of the first article.
            dict_2: word -> count mapping of the second article.

        Returns:
            ``(v1 . v2) / (|v1| * |v2|)`` over the joint vocabulary of both
            articles. ``0.0`` is returned when either article has no
            non-zero word count, because the cosine is undefined there.
        """
        # Joint vocabulary without duplicates (dict keys give O(1) merging
        # instead of repeated list scans). The bookkeeping entry is removed
        # by name, so it is dropped exactly once and regardless of where
        # it sits in either mapping.
        vocab = [
            word
            for word in {**dict_1, **dict_2}
            if word != CosineSimilarity._SUM_KEY
        ]

        vector_1 = CosineSimilarity._count_vector(dict_1, vocab)
        vector_2 = CosineSimilarity._count_vector(dict_2, vocab)

        # numerator: dot product of both vectors
        dot_product = sum(a * b for a, b in zip(vector_1, vector_2))

        # denominator: product of the Euclidean norms (kept squared here)
        norm_sq_1 = sum(a * a for a in vector_1)
        norm_sq_2 = sum(b * b for b in vector_2)

        # A zero vector has no direction; avoid dividing by zero.
        if not norm_sq_1 or not norm_sq_2:
            return 0.0

        # One sqrt over the product instead of two separate roots keeps
        # identical articles at exactly 1.0 for integer counts.
        return dot_product / math.sqrt(norm_sq_1 * norm_sq_2)

    @staticmethod
    def create_vector(dict, vocab):
        """Return the word-frequency vector of an article, minus slot 0.

        The first entry of ``vocab`` is treated as the bookkeeping slot
        and is dropped from the result, so the returned list has
        ``len(vocab) - 1`` entries (an empty ``vocab`` yields ``[]``).

        Args:
            dict: word -> count mapping of the article. The name shadows
                the builtin; it is kept so keyword callers keep working.
            vocab: sequence of words whose first entry is the slot to drop.

        Returns:
            List with the count of every word in ``vocab[1:]``; 0 for
            words that do not occur in the article.
        """
        return CosineSimilarity._count_vector(dict, vocab)[1:]

    @staticmethod
    def _count_vector(counts, vocab):
        """Return the count of every word in ``vocab`` (0 if missing)."""
        return [counts[word] if word in counts else 0 for word in vocab]