2018-09-07 12:16:47 +00:00
|
|
|
'''
|
|
|
|
Cosine Similarity
|
|
|
|
=================
|
|
|
|
|
|
|
|
CosineSimilarity measures the similarity between two articles.
|
2018-09-17 12:47:50 +00:00
|
|
|
It calculates c: the cosine of the angle between the articles'
|
2018-09-07 12:16:47 +00:00
|
|
|
vectors dict_1 and dict_2.
|
2018-09-17 12:47:50 +00:00
|
|
|
c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
|
|
|
|
c = 1, if articles are equal => identicalness is 100%
|
2018-09-07 12:16:47 +00:00
|
|
|
0 <= c < 1, else => identicalness is (c*100)%
|
|
|
|
(The greater c, the more similar two articles are.)
|
|
|
|
'''
|
|
|
|
|
2018-09-17 12:47:50 +00:00
|
|
|
#TODO:uses dictionaries of each article
|
|
|
|
#=>ToDo:has to be changed as we are now using vectors
|
2018-09-07 12:16:47 +00:00
|
|
|
|
|
|
|
import math
|
|
|
|
|
|
|
|
from BagOfWords import BagOfWords
|
|
|
|
|
|
|
|
class CosineSimilarity:
    """Cosine similarity between two articles given as word-count mappings.

    Both methods are static: they use no instance state and are meant to be
    called on the class, e.g. ``CosineSimilarity.cos_sim(d1, d2)``.

    NOTE(review): per the original comments, the first key of an article
    dictionary is expected to be the 'sum_words' bookkeeping entry rather
    than a real word. That entry is dropped by position, not by name;
    confirm against the producer of these dictionaries.
    """

    @staticmethod
    def cos_sim(dict_1, dict_2):
        """Return the cosine of the angle between two articles' word vectors.

        Args:
            dict_1: mapping word -> count for the first article.
            dict_2: mapping word -> count for the second article.

        Returns:
            ``dot(v1, v2) / (|v1| * |v2|)`` where v1 and v2 are the count
            vectors of both articles over their joint vocabulary. Returns
            0.0 when either vector has zero length (for example an empty
            article), instead of dividing by zero.
        """
        # Joint vocabulary in first-seen order. The set gives O(1)
        # membership tests, the list keeps the order.
        vocab = []
        seen = set()
        for article in (dict_1, dict_2):
            for key in article:
                if key not in seen:
                    seen.add(key)
                    vocab.append(key)

        # create_vector() already discards the leading entry. Do NOT drop
        # it from vocab here as well, otherwise the first real word would
        # be removed from both vectors.
        vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
        vector_2 = CosineSimilarity.create_vector(dict_2, vocab)

        # numerator: dot product of both vectors
        dot_product = sum(a * b for a, b in zip(vector_1, vector_2))

        # denominator: product of the Euclidean norms
        norm_1 = math.sqrt(sum(a ** 2 for a in vector_1))
        norm_2 = math.sqrt(sum(b ** 2 for b in vector_2))
        denominator = norm_1 * norm_2

        if denominator == 0:
            # The angle is undefined for a zero vector; report "no similarity".
            return 0.0
        return dot_product / denominator

    @staticmethod
    def create_vector(dict, vocab):
        """Return the word-frequency vector of one article over ``vocab``.

        Args:
            dict: mapping word -> count for the article. The name shadows
                the builtin but is kept so keyword callers keep working.
            vocab: iterable of words; its first entry is skipped.

        Returns:
            A list with one count per vocabulary word after the first,
            using 0 for words that do not occur in the article. An empty
            vocabulary yields an empty list.
        """
        vector = [dict[word] if word in dict else 0 for word in vocab]
        # Drop the first entry by slicing so an empty vocabulary does not raise.
        return vector[1:]
|