thesis-anne/CosineSimilarity.py

77 lines
2.0 KiB
Python
Raw Normal View History

2018-09-07 12:16:47 +00:00
'''
Cosine Similarity
=================
CosineSimilarity measures the similarity between two articles.
2018-09-17 12:47:50 +00:00
It calculates c: the cosine of the angle between the articles'
2018-09-07 12:16:47 +00:00
vectors dict_1 and dict_2.
2018-09-17 12:47:50 +00:00
c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
c = 1, if articles are equal => identicalness is 100%
2018-09-07 12:16:47 +00:00
0 <= c < 1, otherwise => identicalness is (c*100)%
(The greater c, the more similar two articles are.)
'''
2018-09-17 12:47:50 +00:00
#TODO:uses dictionaries of each article
#=>ToDo:has to be changed as we are now using vectors
2018-09-07 12:16:47 +00:00
import math
from BagOfWords import BagOfWords
class CosineSimilarity:
    '''
    Cosine similarity of two articles given as word-count dictionaries.

    Both methods are stateless and are meant to be called on the class
    itself, e.g. CosineSimilarity.cos_sim(dict_1, dict_2).
    '''

    @staticmethod
    def cos_sim(dict_1, dict_2):
        '''
        Return the cosine of the angle between two word-count dictionaries.

        Parameters:
            dict_1: mapping word -> count for the first article.
            dict_2: mapping word -> count for the second article.

        The first key of the combined vocabulary is treated as a
        bookkeeping entry and is left out of the calculation (the original
        comments name it 'sum_words' -- this relies on it being the first
        key of dict_1; confirm against the code that builds the dicts).

        Returns:
            c = (v_1 * v_2) / (|v_1| * |v_2|) as a float. 0.0 is returned
            when either vector has zero length (e.g. an empty article),
            because the cosine is undefined there.
        '''
        # Ordered union of the words of both articles. dict.fromkeys keeps
        # first-seen order and removes duplicates in linear time.
        vocab = list(dict.fromkeys([*dict_1, *dict_2]))

        # create_vector drops the leading bookkeeping entry, so it must NOT
        # be removed from vocab here as well: doing both would silently
        # discard the first real word from the comparison.
        vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
        vector_2 = CosineSimilarity.create_vector(dict_2, vocab)

        # numerator of the formula: dot product of both vectors
        dot_product = sum(a * b for a, b in zip(vector_1, vector_2))

        # denominator of the formula: product of the Euclidean norms
        norm_1 = math.sqrt(sum(a ** 2 for a in vector_1))
        norm_2 = math.sqrt(sum(b ** 2 for b in vector_2))
        denominator = norm_1 * norm_2

        # a zero vector has no direction; avoid a ZeroDivisionError
        if denominator == 0:
            return 0.0

        return dot_product / denominator

    @staticmethod
    def create_vector(dict, vocab):
        '''
        Build the word frequency vector of one article.

        Parameters:
            dict: mapping word -> count of the article (the name shadows
                the builtin inside this method; it is kept so existing
                callers are not affected).
            vocab: iterable of words; its first entry is expected to be
                the bookkeeping entry ('sum_words' per the original
                comments).

        Returns:
            A list with one count per word in vocab (0 if the word does
            not occur in the article), without the first entry. An empty
            vocab yields an empty list.
        '''
        # word frequency vector: count of each word, zero if absent
        vector = [dict.get(word, 0) for word in vocab]
        # delete first entry ('sum_words'); slicing is safe on an empty list
        return vector[1:]