# 문서 유사도 (Document Similarity)
from numpy import dot
from numpy.linalg import norm
import numpy as np
def cos_sim(A, B):
    """Return the cosine similarity between vectors ``A`` and ``B``.

    The similarity is ``dot(A, B) / (norm(A) * norm(B))``.

    Args:
        A: First vector (NumPy array or array-like of numbers).
        B: Second vector, same length as ``A``.

    Returns:
        The cosine similarity as a float. When either vector has zero
        norm the ratio is undefined; ``0.0`` is returned in that case
        instead of dividing by zero.
    """
    denominator = norm(A) * norm(B)
    # A zero-norm vector would make the division produce nan together with
    # a RuntimeWarning, so treat it as "no similarity".
    if denominator == 0:
        return 0.0
    return dot(A, B) / denominator
# Example vectors for the cosine-similarity demo; doc3 is doc2 scaled by 2.
doc1 = np.array([0, 1, 1, 1])
doc2 = np.array([1, 0, 1, 1])
doc3 = np.array([2, 0, 2, 2])

print(cos_sim(doc1, doc2))
print(cos_sim(doc1, doc3))
print(cos_sim(doc2, doc3))
# Expected output (rounded to two decimals):
# 0.67
# 0.67
# 1.00
import numpy as np
def dist(x, y):
    """Return the Euclidean distance between ``x`` and ``y``.

    Computed as the square root of the sum of squared element-wise
    differences.

    Args:
        x: First vector (NumPy array or array-like of numbers).
        y: Second vector, broadcast-compatible with ``x``.

    Returns:
        The Euclidean distance as a NumPy scalar.
    """
    # Coerce to arrays so plain lists/tuples support element-wise subtraction.
    difference = np.asarray(x) - np.asarray(y)
    return np.sqrt(np.sum(difference ** 2))
# Example vectors doc1..doc3 and the query vector docQ they are compared to.
doc1 = np.array([2, 3, 0, 1])
doc2 = np.array([1, 2, 3, 1])
doc3 = np.array([2, 1, 2, 2])
docQ = np.array([1, 1, 0, 1])

# Print the distance from each vector to docQ, one value per line.
print(*(dist(vector, docQ) for vector in (doc1, doc2, doc3)), sep="\n")
# 2.23606797749979
# 3.1622776601683795
# 2.449489742783178
doc1 = "apple banana everyone like likey watch card holder"
doc2 = "apple banana coupon passport love you"

# Tokenize each document by splitting on whitespace.
tokenized_doc1 = doc1.split()
tokenized_doc2 = doc2.split()

# Print the tokenization results.
print(tokenized_doc1)
print(tokenized_doc2)
# ['apple', 'banana', 'everyone', 'like', 'likey', 'watch', 'card', 'holder']
# ['apple', 'banana', 'coupon', 'passport', 'love', 'you']

# Jaccard similarity: size of the intersection of the two token sets
# divided by the size of their union.
union = set(tokenized_doc1).union(tokenized_doc2)
intersection = set(tokenized_doc1).intersection(tokenized_doc2)
print(len(intersection) / len(union))
# 0.16666666666666666