-
Notifications
You must be signed in to change notification settings - Fork 0
/
dist.py
134 lines (110 loc) · 4.19 KB
/
dist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Authors: Christian Thurau
# License: BSD 3 Clause
"""
PyMF several distance functions
kl_divergence(): KL Divergence
l1_distance(): L1 distance
l2_distance(): L2 distance
cosine_distance(): Cosine distance
pdist(): Pairwise distance computation
vq(): Vector quantization
"""
import numpy as np
import scipy.sparse
__all__ = ["abs_cosine_distance", "kl_divergence", "l1_distance", "l2_distance",
"weighted_abs_cosine_distance","cosine_distance","vq", "pdist"]
def kl_divergence(d, vec):
"""
"""
b = vec*(1/d)
b = np.where(b>0, np.log(b),0)
b = vec * b
b = np.sum(b - vec + d, axis=0).reshape((-1))
return b
def l1_distance(d, vec):
ret_val = np.sum(np.abs(d - vec), axis=0)
ret_val = ret_val.reshape((-1))
return ret_val
def sparse_l2_distance(d, vec):
# compute the norm of d
nd = (d.multiply(d)).sum(axis=0)
nv = (vec.multiply(vec)).sum(axis=0)
ret_val = nd + nv - 2.0*(d.T * vec).T
return np.sqrt(ret_val)
def approx_l2_distance(d, vec):
# Use random projections to approximate the conventional l2 distance
k = np.round(np.log(d.shape[0]))
#k = d.shape[0]
R = np.random.randn(k, d.shape[0])
R = R / np.sqrt((R**2).sum(axis=0))
A = np.dot(R,d)
B = np.dot(R, vec)
ret_val = np.sum( (A - B)**2, axis=0)
ret_val = np.sqrt(R.shape[1]/R.shape[0]) * np.sqrt(ret_val)
ret_val = ret_val.reshape((-1))
return ret_val
def l2_distance(d, vec):
if scipy.sparse.issparse(d):
ret_val = sparse_l2_distance(d, vec)
else:
ret_val = np.sqrt(((d[:,:] - vec)**2.0).sum(axis=0))
return ret_val.reshape((-1))
def l2_distance_new(d,vec):
# compute the norm of d
nd = (d**2).sum(axis=0)
nv = (vec**2).sum(axis=0)
ret_val = nd + nv - 2.0*np.dot(d.T,vec.reshape((-1,1))).T
return np.sqrt(ret_val)
def cosine_distance(d, vec):
tmp = np.dot(np.transpose(d), vec)
a = np.sqrt(np.sum(d**2, axis=0))
b = np.sqrt(np.sum(vec**2))
k = (a*b).reshape(-1) + (10**-9)
# compute distance
ret_val = 1.0 - tmp/k
return ret_val.reshape((-1))
def abs_cosine_distance(d, vec, weighted=False):
if scipy.sparse.issparse(d):
tmp = np.array((d.T * vec).todense(), dtype=np.float32).reshape(-1)
a = np.sqrt(np.array(d.multiply(d).sum(axis=0), dtype=np.float32).reshape(-1))
b = np.sqrt(np.array(vec.multiply(vec).sum(axis=0), dtype=np.float32).reshape(-1))
else:
tmp = np.dot(np.transpose(d), vec).reshape(-1)
a = np.sqrt(np.sum(d**2, axis=0)).reshape(-1)
b = np.sqrt(np.sum(vec**2)).reshape(-1)
k = (a*b).reshape(-1) + 10**-9
# compute distance
ret_val = 1.0 - np.abs(tmp/k)
if weighted:
ret_val = ret_val * a
return ret_val.reshape((-1))
def weighted_abs_cosine_distance(d, vec):
ret_val = abs_cosine_distance(d, vec, weighted=True)
return ret_val
def pdist(A, B, metric='l2' ):
# compute pairwise distance between a data matrix A (d x n) and B (d x m).
# Returns a distance matrix d (n x m).
d = np.zeros((A.shape[1], B.shape[1]))
if A.shape[1] <= B.shape[1]:
for aidx in range(A.shape[1]):
if metric == 'l2':
d[aidx:aidx+1,:] = l2_distance(B[:,:], A[:,aidx:aidx+1]).reshape((1,-1))
if metric == 'l1':
d[aidx:aidx+1,:] = l1_distance(B[:,:], A[:,aidx:aidx+1]).reshape((1,-1))
else:
for bidx in range(B.shape[1]):
if metric == 'l2':
d[:, bidx:bidx+1] = l2_distance(A[:,:], B[:,bidx:bidx+1]).reshape((-1,1))
if metric == 'l1':
d[:, bidx:bidx+1] = l1_distance(A[:,:], B[:,bidx:bidx+1]).reshape((-1,1))
return d
def vq(A, B, metric='l2'):
# assigns data samples in B to cluster centers A and
# returns an index list [assume n column vectors, d x n]
assigned = np.argmin(pdist(A,B, metric=metric), axis=0)
return assigned
def _test():
import doctest
doctest.testmod()
if __name__ == "__main__":
_test()