-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathstring_kernel.pyx
93 lines (72 loc) · 3.27 KB
/
string_kernel.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import numpy as np
cimport numpy as np
from cpython cimport array
import array
cimport cython
def ssk(s, t, int n, float lbda, accum=False):
"""s and t are strings, either numpy.str_ or python str, or a list of chars"""
s_array = array.array('l', [ord(c) for c in s])
t_array = array.array('l', [ord(c) for c in t])
return ssk_array(s_array, t_array, n, lbda, accum)
# Kernel defined by Lodhi et al. (2002)
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
def ssk_array(array.array s_, array.array t_, int n, float lbda, accum=False):
cdef int lens, lent
cdef int i, sj, tk
cdef float toret
cdef long[:] s # this reduces the overhead 10x fold!!!
cdef long[:] t
s = s_ if s_.typecode == 'l' else array.array('l', [int(c) for c in s_])
t = t_ if t_.typecode == 'l' else array.array('l', [int(c) for c in t_])
lens, lent = len(s), len(t)
#k_prim = (-1)*np.ones( (n+1, lens, lent) )
cdef np.ndarray[np.float64_t, ndim=3] \
k_prim = np.zeros((n, lens, lent), dtype=np.float)
k_prim[0,:,:] = 1
for i in range(1,n):
for sj in range(i,lens):
toret = 0.
for tk in range(i,lent):
if s[sj-1]==t[tk-1]: # trick taken from shogun implemantion of SSK
toret = lbda * (toret + lbda*k_prim[i-1,sj-1,tk-1])
else:
toret *= lbda
k_prim[i,sj,tk] = toret + lbda * k_prim[i, sj-1, tk]
cdef int start = 0 if accum else n-1
cdef float k = 0.
for i in range(n):
for sj in range(i,lens):
for tk in range(i,lent):
if s[sj]==t[tk]:
k += lbda*lbda*k_prim[i,sj,tk]
# print( [len(list(i for (sj,tk,i) in k_prim if i==m-1)) for m in range(n)] )
return k
def string_kernel(xs, ys, n, lbda):
"""xs and ys are numpy arrays of strings or arrays of ints, n an integer and lbda a bool"""
if len(xs.shape) != 2 or len(ys.shape) != 2 or xs.shape[1] != 1 or ys.shape[1] != 1:
raise "The shape of the features is wrong, it must be (n,1)"
cdef int lenxs, lenys
cdef int i, j
cdef np.ndarray[np.float64_t, ndim=2] mat, mat_xs, mat_ys
lenxs, lenys = xs.shape[0], ys.shape[0]
mat = np.zeros((lenxs, lenys))
ssk_fun = ssk_array if xs.dtype == 'O' and isinstance(xs[0,0], array.array) else ssk
# If both lists are equal, then the resulting matrix is symetric, there is no need to
# calculate the hole thing
if lenxs == lenys and np.array_equal(xs, ys):
for i in range(lenxs):
for j in range(i,lenys):
mat[j,i] = mat[i,j] = ssk_fun(xs[i,0], ys[j,0], n, lbda, accum=True)
mat_xs = mat_ys = mat.diagonal().reshape((lenxs, 1))
else:
for i in range(lenxs):
for j in range(lenys):
mat[i,j] = ssk_fun(xs[i,0], ys[j,0], n, lbda, accum=True)
mat_xs = np.zeros((lenxs, 1))
mat_ys = np.zeros((lenys, 1))
for i in range(lenxs):
mat_xs[i] = ssk_fun(xs[i,0], xs[i,0], n, lbda, accum=True)
for j in range(lenys):
mat_ys[j] = ssk_fun(ys[j,0], ys[j,0], n, lbda, accum=True)
return np.divide(mat, np.sqrt(mat_ys.T * mat_xs))