from __future__ import absolute_import, print_function
import numpy as np
import pyopencl as cl
from scipy import sparse
from scipy.sparse.linalg import LinearOperator
from scipy.sparse import csr_matrix
import time


class Timer:
    '''
    Context manager for measuring wall-clock time; the elapsed time of the
    enclosed block is stored in self.interval.
    '''
    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        self.end = time.time()
        self.interval = self.end - self.start
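
# Usage sketch for Timer (illustrative only): wrap any block of code and read
# .interval afterwards, e.g.
#
#     with Timer() as t:
#         np.dot(np.ones(1000), np.ones(1000))
#     print("elapsed: {0} s".format(t.interval))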


def rand_csr(N, density=0.4):
    '''
    Generate a random N x N sparse matrix in CSR format with the specified density.
    '''
    # Create a random square sparse matrix in COO format
    coo_sparse = sparse.random(N, N, density, dtype='float32')
    # Convert to CSR format and return
    return sparse.csr_matrix(coo_sparse)
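
# CSR refresher (illustrative): the dense matrix
#     [[10,  0,  0],
#      [ 0, 20,  0],
#      [30,  0, 40]]
# is stored as data=[10, 20, 30, 40], indices=[0, 1, 0, 2], indptr=[0, 1, 2, 4];
# the non-zeros of row i are data[indptr[i]:indptr[i+1]], with column numbers
# indices[indptr[i]:indptr[i+1]]. Both kernels below rely on this layout.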


class Sparce(LinearOperator):
    '''
    Class derived from LinearOperator, initialised with a sparse matrix in
    CSR format. It overrides the _matvec method to implement the CSR
    matrix-vector multiplication algorithm, accelerated by OpenCL.
    '''
    def __init__(self, inMat):
        '''
        Initialises the operator from an input of type scipy.sparse.csr_matrix.
        The shape and dtype fields are required by LinearOperator.
        '''
        self.shape = inMat.get_shape()
        self.dtype = 'float32'
        self.data = inMat.data
        self.indices = inMat.indices
        self.indptr = inMat.indptr

    def _matvec(self, v):
        '''
        OpenCL implementation of the product of the sparse matrix with a
        vector v. Returns A*v; the length of v should match the number of
        columns of the sparse matrix.
        '''
        platform = cl.get_platforms()[0]    # Select the first platform [0]
        device = platform.get_devices()[0]  # Select the first device on this platform [0]
        ctx = cl.Context([device])
        # Create a (non-custom) command queue with profiling enabled
        queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
        mf = cl.mem_flags  # Memory flags
        # 4 compute units available on my CPU (2 cores + HT)
        # AVX2 -> 256 bit AVX --> double4
        vec = v.astype(np.float32)
        data = self.data.astype(np.float32)    # Non-zero values
        ind = self.indices.astype(np.int32)    # Column index of each non-zero
        indptr = self.indptr.astype(np.int32)  # Row pointer into data/ind
        # Create buffers
        data_buff = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
        ind_buff = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=ind)
        indptr_buff = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=indptr)
        vec_buff = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=vec)
        res_buff = cl.Buffer(ctx, mf.WRITE_ONLY, vec.nbytes)
        N = np.int32(len(indptr) - 1)  # Length of the output vector (number of rows)
        # Kernel which carries out the CSR multiplication: one work-item per row
        matvec_kernel = """
        __kernel void matvec( const int N,
                              __global const float * data,
                              __global const int * index,
                              __global const int * indptr,
                              __global const float * vec,
                              __global float * res )
        {
            int gid = get_global_id(0);
            float acc = 0.0f;
            for (int k = indptr[gid]; k < indptr[gid + 1]; k++)
            {
                acc += data[k] * vec[index[k]];
            }
            res[gid] = acc;
        } """
        prg = cl.Program(ctx, matvec_kernel).build()
        res_np = np.empty_like(vec)
        prg.matvec(queue, vec.shape, None, N,
                   data_buff, ind_buff, indptr_buff, vec_buff, res_buff).wait()
        cl.enqueue_copy(queue, res_np, res_buff).wait()
        return res_np
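
    # Reference sketch (illustrative, not used by the class): the same CSR
    # row loop in plain NumPy, handy for checking the kernel's output.
    #
    #     def _matvec_ref(self, v):
    #         out = np.zeros(len(self.indptr) - 1, dtype=np.float32)
    #         for row in range(len(out)):
    #             lo, hi = self.indptr[row], self.indptr[row + 1]
    #             out[row] = np.dot(self.data[lo:hi], v[self.indices[lo:hi]])
    #         return out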

    def _matvecsimd(self, v):
        '''
        Vectorised sparse matrix-vector product.
        Breaks the problem down into double4 vector types, carries out the
        multiplications in vector form, then accumulates the values
        corresponding to each matrix row, i.e. to each result element.
        '''
        platform = cl.get_platforms()[0]    # Select the first platform [0]
        device = platform.get_devices()[0]  # Select the first device on this platform [0]
        ctx = cl.Context([device])
        # Create a (non-custom) command queue with profiling enabled
        queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
        mf = cl.mem_flags  # Memory flags
        # 4 compute units available on my CPU (2 cores + HT)
        # AVX2 -> 256 bit AVX --> double4
        vec = v.astype(np.float32)
        data = self.data.astype(np.float32)    # Non-zero values
        ind = self.indices.astype(np.int32)    # Column index of each non-zero
        indptr = self.indptr.astype(np.int32)  # Row pointer into data/ind
        remainder = np.size(data) % 4
        # Pad data and indices so their length is divisible by 4
        if remainder != 0:
            data = np.concatenate([data, np.zeros(4 - remainder)]).astype(np.float32)
            ind = np.concatenate([ind, np.zeros(4 - remainder)]).astype(np.int32)
        iteration_len = np.size(data)
        # Create buffers
        data_buff = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
        ind_buff = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=ind)
        indptr_buff = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=indptr)
        vec_buff = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=vec)
        res_buff = cl.Buffer(ctx, mf.WRITE_ONLY, vec.nbytes)
        res_tmp_buff = cl.Buffer(ctx, mf.READ_WRITE, data.nbytes)
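        # Two kernels (description of intent): 'mult' computes the element-wise
        # products data[k] * vec[index[k]] four non-zeros at a time into resTmp;
        # 'add' then reduces resTmp over each row's indptr range into res.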
        matvec_simd_kernel = """
        __kernel void mult(__global const float * data,
                           __global const int * index,
                           __global const float * vec,
                           __global float * resTmp )
        {
            int gid = get_global_id(0);
            double4 dataSlice;
            double4 vecSlice;
            double4 resSlice;
            dataSlice = (double4)(data[gid*4+0],
                                  data[gid*4+1],
                                  data[gid*4+2],
                                  data[gid*4+3]);
            vecSlice = (double4)(vec[index[gid*4+0]],
                                 vec[index[gid*4+1]],
                                 vec[index[gid*4+2]],
                                 vec[index[gid*4+3]]);
            resSlice = dataSlice * vecSlice;
            resTmp[gid*4+0] = resSlice.s0;
            resTmp[gid*4+1] = resSlice.s1;
            resTmp[gid*4+2] = resSlice.s2;
            resTmp[gid*4+3] = resSlice.s3;
        }

        __kernel void add(__global const int* indptr,
                          __global const float* resTmp,
                          __global float* res) {
            int gid = get_global_id(0);
            float acc = 0.0f;
            for (int j = indptr[gid]; j < indptr[gid+1]; j++) {
                acc += resTmp[j];
            }
            res[gid] = acc;
        }
        """
        prg = cl.Program(ctx, matvec_simd_kernel).build()
        prg.mult(queue, (iteration_len // 4,), None,
                 data_buff, ind_buff, vec_buff, res_tmp_buff)
        prg.add(queue, (np.size(indptr) - 1,), None,
                indptr_buff, res_tmp_buff, res_buff)
        res = np.empty_like(vec)
        cl.enqueue_copy(queue, res, res_buff).wait()
        # res_buff already holds exactly np.size(v) entries; the slice is a safeguard
        # so only a vector of the original size is returned, without padding.
        return res[0:np.size(v)]
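
    # Host-side sketch of the same two stages (illustrative, NumPy only,
    # assuming every row holds at least one stored entry):
    #
    #     products = self.data * v[self.indices]             # what 'mult' does
    #     res = np.add.reduceat(products, self.indptr[:-1])  # what 'add' does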


def runCode(N):
    '''
    Generates a random N*N sparse matrix in CSR format, wraps it in a Sparce
    operator, generates a random vector of length N, and carries out the
    multiplication with both OpenCL implementations, with and without the
    SIMD optimisation, printing the time taken by each.
    '''
    vec = np.random.rand(N).astype(np.float32)
    foo = rand_csr(N, 0.10)
    bar = Sparce(foo)
    with Timer() as t1:
        a = bar._matvec(vec)
    with Timer() as t2:
        b = bar._matvecsimd(vec)
    print("time to run without SIMD: {0}".format(t1.interval))
    print("time to run with SIMD: {0}".format(t2.interval))


runCode(10000)
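
# Optional correctness check (illustrative sketch, not part of the benchmark):
# compare both OpenCL results against scipy's own CSR product on a small case.
#
#     A = rand_csr(1000, 0.1)
#     v = np.random.rand(1000).astype(np.float32)
#     op = Sparce(A)
#     assert np.allclose(op._matvec(v), A.dot(v), atol=1e-3)
#     assert np.allclose(op._matvecsimd(v), A.dot(v), atol=1e-3)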