forked from UoB-HPC/miniBUDE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bude_cuda.cu
296 lines (250 loc) · 8.83 KB
/
bude_cuda.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
/**
* BUDE CUDA kernel file
**/
#include <float.h>
#include <stdio.h>
#include "shared.h"
// Numeric constants
#define ZERO 0.0f
#define QUARTER 0.25f
#define HALF 0.5f
#define ONE 1.0f
#define TWO 2.0f
#define FOUR 4.0f
#define CNSTNT 45.0f
#define HBTYPE_F 70
#define HBTYPE_E 69
// The data structure for one atom - 16 bytes
typedef struct
{
float x, y, z, w;
} Transform;
#define HARDNESS 38.0f
#define NPNPDIST 5.5f
#define NPPDIST 1.0f
__global__ void fasten_main(const int natlig,
const int natpro,
const Atom* protein_molecule,
const Atom* ligand_molecule,
const float* transforms_0,
const float* transforms_1,
const float* transforms_2,
const float* transforms_3,
const float* transforms_4,
const float* transforms_5,
float* etotals,
const FFParams* forcefield,
const int num_atom_types,
const int numTransforms);
extern "C"
void runCUDA(float* results)
{
printf("\nRunning CUDA\n");
cudaSetDevice(0);
cudaMalloc((void**)&_cuda.d_protein, params.natpro*sizeof(Atom));
cudaDeviceSynchronize();
cudaMemcpy(_cuda.d_protein, params.protein, params.natpro*sizeof(Atom), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
cudaMalloc((void**)&_cuda.d_ligand, params.natlig*sizeof(Atom));
cudaDeviceSynchronize();
cudaMemcpy(_cuda.d_ligand, params.ligand, params.natlig*sizeof(Atom), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
cudaMalloc((void**)&_cuda.d_forcefield, params.ntypes*sizeof(FFParams));
cudaDeviceSynchronize();
cudaMemcpy(_cuda.d_forcefield, params.forcefield, params.ntypes*sizeof(FFParams), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
cudaMalloc((void**)&_cuda.d_results, params.nposes*sizeof(float));
cudaDeviceSynchronize();
for(int ii = 0; ii < 6; ++ii)
{
cudaMalloc((void**)&_cuda.d_poses[ii], params.nposes*sizeof(float));
cudaDeviceSynchronize();
cudaMemcpy(_cuda.d_poses[ii], params.poses[ii], params.nposes*sizeof(float), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
}
size_t global = ceil(params.nposes/(double)_cuda.posesPerWI);
global = ceil(global/(double)_cuda.wgsize);
size_t local = _cuda.wgsize;
size_t shared = params.ntypes * sizeof(FFParams);
cudaDeviceSynchronize();
double start = getTimestamp();
for(int ii = 0; ii < params.iterations; ++ii)
{
fasten_main<<<global, local, shared>>>(
params.natlig,
params.natpro,
_cuda.d_protein,
_cuda.d_ligand,
_cuda.d_poses[0],
_cuda.d_poses[1],
_cuda.d_poses[2],
_cuda.d_poses[3],
_cuda.d_poses[4],
_cuda.d_poses[5],
_cuda.d_results,
_cuda.d_forcefield,
params.ntypes,
params.nposes);
}
cudaDeviceSynchronize();
double end = getTimestamp();
cudaMemcpy(results, _cuda.d_results, params.nposes*sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printTimings(start, end, _cuda.posesPerWI);
}
__device__ void compute_transformation_matrix(const float transform_0,
const float transform_1,
const float transform_2,
const float transform_3,
const float transform_4,
const float transform_5,
Transform* transform)
{
const float sx = sin(transform_0);
const float cx = cos(transform_0);
const float sy = sin(transform_1);
const float cy = cos(transform_1);
const float sz = sin(transform_2);
const float cz = cos(transform_2);
transform[0].x = cy*cz;
transform[0].y = sx*sy*cz - cx*sz;
transform[0].z = cx*sy*cz + sx*sz;
transform[0].w = transform_3;
transform[1].x = cy*sz;
transform[1].y = sx*sy*sz + cx*cz;
transform[1].z = cx*sy*sz - sx*cz;
transform[1].w = transform_4;
transform[2].x = -sy;
transform[2].y = sx*cy;
transform[2].z = cx*cy;
transform[2].w = transform_5;
}
__global__ void fasten_main(const int natlig,
const int natpro,
const Atom* __restrict protein_molecule,
const Atom* __restrict ligand_molecule,
const float* __restrict transforms_0,
const float* __restrict transforms_1,
const float* __restrict transforms_2,
const float* __restrict transforms_3,
const float* __restrict transforms_4,
const float* __restrict transforms_5,
float* __restrict etotals,
const FFParams* global_forcefield,
const int num_atom_types,
const int numTransforms)
{
// Get index of first TD
int ix = blockIdx.x*blockDim.x*NUM_TD_PER_THREAD + threadIdx.x;
// Have extra threads do the last member intead of return.
// A return would disable use of barriers, so not using return is better
ix = ix < numTransforms ? ix : numTransforms - NUM_TD_PER_THREAD;
#ifdef USE_SHARED
extern __shared__ FFParams forcefield[];
if(ix < num_atom_types)
{
forcefield[ix] = global_forcefield[ix];
}
#else
const FFParams* forcefield = global_forcefield;
#endif
// Compute transformation matrix to private memory
float etot[NUM_TD_PER_THREAD];
Transform transform[NUM_TD_PER_THREAD][3];
const int lsz = blockDim.x;
for (int i = 0; i < NUM_TD_PER_THREAD; i++)
{
int index = ix + i*lsz;
compute_transformation_matrix(
transforms_0[index],
transforms_1[index],
transforms_2[index],
transforms_3[index],
transforms_4[index],
transforms_5[index],
transform[i]);
etot[i] = ZERO;
}
#ifdef USE_SHARED
__syncthreads();
#endif
// Loop over ligand atoms
int il = 0;
do
{
// Load ligand atom data
const Atom l_atom = ligand_molecule[il];
const FFParams l_params = forcefield[l_atom.index];
const bool lhphb_ltz = l_params.hphb<ZERO;
const bool lhphb_gtz = l_params.hphb>ZERO;
float3 lpos[NUM_TD_PER_THREAD];
const float4 linitpos = make_float4(l_atom.x,l_atom.y,l_atom.z,ONE);
for (int i = 0; i < NUM_TD_PER_THREAD; i++)
{
// Transform ligand atom
lpos[i].x = transform[i][0].w + linitpos.x*transform[i][0].x +
linitpos.y*transform[i][0].y + linitpos.z*transform[i][0].z;
lpos[i].y = transform[i][1].w + linitpos.x*transform[i][1].x +
linitpos.y*transform[i][1].y + linitpos.z*transform[i][1].z;
lpos[i].z = transform[i][2].w + linitpos.x*transform[i][2].x +
linitpos.y*transform[i][2].y + linitpos.z*transform[i][2].z;
}
// Loop over protein atoms
int ip = 0;
do
{
// Load protein atom data
const Atom p_atom = protein_molecule[ip];
const FFParams p_params = forcefield[p_atom.index];
const float radij = p_params.radius + l_params.radius;
const float r_radij = 1.0f/radij;
const float elcdst = (p_params.hbtype==HBTYPE_F && l_params.hbtype==HBTYPE_F) ? FOUR : TWO;
const float elcdst1 = (p_params.hbtype==HBTYPE_F && l_params.hbtype==HBTYPE_F) ? QUARTER : HALF;
const bool type_E = ((p_params.hbtype==HBTYPE_E || l_params.hbtype==HBTYPE_E));
const bool phphb_ltz = p_params.hphb<ZERO;
const bool phphb_gtz = p_params.hphb>ZERO;
const bool phphb_nz = p_params.hphb!=ZERO;
const float p_hphb = p_params.hphb * (phphb_ltz && lhphb_gtz ? -ONE : ONE);
const float l_hphb = l_params.hphb * (phphb_gtz && lhphb_ltz ? -ONE : ONE);
const float distdslv = (phphb_ltz ? (lhphb_ltz ? NPNPDIST : NPPDIST) : (lhphb_ltz ? NPPDIST : -FLT_MAX));
const float r_distdslv = 1.0f/distdslv;
const float chrg_init = l_params.elsc * p_params.elsc;
const float dslv_init = p_hphb + l_hphb;
for (int i = 0; i < NUM_TD_PER_THREAD; i++)
{
// Calculate distance between atoms
const float x = lpos[i].x - p_atom.x;
const float y = lpos[i].y - p_atom.y;
const float z = lpos[i].z - p_atom.z;
const float distij = sqrt(x*x + y*y + z*z);
// Calculate the sum of the sphere radii
const float distbb = distij - radij;
const bool zone1 = (distbb < ZERO);
// Calculate steric energy
etot[i] += (ONE - (distij*r_radij)) * (zone1 ? 2*HARDNESS : ZERO);
// Calculate formal and dipole charge interactions
float chrg_e = chrg_init * ((zone1 ? 1 : (ONE - distbb*elcdst1))
* (distbb<elcdst ? 1 : ZERO));
const float neg_chrg_e = -fabs(chrg_e);
chrg_e = type_E ? neg_chrg_e : chrg_e;
etot[i] += chrg_e*CNSTNT;
// Calculate the two cases for Nonpolar-Polar repulsive interactions
const float coeff = (ONE - (distbb*r_distdslv));
float dslv_e = dslv_init * ((distbb<distdslv && phphb_nz) ? 1 : ZERO);
dslv_e *= (zone1 ? 1 : coeff);
etot[i] += dslv_e;
}
}
while (++ip < natpro); // loop over protein atoms
}
while (++il < natlig); // loop over ligand atoms
// Write results
const int td_base = blockIdx.x*blockDim.x*NUM_TD_PER_THREAD + threadIdx.x;
if (td_base < numTransforms)
{
for (int i = 0; i < NUM_TD_PER_THREAD; i++)
{
etotals[td_base+i*blockDim.x] = etot[i]*HALF;
}
}
} //end of fasten_main