-
Notifications
You must be signed in to change notification settings - Fork 0
/
qft_gpu_v6.cu
305 lines (239 loc) · 7.71 KB
/
qft_gpu_v6.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#include <cuComplex.h>
#include "cutil.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h> // M_PI
#include "qft_gpu_v6.h"
#define BLOCKS 32768
// Uses shared memory for the lower targets
__device__ static void qft_gpu_v6_single_lower_state(int tgt, unsigned long i, int width, unsigned long block_base, cuDoubleComplex *sh_v){
unsigned long tgt_bit = (1ul << tgt);
unsigned long i_other = i^tgt_bit;
// First half warp gets coefficients from shared memory.
cuDoubleComplex v_i, v_iother;
if ((i%16)>=8) {
v_i = sh_v[i-block_base];
v_iother = sh_v[i_other-block_base];
}
// Calculate the phase.
unsigned long phase_coef = 1ul;
unsigned long normal = (1ul << (width - tgt - 1));
unsigned long ctl_bit = (1ul << tgt);
for (int ctl=tgt+1; ctl<width; ctl++) {
ctl_bit = (ctl_bit << 1);
phase_coef = (phase_coef <<1);
if ((i & ctl_bit) != 0) {
phase_coef = phase_coef ^ 1ul;
}
}
phase_coef = phase_coef ^ (normal);
// Second half warp gets coefficients from shared memory.
if ((i%16)<8) {
v_iother = sh_v[i_other-block_base];
v_i = sh_v[i-block_base];
}
// Finish calculating the gate.
float phi = float(phase_coef) * float(M_PI) / float(normal);
float c = cosf(phi);
float s = sqrtf(1.0f-c*c);
if (phi>float(M_PI)) {
s *= -1.0f;
}
cuDoubleComplex phase = {c, s};
v_i = cuCmul(v_i, phase);
cuDoubleComplex ai, aother; // coefficient i and (i^tgt_bit)
cuDoubleComplex cuM_SQRT1_2 = make_cuDoubleComplex(M_SQRT1_2, 0);
ai = cuCmul(cuM_SQRT1_2, cuCsub(v_iother, v_i));
aother = cuCmul(cuM_SQRT1_2, cuCadd(v_iother, v_i));
if ((i%16)>=8) {
sh_v[i-block_base] = ai;
sh_v[i_other-block_base] = aother;
} else {
sh_v[i_other-block_base] = aother;
sh_v[i-block_base] = ai;
}
}
// This applies the phase shifts and Hadamard transform for qubit 'tgt' and state 'i'.
// Note: This should ONLY be called with ((i & tgt_bit) == 1)
__device__ static void qft_gpu_v6_single_state(int tgt, unsigned long i, int width, cuDoubleComplex *v){
unsigned long phase_coef = 1ul;
unsigned long tgt_bit = (1ul << tgt);
unsigned long i_other = i^tgt_bit;
unsigned long normal = (1ul << (width - tgt - 1));
/*
if ((i & tgt_bit) == 0) {
// This function should not have been called in this case.
return;
}
*/
// Note: Phase shifts (with the same target) commute.
unsigned long ctl_bit = (1ul << tgt);
for (int ctl=tgt+1; ctl<width; ctl++) {
ctl_bit = (ctl_bit << 1);
phase_coef = (phase_coef <<1);
if ((i & ctl_bit) != 0) {
phase_coef = phase_coef ^ 1ul;
}
}
phase_coef = phase_coef ^ (normal);
float phi = float(phase_coef) * float(M_PI) / float(normal);
float c = cosf(phi);
float s = sqrtf(1.0f-c*c);
if (phi>float(M_PI)) {
s *= -1.0f;
}
cuDoubleComplex phase = {c, s};
cuDoubleComplex v_i = v[i];
cuDoubleComplex v_iother = v[i_other];
v_i = cuCmul(v_i, phase);
cuDoubleComplex ai, aother; // koefisien i dan (i^tgt_bit)
cuDoubleComplex cuM_SQRT1_2 = make_cuDoubleComplex(M_SQRT1_2, 0);
ai = cuCmul(cuM_SQRT1_2, cuCsub(v_iother, v_i));
aother = cuCmul(cuM_SQRT1_2, cuCadd(v_iother, v_i));
v[i] = ai;
v[i_other] = aother;
}
// This kernel performs a single stage of the QFT.
__global__ static void K_qft_gpu_v6_stage(int width, cuDoubleComplex *v, int tgt){
unsigned long N = (1ul << width);
// Split threads over states.
unsigned long long bidx = blockIdx.y*gridDim.x + blockIdx.x;
unsigned long long i = bidx*blockDim.x + threadIdx.x;
if (i >= N) {
return;
}
unsigned long tgt_bit = (1ul << tgt);
if ((i & tgt_bit) != 0) {
qft_gpu_v6_single_state(tgt, i, width, v);
}
}
// This kernel performs a single stage of the QFT.
__global__ static void K_qft_gpu_v6_stage_0to8(int width, cuDoubleComplex *v)
{
unsigned long N = (1ul << width);
// Split threads over states.
unsigned long long bidx = blockIdx.y*gridDim.x + blockIdx.x;
unsigned long long i = bidx*blockDim.x + threadIdx.x;
// copy in
unsigned long block_base = bidx*blockDim.x;
__shared__ cuDoubleComplex sh_v[512]; // Note: hardcoded shared memory size
if (i < N) {
sh_v[threadIdx.x] = v[i];
}
__syncthreads();
// calculate
for (int tgt = min(width-1,8); tgt >= 0; tgt--) {
unsigned long tgt_bit = (1ul << tgt);
if ( (i < N) && ((i & tgt_bit) != 0) ) {
qft_gpu_v6_single_lower_state(tgt, i, width, block_base, sh_v);
}
__syncthreads();
}
// copy out
if (i < N) {
v[i] = sh_v[threadIdx.x];
}
}
// We use this portion from v3 for part of the gate calulation.
__device__ static void qft_gpu_v3_single_state(int tgt, unsigned long i, int width, cuDoubleComplex *v){
unsigned long phase_coef = 1ul;
unsigned long tgt_bit = (1ul << tgt);
unsigned long i_other = i^tgt_bit;
unsigned long normal = (1ul << (width - tgt - 1));
if ((i & tgt_bit) == 0) {
// This function should not have been called in this case.
return;
}
// Note: Phase shifts (with the same target) commute.
unsigned long ctl_bit = (1ul << tgt);
for (int ctl=tgt+1; ctl<width; ctl++) {
ctl_bit = (ctl_bit << 1);
phase_coef = (phase_coef <<1);
if ((i & ctl_bit) != 0) {
phase_coef = phase_coef ^ 1ul;
}
}
phase_coef = phase_coef ^ (normal);
float phi = float(phase_coef) * float(M_PI) / float(normal);
float c = cosf(phi);
float s = sqrtf(1.0f-c*c);
if (phi>float(M_PI)) {
s *= -1.0f;
}
cuDoubleComplex phase = {c, s};
cuDoubleComplex v_i = v[i];
cuDoubleComplex v_iother = v[i_other];
v_i = cuCmul(v_i, phase);
//hadamard_gpu(tgt, i, width, v);
cuDoubleComplex ai, aother; // coefficients i and (i^tgt_bit)
cuDoubleComplex cuM_SQRT1_2 = make_cuDoubleComplex(M_SQRT1_2, 0);
ai = cuCmul(cuM_SQRT1_2, cuCsub(v_iother, v_i));
aother = cuCmul(cuM_SQRT1_2, cuCadd(v_iother, v_i));
v[i] = ai;
v[i_other] = aother;
}
__global__ static void K_qft_gpu_v6_stage_M_bits(int width, cuDoubleComplex *v, int tgt, int M){
unsigned long N = (1ul << width);
// Split threads over states.
unsigned long long bidx = blockIdx.y*gridDim.x + blockIdx.x;
unsigned long long i = bidx*blockDim.x + threadIdx.x;
if (i >= N) {
return;
}
// calculate
int m;
unsigned long mask = 0;
for (m = 0; m < M; m++) {
mask = mask | (1ul << (tgt-m));
}
unsigned long B = (1ul<<(tgt-(M-1)));
int G = (1ul<<M);
int tgtBit = tgt;
if ( (i & mask) == mask) {
for (m = (1ul<<(M-1)); m > 0; m = m>>1) {
for (int g = G-1; g >= 0; g--) {
if ((g & m) != 0) {
unsigned long iother = i - B*(G-1-g);
qft_gpu_v3_single_state(tgtBit, iother, width, v);
}
}
tgtBit--;
}
}
}
// Implement the QFT gate by gate using the GPU.
void qft_gpu_v6_helper(int width, cuDoubleComplex *d_v, int threadsPerBlock){
// M is the number of qubits to process at once. This parameter is tunable.
int M=2;
unsigned long N = (1ul << width);
unsigned long long nblocks = (N+threadsPerBlock-1)/threadsPerBlock;
unsigned long long xblocks, yblocks;
yblocks = (nblocks+BLOCKS-1)/BLOCKS;
xblocks = BLOCKS;
if (nblocks < xblocks) {
xblocks = nblocks;
}
dim3 blocks(xblocks, yblocks);
// For each qubit...
int band = 0;
int tgt;
int leftover = (width-9-band)%M;
int uneven_stop = width - leftover;
for (tgt=width-1; tgt>=uneven_stop; tgt--) {
printf("tgt=%d\n", tgt);
K_qft_gpu_v6_stage<<<blocks, threadsPerBlock>>>(width, d_v, tgt);
CUT_CHECK_ERROR("K_qft_gpu_v6_stage failed.");
}
for (; tgt>=9+band; tgt-=M) {
printf("stage_M_bits tgt=%d\n", tgt);
K_qft_gpu_v6_stage_M_bits<<<blocks, threadsPerBlock>>>(width, d_v, tgt, M);
CUT_CHECK_ERROR("K_qft_gpu_v6_stage_M_bits failed.");
}
for (; tgt>=9; tgt--) {
printf("band: %d\n", tgt);
K_qft_gpu_v6_stage<<<blocks, threadsPerBlock>>>(width, d_v, tgt);
CUT_CHECK_ERROR("K_qft_gpu_v6_stage failed.");
}
K_qft_gpu_v6_stage_0to8<<<blocks, threadsPerBlock>>>(width, d_v);
CUT_CHECK_ERROR("K_qft_gpu_v6_stage_0to8 failed.");
}