qft_gpu_v5.cu
#include <cuComplex.h>
#include "cutil.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h> // M_PI
#include "qft_gpu_v5.h"
#define BLOCKS 32768
// Uses shared memory for the lower targets (0 to 8).
__device__ static void qft_gpu_v5_single_lower_state(int tgt, unsigned long i, int width, unsigned long block_base, cuDoubleComplex *sh_v){
    unsigned long tgt_bit = (1ul << tgt);
    unsigned long i_other = i^tgt_bit;
    // First half warp gets coefficients from shared memory.
    cuDoubleComplex v_i, v_iother;
    if ((i % 16) >= 8) {
        v_i = sh_v[i - block_base];
        v_iother = sh_v[i_other - block_base];
    }
    // Calculate the phase.
    unsigned long phase_coef = 1ul;
    unsigned long normal = (1ul << (width - tgt - 1));
    unsigned long ctl_bit = (1ul << tgt);
    for (int ctl = tgt + 1; ctl < width; ctl++) {
        ctl_bit = (ctl_bit << 1);
        phase_coef = (phase_coef << 1);
        if ((i & ctl_bit) != 0) {
            phase_coef = phase_coef ^ 1ul;
        }
    }
    phase_coef = phase_coef ^ (normal);
    // Second half warp gets coefficients from shared memory.
    if ((i % 16) < 8) {
        v_iother = sh_v[i_other - block_base];
        v_i = sh_v[i - block_base];
    }
    // Finish calculating the gate.
    float phi = float(phase_coef) * float(M_PI) / float(normal);
    float c = cosf(phi);
    float s = sqrtf(1.0f - c*c);
    if (phi > float(M_PI)) {
        s *= -1.0f;
    }
    cuDoubleComplex phase = {c, s};
    v_i = cuCmul(v_i, phase);
    cuDoubleComplex ai, aother; // coefficients for i and (i^tgt_bit)
    cuDoubleComplex cuM_SQRT1_2 = make_cuDoubleComplex(M_SQRT1_2, 0);
    ai = cuCmul(cuM_SQRT1_2, cuCsub(v_iother, v_i));
    aother = cuCmul(cuM_SQRT1_2, cuCadd(v_iother, v_i));
    if ((i % 16) >= 8) {
        sh_v[i - block_base] = ai;
        sh_v[i_other - block_base] = aother;
    } else {
        sh_v[i_other - block_base] = aother;
        sh_v[i - block_base] = ai;
    }
}
// This applies the phase shifts and Hadamard transform for qubit 'tgt' and state 'i'.
// Note: This should ONLY be called with ((i & tgt_bit) != 0).
__device__ static void qft_gpu_v5_single_state(int tgt, unsigned long i, int width, cuDoubleComplex *v){
    unsigned long phase_coef = 1ul;
    unsigned long tgt_bit = (1ul << tgt);
    unsigned long i_other = i^tgt_bit;
    unsigned long normal = (1ul << (width - tgt - 1));
    /*
    if ((i & tgt_bit) == 0) {
        return;
    }
    */
    // Note: Phase shifts (with the same target) commute.
    unsigned long ctl_bit = (1ul << tgt);
    for (int ctl = tgt + 1; ctl < width; ctl++) {
        ctl_bit = (ctl_bit << 1);
        phase_coef = (phase_coef << 1);
        if ((i & ctl_bit) != 0) {
            phase_coef = phase_coef ^ 1ul;
        }
    }
    phase_coef = phase_coef ^ (normal);
    float phi = float(phase_coef) * float(M_PI) / float(normal);
    float c = cosf(phi);
    float s = sqrtf(1.0f - c * c);
    if (phi > float(M_PI)) {
        s *= -1.0f;
    }
    cuDoubleComplex phase = {c, s};
    cuDoubleComplex v_i = v[i];
    cuDoubleComplex v_iother = v[i_other];
    v_i = cuCmul(v_i, phase);
    cuDoubleComplex ai, aother; // coefficients for i and (i^tgt_bit)
    cuDoubleComplex cuM_SQRT1_2 = make_cuDoubleComplex(M_SQRT1_2, 0);
    ai = cuCmul(cuM_SQRT1_2, cuCsub(v_iother, v_i));
    aother = cuCmul(cuM_SQRT1_2, cuCadd(v_iother, v_i));
    v[i] = ai;
    v[i_other] = aother;
}
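/*
 * Illustrative host-side reference sketch: the same fused update that
 * qft_gpu_v5_single_state performs for one pair (i, i^tgt_bit), written with
 * double-precision trig so it can be used to sanity-check the float math above.
 * Each set control qubit ctl > tgt contributes a controlled phase of
 * pi / 2^(ctl - tgt) to the |1> amplitude, followed by a Hadamard on qubit 'tgt'.
 * The function name qft_ref_single_state is assumed here; it is not part of the
 * v5 kernels and is never launched on the device.
 */
static void qft_ref_single_state(int tgt, unsigned long i, int width, cuDoubleComplex *v){
    unsigned long tgt_bit = (1ul << tgt);
    unsigned long i_other = i ^ tgt_bit;
    // Accumulate the controlled-phase angle from the higher (control) qubits.
    double phi = 0.0;
    for (int ctl = tgt + 1; ctl < width; ctl++) {
        if ((i >> ctl) & 1ul) {
            phi += M_PI / (double)(1ul << (ctl - tgt));
        }
    }
    cuDoubleComplex phase = make_cuDoubleComplex(cos(phi), sin(phi));
    cuDoubleComplex v_i = cuCmul(v[i], phase); // amplitude with the tgt bit set, phase-shifted
    cuDoubleComplex v_iother = v[i_other];     // amplitude with the tgt bit clear
    cuDoubleComplex h = make_cuDoubleComplex(M_SQRT1_2, 0.0);
    v[i] = cuCmul(h, cuCsub(v_iother, v_i));       // Hadamard: new |1> amplitude
    v[i_other] = cuCmul(h, cuCadd(v_iother, v_i)); // Hadamard: new |0> amplitude
}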
// This kernel performs a single stage of the QFT.
__global__ static void K_qft_gpu_v5_stage(int width, cuDoubleComplex *v, int tgt){
    unsigned long N = (1ul << width);
    // Split threads over states.
    unsigned long long bidx = blockIdx.y*gridDim.x + blockIdx.x;
    unsigned long long i = bidx*blockDim.x + threadIdx.x;
    if (i >= N) {
        return;
    }
    unsigned long tgt_bit = (1ul << tgt);
    if ((i & tgt_bit) != 0) {
        qft_gpu_v5_single_state(tgt, i, width, v);
    }
}
// This kernel performs the remaining QFT stages (targets 8 down to 0) in shared memory.
__global__ static void K_qft_gpu_v5_stage_0to8(int width, cuDoubleComplex *v){
    unsigned long N = (1ul << width);
    // Split threads over states.
    unsigned long long bidx = blockIdx.y*gridDim.x + blockIdx.x;
    unsigned long long i = bidx*blockDim.x + threadIdx.x;
    // copy in
    unsigned long block_base = bidx*blockDim.x;
    __shared__ cuDoubleComplex sh_v[512]; // Note: hardcoded shared memory size; blockDim.x must not exceed 512
    if (i < N) {
        sh_v[threadIdx.x] = v[i];
    }
    __syncthreads();
    // calculate
    for (int tgt = min(width-1, 8); tgt >= 0; tgt--) {
        unsigned long tgt_bit = (1ul << tgt);
        if ( (i < N) && ((i & tgt_bit) != 0) ) {
            qft_gpu_v5_single_lower_state(tgt, i, width, block_base, sh_v);
        }
        __syncthreads();
    }
    // copy out
    if (i < N) {
        v[i] = sh_v[threadIdx.x];
    }
}
// Implement the QFT gate by gate using the GPU.
void qft_gpu_v5_helper(int width, cuDoubleComplex *d_v, int threadsPerBlock){
    unsigned long N = (1ul << width);
    unsigned long long nblocks = (N+threadsPerBlock-1)/threadsPerBlock;
    unsigned long long xblocks, yblocks;
    yblocks = (nblocks+BLOCKS-1)/BLOCKS;
    xblocks = BLOCKS;
    if (nblocks < xblocks) {
        xblocks = nblocks;
    }
    dim3 blocks(xblocks, yblocks);
    // For each qubit...
    int tgt;
    for (tgt=width-1; tgt>=9; tgt--) {
        K_qft_gpu_v5_stage<<<blocks, threadsPerBlock>>>(width, d_v, tgt);
        CUT_CHECK_ERROR("K_qft_gpu_v5_stage failed.");
    }
    K_qft_gpu_v5_stage_0to8<<<blocks, threadsPerBlock>>>(width, d_v);
    CUT_CHECK_ERROR("K_qft_gpu_v5_stage_0to8 failed.");
}