-
Notifications
You must be signed in to change notification settings - Fork 1
/
fdtd8.cu
130 lines (126 loc) · 7.33 KB
/
fdtd8.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <chrono>
#include <random>
#include "stencil.h"
const size_t nz = 512;
const size_t ny = 512;
const size_t nx = 512;
const size_t na = 10000;
#define GET(z, y, x, nz, ny, nx) ((z)*((ny)*(nx))+(y)*(nx)+(x))
typedef uint32_t gpu_size_t;
typedef int32_t gpu_signed_size_t;
int main(const int argc, const char* argv[])
{
Stencil<float, nz, ny, nx, 8, true, gpu_size_t, gpu_signed_size_t> stencil;
float *p0 = new float[nz*ny*nx];
float *p1 = new float[nz*ny*nx];
#pragma omp parallel for
for(size_t i=0;i<nz;i++)
{
auto rand = std::mt19937_64(19950918+i);
for(size_t j=0;j<ny;j++)
for(size_t k=0;k<nx;k++)
{
p0[GET(i,j,k,nz,ny,nx)] = float(rand())/rand.max();
p1[GET(i,j,k,nz,ny,nx)] = float(rand())/rand.max();
}
}
float *gpu_p0 = new float[nz*ny*nx];
float *gpu_p1 = new float[nz*ny*nx];
memcpy(gpu_p0, p0, sizeof(float)*nz*ny*nx);
memcpy(gpu_p1, p1, sizeof(float)*nz*ny*nx);
std::chrono::time_point<std::chrono::system_clock> start, end;
#ifndef SKIP_CPU
start = std::chrono::system_clock::now();
for(size_t t=0;t<1000;t++)
{
#pragma omp parallel for
for(size_t i=8;i<nz-8;i++)
for(size_t j=8;j<ny-8;j++)
for(size_t k=8;k<nx-8;k++)
p0[GET(i,j,k,nz,ny,nx)] = (0.01f*p1[GET(i,j,k,nz,ny,nx)]+
(0.02f*p1[GET(i,j,k+1,nz,ny,nx)]+0.03f*p1[GET(i,j,k-1,nz,ny,nx)])+
(0.04f*p1[GET(i,j+1,k,nz,ny,nx)]+0.05f*p1[GET(i,j-1,k,nz,ny,nx)])+
(0.06f*p1[GET(i+1,j,k,nz,ny,nx)]+0.07f*p1[GET(i-1,j,k,nz,ny,nx)])+
(0.02f*p1[GET(i,j,k+2,nz,ny,nx)]+0.03f*p1[GET(i,j,k-2,nz,ny,nx)])+
(0.04f*p1[GET(i,j+2,k,nz,ny,nx)]+0.05f*p1[GET(i,j-2,k,nz,ny,nx)])+
(0.06f*p1[GET(i+2,j,k,nz,ny,nx)]+0.07f*p1[GET(i-2,j,k,nz,ny,nx)])+
(0.02f*p1[GET(i,j,k+3,nz,ny,nx)]+0.03f*p1[GET(i,j,k-3,nz,ny,nx)])+
(0.04f*p1[GET(i,j+3,k,nz,ny,nx)]+0.05f*p1[GET(i,j-3,k,nz,ny,nx)])+
(0.06f*p1[GET(i+3,j,k,nz,ny,nx)]+0.07f*p1[GET(i-3,j,k,nz,ny,nx)])+
(0.02f*p1[GET(i,j,k+4,nz,ny,nx)]+0.03f*p1[GET(i,j,k-4,nz,ny,nx)])+
(0.04f*p1[GET(i,j+4,k,nz,ny,nx)]+0.05f*p1[GET(i,j-4,k,nz,ny,nx)])+
(0.06f*p1[GET(i+4,j,k,nz,ny,nx)]+0.07f*p1[GET(i-4,j,k,nz,ny,nx)])+
(0.02f*p1[GET(i,j,k+5,nz,ny,nx)]+0.03f*p1[GET(i,j,k-5,nz,ny,nx)])+
(0.04f*p1[GET(i,j+5,k,nz,ny,nx)]+0.05f*p1[GET(i,j-5,k,nz,ny,nx)])+
(0.06f*p1[GET(i+5,j,k,nz,ny,nx)]+0.07f*p1[GET(i-5,j,k,nz,ny,nx)])+
(0.02f*p1[GET(i,j,k+6,nz,ny,nx)]+0.03f*p1[GET(i,j,k-6,nz,ny,nx)])+
(0.04f*p1[GET(i,j+6,k,nz,ny,nx)]+0.05f*p1[GET(i,j-6,k,nz,ny,nx)])+
(0.06f*p1[GET(i+6,j,k,nz,ny,nx)]+0.07f*p1[GET(i-6,j,k,nz,ny,nx)])+
(0.02f*p1[GET(i,j,k+7,nz,ny,nx)]+0.03f*p1[GET(i,j,k-7,nz,ny,nx)])+
(0.04f*p1[GET(i,j+7,k,nz,ny,nx)]+0.05f*p1[GET(i,j-7,k,nz,ny,nx)])+
(0.06f*p1[GET(i+7,j,k,nz,ny,nx)]+0.07f*p1[GET(i-7,j,k,nz,ny,nx)])+
(0.02f*p1[GET(i,j,k+8,nz,ny,nx)]+0.03f*p1[GET(i,j,k-8,nz,ny,nx)])+
(0.04f*p1[GET(i,j+8,k,nz,ny,nx)]+0.05f*p1[GET(i,j-8,k,nz,ny,nx)])+
(0.06f*p1[GET(i+8,j,k,nz,ny,nx)]+0.07f*p1[GET(i-8,j,k,nz,ny,nx)]));
if(stencil.getRank() == 0) printf("loop %lu\n", t);
std::swap(p0, p1);
}
end = std::chrono::system_clock::now();
printf("CPU time %.6lfs\n", 1e-6*(std::chrono::time_point_cast<std::chrono::microseconds>(end)-std::chrono::time_point_cast<std::chrono::microseconds>(start)));
#endif
stencil.mallocCube("p0", true);
stencil.mallocCube("p1", true);
stencil.transferCubeToGPU("p0", gpu_p0);
stencil.transferCubeToGPU("p1", gpu_p1);
auto prop_kernel = [=] __device__ (gpu_size_t z, gpu_size_t y, gpu_size_t x, gpu_size_t addr, float *output, float* zl, gpu_signed_size_t sz, float *yl, gpu_signed_size_t sy, float *xl, gpu_signed_size_t sx)
{
output[addr] = (0.01f*zl[0]+
0.02f*xl[sx]+0.03f*xl[-sx]+0.04f*yl[sy]+0.05f*yl[-sy]+0.06f*zl[sz]+0.07f*zl[-sz]+
0.02f*xl[2*sx]+0.03f*xl[-2*sx]+0.04f*yl[2*sy]+0.05f*yl[-2*sy]+0.06f*zl[2*sz]+0.07f*zl[-2*sz]+
0.02f*xl[3*sx]+0.03f*xl[-3*sx]+0.04f*yl[3*sy]+0.05f*yl[-3*sy]+0.06f*zl[3*sz]+0.07f*zl[-3*sz]+
0.02f*xl[4*sx]+0.03f*xl[-4*sx]+0.04f*yl[4*sy]+0.05f*yl[-4*sy]+0.06f*zl[4*sz]+0.07f*zl[-4*sz]+
0.02f*xl[5*sx]+0.03f*xl[-5*sx]+0.04f*yl[5*sy]+0.05f*yl[-5*sy]+0.06f*zl[5*sz]+0.07f*zl[-5*sz]+
0.02f*xl[6*sx]+0.03f*xl[-6*sx]+0.04f*yl[6*sy]+0.05f*yl[-6*sy]+0.06f*zl[6*sz]+0.07f*zl[-6*sz]+
0.02f*xl[7*sx]+0.03f*xl[-7*sx]+0.04f*yl[7*sy]+0.05f*yl[-7*sy]+0.06f*zl[7*sz]+0.07f*zl[-7*sz]+
0.02f*xl[8*sx]+0.03f*xl[-8*sx]+0.04f*yl[8*sy]+0.05f*yl[-8*sy]+0.06f*zl[8*sz]+0.07f*zl[-8*sz]);
};
stencil.barrier();
start = std::chrono::system_clock::now();
std::string s0 = "p0", s1 = "p1";
for(size_t t=0;t<1000;t++)
{
stencil.backupCubeHaloBackup(s0);
stencil.propagateHaloTopBackup(s0, s1, true, prop_kernel);
stencil.propagateHaloButtomBackup(s0, s1, true, prop_kernel);
stencil.sync();
stencil.propagate(s0, s1, true, prop_kernel);
stencil.commCubeHaloBackup(s0);
stencil.sync();
stencil.restoreCubeHaloBackup(s0);
stencil.sync();
std::swap(s0, s1);
}
end = std::chrono::system_clock::now();
printf("GPU time %.6lfs\n", 1e-6*(std::chrono::time_point_cast<std::chrono::microseconds>(end)-std::chrono::time_point_cast<std::chrono::microseconds>(start)));
stencil.transferCubeToCPU(gpu_p0, s0);
stencil.transferCubeToCPU(gpu_p1, s1);
size_t rank = stencil.getRank();
#ifndef SKIP_CPU
#pragma omp parallel for
for(size_t i=0;i<nz;i++)
for(size_t j=0;j<ny;j++)
for(size_t k=0;k<nx;k++)
{
float dp0 = fabs(p0[GET(i,j,k,nz,ny,nx)]-gpu_p0[GET(i,j,k,nz,ny,nx)]),
dp1 = fabs(p1[GET(i,j,k,nz,ny,nx)]-gpu_p1[GET(i,j,k,nz,ny,nx)]);
if((dp0/fabs(gpu_p0[GET(i,j,k,nz,ny,nx)])>1e-2 && dp0>1e-3) ||
(dp1/fabs(gpu_p0[GET(i,j,k,nz,ny,nx)])>1e-2 && dp0>1e-3))
fprintf(stderr, "rank = %lu, [%lu][%lu][%lu]: %lf %lf, %lf %lf\n", rank, i, j, k, p0[GET(i,j,k,nz,ny,nx)], gpu_p0[GET(i,j,k,nz,ny,nx)], p1[GET(i,j,k,nz,ny,nx)], gpu_p1[GET(i,j,k,nz,ny,nx)]);
}
#endif
return 0;
}