diff --git a/Makefile b/Makefile index c444ae4a8..3b615f1d5 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ DIRS := src src/analysis src/chemistry_gpu src/cooling src/cooling_grackle s src/cpu src/global src/gravity src/gravity/paris src/grid src/hydro \ src/integrators src/io src/main.cpp src/main_tests.cpp src/mhd\ src/model src/mpi src/old_cholla src/particles src/reconstruction \ - src/riemann_solvers src/system_tests src/utils src/dust + src/riemann_solvers src/system_tests src/utils src/dust src/cloud_tracking SUFFIX ?= .$(TYPE).$(MACHINE) diff --git a/builds/make.type.dust b/builds/make.type.dust index 24a765302..31629c566 100644 --- a/builds/make.type.dust +++ b/builds/make.type.dust @@ -9,7 +9,7 @@ MPI_GPU ?= DFLAGS += -DCUDA DFLAGS += -DMPI_CHOLLA DFLAGS += -DPRECISION=2 -DFLAGS += -DPLMP +DFLAGS += -DPPMP DFLAGS += -DHLLC DFLAGS += -DDE @@ -36,6 +36,8 @@ DFLAGS += -DCLOUDY_COOLING DFLAGS += -DSLICES DFLAGS += -DPROJECTION +DFLAGS += -DCLOUD_TRACKING + DFLAGS += $(OUTPUT) DFLAGS += -DOUTPUT_ALWAYS diff --git a/builds/make.type.hydro b/builds/make.type.hydro index f34d78172..7d006ac10 100644 --- a/builds/make.type.hydro +++ b/builds/make.type.hydro @@ -11,18 +11,20 @@ DFLAGS += -DSIMPLE #DFLAGS += -DVL # Apply a density and temperature floor -DFLAGS += -DDENSITY_FLOOR +# DFLAGS += -DDENSITY_FLOOR DFLAGS += -DTEMPERATURE_FLOOR # Solve the Gas Internal Energy usisng a Dual Energy Formalism #DFLAGS += -DDE # Apply cooling on the GPU from precomputed tables -#DFLAGS += -DCOOLING_GPU +DFLAGS += -DCOOLING_GPU # Measure the Timing of the different stages #DFLAGS += -DCPU_TIME +#DFLAGS += -DCLOUD_TRACKING + # Select output format # Can also add -DSLICES and -DPROJECTIONS OUTPUT ?= -DOUTPUT -DHDF5 diff --git a/src/cloud_tracking/cloud_tracking.cu b/src/cloud_tracking/cloud_tracking.cu new file mode 100644 index 000000000..4e8acc752 --- /dev/null +++ b/src/cloud_tracking/cloud_tracking.cu @@ -0,0 +1,159 @@ +/*! 
+ * \file cloud_tracking.cu
+ * \author Helena Richie (helenarichie@gmail.com)
+ * \brief Contains the kernels that do frame of reference tracking to track clouds.
+ */
+
+#ifdef CLOUD_TRACKING
+
+  // STL includes
+  #include <math.h>
+
+  #include <cstdio>
+  #include <fstream>
+  #include <vector>
+
+// Local includes
+
+  #include "../cloud_tracking/cloud_tracking.h"
+  #include "../global/global.h"
+  #include "../global/global_cuda.h"
+  #include "../grid/grid3D.h"
+  #include "../grid/grid_enum.h"
+  #include "../utils/DeviceVector.h"
+  #include "../utils/cuda_utilities.h"
+  #include "../utils/gpu.hpp"
+  #include "../utils/hydro_utilities.h"
+  #include "../utils/reduction_utilities.h"
+
+void Cloud_Frame_Update(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz, int n_ghost,
+                        int n_fields, Real dt, Real gamma, Real density_cloud_init, Real *integrand,
+                        Real *density_cloud_tot, Real *mass_cloud_tot)
+{
+  // cuda_utilities::AutomaticLaunchParams static const launchParams(Cloud_Tracking_Kernel)
+
+  int n_cells = nx * ny * nz;
+  int ngrid   = (n_cells + TPB - 1) / TPB;
+  dim3 dim1dGrid(ngrid, 1, 1);
+  dim3 dim1dBlock(TPB, 1, 1);
+
+  // Allocate the device memory
+  cuda_utilities::DeviceVector<Real> static integrand_cloud(1, true);
+  cuda_utilities::DeviceVector<Real> static density_cloud(1, true);
+  cuda_utilities::DeviceVector<Real> static mass_cloud(1, true);
+
+  cuda_utilities::AutomaticLaunchParams static const launchParams(Cloud_Tracking_Kernel);
+
+  // printf("TPB: %d\n", TPB);
+  // printf("ngrid: %d\n", ngrid);
+  // printf("nx, ny, nz: %d %d %d\n", nx, ny, nz);
+  // printf("ngrid: %d\n", n_cells);
+  // printf("dim1dGrid: %d\n", dim1dGrid);
+  // printf("dim1dBlock: %d\n", dim1dBlock);
+  // printf("numBLocks: %d\n", launchParams.numBlocks);
+  // printf("threadsPerBLock: %d\n", launchParams.threadsPerBlock);
+
+  hipLaunchKernelGGL(Cloud_Tracking_Kernel, launchParams.numBlocks, launchParams.threadsPerBlock, 0, 0, dev_conserved,
+                     nx, ny, nz, dx, dy, dz, n_ghost, n_fields, dt, gamma, density_cloud_init, integrand_cloud.data(),
+                     density_cloud.data(), mass_cloud.data());
+  GPU_Error_Check();
+
+  *integrand         = integrand_cloud[0];
+  *density_cloud_tot = density_cloud[0];
+  *mass_cloud_tot    = mass_cloud[0];
+}
+
+__global__ void Cloud_Tracking_Kernel(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz,
+                                      int n_ghost, int n_fields, Real dt, Real gamma, Real density_cloud_init,
+                                      Real *integrand_cloud, Real *density_cloud, Real *mass_cloud)
+{
+  int xid, yid, zid, n_cells;
+  n_cells = nx * ny * nz;
+  __shared__ Real density_stride[TPB];
+  __shared__ Real velocity_x_stride[TPB];
+  __shared__ Real mass_stride[TPB];
+  Real density, velocity_x, mass;
+
+  // Each thread zeroes only its own slot of the shared arrays so that it
+  // cannot race with a thread that has already started accumulating into its slot
+  density_stride[threadIdx.x]    = 0;
+  velocity_x_stride[threadIdx.x] = 0;
+  mass_stride[threadIdx.x]       = 0;
+
+  // Grid stride loop to perform as much of the reduction as possible. The
+  // fact that `id` has type `size_t` is important. I'm not totally sure why,
+  // but setting it to int results in some kind of silent over/underflow issue
+  // even though we're not hitting those kinds of numbers.
+  // Setting it to uint or size_t fixes it.
+  for (size_t id = threadIdx.x + blockIdx.x * blockDim.x; id < n_cells; id += blockDim.x * gridDim.x) {
+    // get a global thread ID
+    cuda_utilities::compute3DIndices(id, nx, ny, xid, yid, zid);
+
+    // threads corresponding to real cells do the calculation
+    if (xid > n_ghost - 1 && xid < nx - n_ghost && yid > n_ghost - 1 && yid < ny - n_ghost && zid > n_ghost - 1 &&
+        zid < nz - n_ghost) {
+      density    = dev_conserved[id + n_cells * grid_enum::density];
+      velocity_x = dev_conserved[id + n_cells * grid_enum::momentum_x] / density;
+      mass       = density * dx * dy * dz;
+
+      if (density > ((1.0 / 3.0) * (density_cloud_init / DENSITY_UNIT))) {
+        // printf("inside if statement: %d %e\n", id, mass);
+        density_stride[threadIdx.x] += density;
+        velocity_x_stride[threadIdx.x] += velocity_x;
+        mass_stride[threadIdx.x] += mass;
+        // Do grid-wide reduction to compute mass-averaged cloud velocity (Shin et al. 2008, eq. 9)
+      }
+    }
+  }
+  __syncthreads();
+
+  reduction_utilities::Grid_Reduction_Add(density_stride[threadIdx.x] * velocity_x_stride[threadIdx.x],
+                                          integrand_cloud);
+  reduction_utilities::Grid_Reduction_Add(density_stride[threadIdx.x], density_cloud);
+  reduction_utilities::Grid_Reduction_Add(mass_stride[threadIdx.x], mass_cloud);
+}
+
+void Update_Grid_Velocities(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt, Real gamma,
+                            Real velocity_cloud, Real density_cloud_tot, Real mass_cloud_tot)
+{
+  // cuda_utilities::AutomaticLaunchParams static const launchParams(Velocity_Update);
+  int n_cells = nx * ny * nz;
+  int ngrid   = (n_cells + TPB - 1) / TPB;
+  dim3 dim1dGrid(ngrid, 1, 1);
+  dim3 dim1dBlock(TPB, 1, 1);
+
+  hipLaunchKernelGGL(Velocity_Update, dim1dGrid, dim1dBlock, 0, 0, dev_conserved, nx, ny, nz, n_ghost, n_fields, dt,
+                     gamma, velocity_cloud, density_cloud_tot, mass_cloud_tot);
+  GPU_Error_Check();
+}
+
+__global__ void Velocity_Update(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt,
+                                Real gamma, Real velocity_cloud, Real density_cloud_tot, Real mass_cloud_tot)
+{
+  // get grid indices
+  int n_cells = nx * ny * nz;
+  int is, ie, js, je, ks, ke;
+  cuda_utilities::Get_Real_Indices(n_ghost, nx, ny, nz, is, ie, js, je, ks, ke);
+  // get a global thread ID
+  int blockId = blockIdx.x + blockIdx.y * gridDim.x;
+  int id      = threadIdx.x + blockId * blockDim.x;
+  int id_z    = id / (nx * ny);
+  int id_y    = (id - id_z * nx * ny) / nx;
+  int id_x    = id - id_z * nx * ny - id_y * nx;
+
+  Real density, momentum_x, velocity_x, energy;
+
+  Real energy_kinetic_cloud = 0.5 * mass_cloud_tot * pow(velocity_cloud, 2);
+
+  // threads corresponding to real cells do the calculation
+  if (id_x >= is && id_x < ie && id_y >= js && id_y < je && id_z >= ks && id_z < ke) {
+    density    = dev_conserved[id + n_cells * grid_enum::density];
+    momentum_x = dev_conserved[id + n_cells * grid_enum::momentum_x];
+    velocity_x = momentum_x / density;
+    energy     = dev_conserved[id + n_cells * grid_enum::Energy];
+    dev_conserved[id + n_cells * grid_enum::momentum_x] = (velocity_x - velocity_cloud) * density;
+    dev_conserved[id + n_cells * grid_enum::Energy]     = energy - energy_kinetic_cloud;
+  }
+}
+
+#endif  // CLOUD_TRACKING
diff --git a/src/cloud_tracking/cloud_tracking.h b/src/cloud_tracking/cloud_tracking.h
new file mode 100644
index 000000000..913fa4031
--- /dev/null
+++ b/src/cloud_tracking/cloud_tracking.h
@@ -0,0 +1,31 @@
+/*!
+ * \file cloud_tracking.h
+ * \author Helena Richie (helenarichie@pitt.edu)
+ * \brief Contains the declarations for the kernels that do frame of reference tracking to track clouds.
+ *
+ */
+#ifdef CLOUD_TRACKING
+  #ifndef CLOUD_TRACKING_CUDA_H
+  #define CLOUD_TRACKING_CUDA_H
+
+  #include <math.h>
+
+  #include "../global/global.h"
+  #include "../utils/gpu.hpp"
+
+void Cloud_Frame_Update(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz, int n_ghost,
+                        int n_fields, Real dt, Real gamma, Real density_cloud_init, Real *integrand,
+                        Real *density_cloud_tot, Real *mass_cloud_tot);
+
+__global__ void Cloud_Tracking_Kernel(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz,
+                                      int n_ghost, int n_fields, Real dt, Real gamma, Real density_cloud_init,
+                                      Real *integrand_cloud, Real *density_cloud, Real *mass_cloud);
+
+void Update_Grid_Velocities(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt, Real gamma,
+                            Real velocity_cloud, Real density_cloud_tot, Real mass_cloud_tot);
+
+__global__ void Velocity_Update(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt,
+                                Real gamma, Real velocity_cloud, Real density_cloud_tot, Real mass_cloud_tot);
+
+  #endif  // CLOUD_TRACKING_CUDA_H
+#endif  // CLOUD_TRACKING
\ No newline at end of file
diff --git a/src/global/global.cpp b/src/global/global.cpp
index 6579d9955..a9ff0384d 100644
--- a/src/global/global.cpp
+++ b/src/global/global.cpp
@@ -101,10 +101,11 @@ char *Trim(char *s)
 }
 
 // NOLINTNEXTLINE(cert-err58-cpp)
-const std::set<const char *> optionalParams = {
-    "flag_delta", "ddelta_dt", "n_delta", "Lz", "Lx", "phi", "theta",
-    "delta", "nzr", "nxr", "H0", "Omega_M", "Omega_L", "Init_redshift",
-    "End_redshift", "tile_length", "n_proc_x", "n_proc_y", "n_proc_z"};
+const std::set<const char *> optionalParams = {"flag_delta", "ddelta_dt", "n_delta", "Lz",
+                                               "Lx", "phi", "theta", "delta",
+                                               "nzr", "nxr", "H0", "Omega_M",
+                                               "Omega_L", "Init_redshift", "End_redshift", "tile_length",
+                                               "n_proc_x", "n_proc_y", "n_proc_z", "density_cloud_init"};
 
 /*! \fn int Is_Param_Valid(char *name);
  * \brief Verifies that a param is valid (even if not needed).
Avoids @@ -453,6 +454,10 @@ void Parse_Param(char *name, char *value, struct Parameters *parms) } else if (strcmp(name, "skewersdir") == 0) { strncpy(parms->skewersdir, value, MAXLEN); #endif +#endif +#ifdef CLOUD_TRACKING + } else if (strcmp(name, "density_cloud_init") == 0) { + parms->density_cloud_init = atof(value); #endif } else if (!Is_Param_Valid(name)) { chprintf("WARNING: %s/%s: Unknown parameter/value pair!\n", name, value); diff --git a/src/global/global.h b/src/global/global.h index 6c9524001..5f3dc36a7 100644 --- a/src/global/global.h +++ b/src/global/global.h @@ -309,7 +309,9 @@ struct Parameters { #ifdef TILED_INITIAL_CONDITIONS Real tile_length; #endif // TILED_INITIAL_CONDITIONS - +#ifdef CLOUD_TRACKING + Real density_cloud_init; +#endif #ifdef SET_MPI_GRID // Set the MPI Processes grid [n_proc_x, n_proc_y, n_proc_z] int n_proc_x; diff --git a/src/grid/cuda_boundaries.cu b/src/grid/cuda_boundaries.cu index baf846d3c..715840322 100644 --- a/src/grid/cuda_boundaries.cu +++ b/src/grid/cuda_boundaries.cu @@ -318,7 +318,7 @@ __global__ void Wind_Boundary_kernel(Real *c_device, int nx, int ny, int nz, int d_0 = n_0 * mu * MP / DENSITY_UNIT; P_0 = n_0 * KB * T_0 / PRESSURE_UNIT; - vx = 100 * TIME_UNIT / KPC; // km/s * (cholla unit conversion) + vx = 1000 * TIME_UNIT / KPC; // km/s * (cholla unit conversion) vy = 0.0; vz = 0.0; diff --git a/src/grid/grid3D.cpp b/src/grid/grid3D.cpp index 93499fea0..fc65279d5 100644 --- a/src/grid/grid3D.cpp +++ b/src/grid/grid3D.cpp @@ -29,6 +29,9 @@ #ifdef CLOUDY_COOL #include "../cooling/load_cloudy_texture.h" // provides Load_Cuda_Textures and Free_Cuda_Textures #endif +#ifdef CLOUD_TRACKING + #include "../cloud_tracking/cloud_tracking.h" +#endif #ifdef PARALLEL_OMP #include "../utils/parallel_omp.h" @@ -464,6 +467,26 @@ void Grid3D::Execute_Hydro_Integrator(void) } else if (H.nx > 1 && H.ny > 1 && H.nz > 1) // 3D { #ifdef CUDA + #ifdef CLOUD_TRACKING + // ==Subtract average cloud velocity from grid== + // Variables to store the mass-averaged cloud velocity and total cloud mass + Real velocity_cloud; + Real integrand, density_cloud_tot, mass_cloud_tot; + Real density; + // Do the grid-wide reduction to get the sum of rho*vx and total density for the entire cloud + Cloud_Frame_Update(C.device, H.nx, H.ny, H.nz, H.dx, H.dy, H.dz, H.n_ghost, H.n_fields, H.dt, gama, + H.density_cloud_init, &integrand, &density_cloud_tot, &mass_cloud_tot); + + // chprintf("Cloud mass = %e\n", mass_cloud_tot); + velocity_cloud = integrand / mass_cloud_tot; + Update_Grid_Velocities(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, H.dt, gama, velocity_cloud, + density_cloud_tot, mass_cloud_tot); + // chprintf("Cloud frame update = %d\n", state); + chprintf("Average cloud velocity = %e\n", velocity_cloud); + chprintf("Integrand = %e\n", integrand); + chprintf("Mass cloud = %e\n", mass_cloud_tot); + #endif // CLOUD_TRACKING + #ifdef VL VL_Algorithm_3D_CUDA(C.device, C.d_Grav_potential, H.nx, H.ny, H.nz, x_off, y_off, z_off, H.n_ghost, H.dx, H.dy, H.dz, H.xbound, H.ybound, H.zbound, H.dt, H.n_fields, H.custom_grav, density_floor, U_floor, @@ -527,6 +550,24 @@ Real Grid3D::Update_Hydro_Grid() Dust_Update(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, H.dt, gama); #endif // DUST + #ifdef CLOUD_TRACKING + // ==Subtract average cloud velocity from grid== + // Variables to store the mass-averaged cloud velocity and total cloud mass + Real velocity_cloud; + Real integrand, density_cloud_tot, mass_cloud_tot; + Real density; + // Do the grid-wide reduction to get the sum of 
rho*vx and total density for the entire cloud + Cloud_Frame_Update(C.device, H.nx, H.ny, H.nz, H.dx, H.dy, H.dz, H.n_ghost, H.n_fields, H.dt, gama, + H.density_cloud_init, &integrand, &density_cloud_tot, &mass_cloud_tot); + velocity_cloud = integrand / mass_cloud_tot; + Update_Grid_Velocities(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, H.dt, gama, velocity_cloud, + density_cloud_tot, mass_cloud_tot); + // chprintf("Cloud frame update = %d\n", state); + chprintf("Average cloud velocity = %e\n", velocity_cloud); + chprintf("Integrand = %e\n", integrand); + chprintf("Mass cloud = %e\n", mass_cloud_tot); + #endif // CLOUD_TRACKING + #endif // CUDA #ifdef CHEMISTRY_GPU @@ -536,6 +577,7 @@ Real Grid3D::Update_Hydro_Grid() Timer.Chemistry.RecordTime(Chem.H.runtime_chemistry_step); non_hydro_elapsed_time += Chem.H.runtime_chemistry_step; #endif + C.HI_density = &C.host[H.n_cells * grid_enum::HI_density]; C.HII_density = &C.host[H.n_cells * grid_enum::HII_density]; C.HeI_density = &C.host[H.n_cells * grid_enum::HeI_density]; diff --git a/src/grid/grid3D.h b/src/grid/grid3D.h index 5354acf0a..554471c6e 100644 --- a/src/grid/grid3D.h +++ b/src/grid/grid3D.h @@ -214,6 +214,10 @@ struct Header { Real min_dt_slow; #endif +#ifdef CLOUD_TRACKING + Real density_cloud_init; +#endif // CLOUD_TRACKING + /*! \var t_wall * \brief Wall time */ Real t_wall; @@ -675,7 +679,7 @@ class Grid3D * gravitational collapse */ void Spherical_Overdensity_3D(); - void Clouds(); + void Clouds(struct Parameters P); void Uniform_Grid(); diff --git a/src/grid/initial_conditions.cpp b/src/grid/initial_conditions.cpp index 30e3eb459..f25694c5b 100644 --- a/src/grid/initial_conditions.cpp +++ b/src/grid/initial_conditions.cpp @@ -68,7 +68,7 @@ void Grid3D::Set_Initial_Conditions(Parameters P) } else if (strcmp(P.init, "Spherical_Overdensity_3D") == 0) { Spherical_Overdensity_3D(); } else if (strcmp(P.init, "Clouds") == 0) { - Clouds(); + Clouds(P); } else if (strcmp(P.init, "Read_Grid") == 0) { #ifndef ONLY_PARTICLES Read_Grid(P); @@ -1310,12 +1310,12 @@ void Grid3D::Spherical_Overdensity_3D() /*! \fn void Clouds() * \brief Bunch of clouds. 
 */
-void Grid3D::Clouds()
+void Grid3D::Clouds(struct Parameters P)
 {
   int i, j, k, id;
   int istart, jstart, kstart, iend, jend, kend;
   Real x_pos, y_pos, z_pos;
-  Real n_bg, n_cl;      // background and cloud number density
+  Real n_bg;            // background number density
   Real rho_bg, rho_cl;  // background and cloud density
   Real vx_bg, vx_cl;    // background and cloud velocity
   Real vy_bg, vy_cl;
@@ -1324,7 +1324,7 @@ void Grid3D::Clouds()
   Real p_bg, p_cl;       // background and cloud pressure
   Real mu = 0.6;         // mean atomic weight
   int N_cl = 1;          // number of clouds
-  Real R_cl = 2.5;       // cloud radius in code units (kpc)
+  Real R_cl = 0.001;     // cloud radius in code units (kpc)
   Real cl_pos[N_cl][3];  // array of cloud positions
   Real r;
 
@@ -1339,17 +1339,19 @@ void Grid3D::Clouds()
 
   // single centered cloud setup
   for (int nn = 0; nn < N_cl; nn++) {
-    cl_pos[nn][0] = 0.5 * H.xdglobal;
+    cl_pos[nn][0] = 0.1 * H.xdglobal;
     cl_pos[nn][1] = 0.5 * H.ydglobal;
     cl_pos[nn][2] = 0.5 * H.zdglobal;
     printf("Cloud positions: %f %f %f\n", cl_pos[nn][0], cl_pos[nn][1], cl_pos[nn][2]);
   }
 
-  n_bg = 1.68e-4;
-  n_cl = 5.4e-2;
+  n_bg   = 1e-2;
+  rho_cl = 1e-24 / DENSITY_UNIT;
+#ifdef CLOUD_TRACKING
+  rho_cl = P.density_cloud_init / DENSITY_UNIT;
+#endif
   rho_bg = n_bg * mu * MP / DENSITY_UNIT;
-  rho_cl = n_cl * mu * MP / DENSITY_UNIT;
-  vx_bg  = 0.0;
+  vx_bg = 1000 * TIME_UNIT / KPC;
   // vx_c = -200*TIME_UNIT/KPC; // convert from km/s to kpc/kyr
   vx_cl  = 0.0;
   vy_bg = vy_cl = 0.0;
diff --git a/src/utils/reduction_utilities.cu b/src/utils/reduction_utilities.cu
index 518572cd2..5b31d0b2a 100644
--- a/src/utils/reduction_utilities.cu
+++ b/src/utils/reduction_utilities.cu
@@ -11,6 +11,7 @@
 // External Includes
 
 // Local Includes
+#include "../utils/DeviceVector.h"
 #include "../utils/reduction_utilities.h"
 
 #ifdef CUDA
@@ -39,5 +40,27 @@ __global__ void kernelReduceMax(Real* in, Real* out, size_t N)
   gridReduceMax(maxVal, out);
 }
 // =====================================================================
+
+// =====================================================================
+__global__ void Kernel_Reduce_Add(Real* in, Real* out, size_t N)
+{
+  __shared__ Real sum_stride[TPB];  // array shared between each block
+
+  // Each thread zeroes only its own slot so it cannot race with threads that
+  // have already started accumulating into theirs
+  sum_stride[threadIdx.x] = 0;
+
+  // Grid stride loop to read global array into shared block-wide array
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
+    sum_stride[threadIdx.x] += in[i];
+  }
+  __syncthreads();
+
+  Grid_Reduction_Add(sum_stride[threadIdx.x], out);
+  if (threadIdx.x == 0) {
+    printf("kernel: %f\n", *out);
+  }
+}
+// =====================================================================
 }  // namespace reduction_utilities
 #endif  // CUDA
\ No newline at end of file
diff --git a/src/utils/reduction_utilities.h b/src/utils/reduction_utilities.h
index e47f72d26..66288dffd 100644
--- a/src/utils/reduction_utilities.h
+++ b/src/utils/reduction_utilities.h
@@ -44,6 +44,24 @@ __inline__ __device__ Real warpReduceMax(Real val)
 }
 // =====================================================================
 
+// =====================================================================
+/*!
+ * \brief Perform a reduction within the warp/wavefront to find the
+ * sum of `val`
+ *
+ * \param[in] val The thread local variable to be summed across
+ * the warp
+ * \return Real The sum of `val` across the warp
+ */
+__inline__ __device__ Real Warp_Reduction_Add(Real val)
+{
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+    val += __shfl_down(val, offset);
+  }
+  return val;
+}
+// =====================================================================
+
 // =====================================================================
 /*!
  * \brief Perform a reduction within the block to find the maximum value
@@ -79,6 +97,43 @@ __inline__ __device__ Real blockReduceMax(Real val)
 
   return val;
 }
+// =====================================================================
+
+// =====================================================================
+/*!
+ * \brief Perform a reduction within the block to find the sum
+ * of `val`
+ *
+ * \param[in] val The thread local variable to be summed across
+ * the block
+ * \return Real The sum of `val` across the block
+ */
+__inline__ __device__ Real Block_Reduction_Add(Real val)
+{
+  // Shared memory for storing the results of each warp-wise partial
+  // reduction
+  __shared__ Real shared[::maxWarpsPerBlock];
+
+  int lane   = threadIdx.x % warpSize;  // thread ID within the warp,
+  int warpId = threadIdx.x / warpSize;  // ID of the warp itself
+
+  val = Warp_Reduction_Add(val);  // Each warp performs partial reduction
+
+  if (lane == 0) {
+    shared[warpId] = val;
+  }  // Write reduced value to shared memory
+
+  __syncthreads();  // Wait for all partial reductions
+
+  // read from shared memory only if that warp existed
+  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
+
+  if (warpId == 0) {
+    val = Warp_Reduction_Add(val);
+  }  // Final reduce within first warp
+
+  return val;
+}
 // =====================================================================
 
 #ifndef O_HIP
@@ -197,6 +252,24 @@ inline __device__ double atomicMaxBits(double* address, double val)
   #endif  // O_HIP
 }
 
+/*!
+ * \brief Perform an atomic reduction to find the sum of `val`
+ *
+ * \param[out] address The pointer to where to store the reduced scalar
+ * value in device memory
+ * \param[in] val The thread local variable to add to the sum across
+ * the grid. Typically this should be a partial reduction that has
+ * already been reduced to the block level
+ */
+inline __device__ double Atomic_Add_Bits(double* address, double val)
+{
+  #ifdef O_HIP
+  return atomicAdd(address, val);
+  #else   // O_HIP
+  return atomicAdd(address, val);
+  #endif
+}
+
 /*!
  * \brief Perform an atomic reduction to find the minimum value of `val`
  *
@@ -263,7 +336,7 @@ inline __device__ double atomicMinBits(double* address, double val)
  * after this function call you cannot use the reduced value in global
  * memory since there is no grid wide sync. You can get around this by
  * either launching a second kernel to do the next steps or by using
- * cooperative groups to perform a grid wide sync. During it's execution
+ * cooperative groups to perform a grid wide sync. During its execution
 * it also calls multiple __synchThreads and so cannot be called from
 * within any kind of thread guard.
 *
@@ -284,6 +357,46 @@ __inline__ __device__ void gridReduceMax(Real val, Real* out)
 }
 // =====================================================================
 
+// =====================================================================
+/*!
+ * \brief Perform a grid-wide reduction to find the sum of `val`
+ *
+ * \details This function performs a reduction to find the sum
+ * across the entire grid. It relies on a
+ * warp-wise reduction using registers followed by a block-wise reduction
+ * using shared memory, and finally a grid-wise reduction using atomics.
+ * As a result the performance of this function is substantially improved
+ * by using as many threads per block as possible and as few blocks as
+ * possible since each block has to perform an atomic operation. To
+ * accomplish this it is recommended that you use the
+ * `AutomaticLaunchParams` functions to get the optimal number of blocks
+ * and threads per block to launch rather than relying on Cholla defaults,
+ * and then use a grid-stride loop within the kernel to make sure the
+ * kernel works with any combination of threads and blocks. Note that
+ * after this function call you cannot use the reduced value in global
+ * memory since there is no grid wide sync. You can get around this by
+ * either launching a second kernel to do the next steps or by using
+ * cooperative groups to perform a grid wide sync. During its execution
+ * it also calls __syncthreads multiple times and so cannot be called from
+ * within any kind of thread guard.
+ *
+ * \param[in] val The thread local variable to sum across
+ * the grid
+ * \param[out] out The pointer to where to store the reduced scalar value
+ * in device memory
+ */
+__inline__ __device__ void Grid_Reduction_Add(Real val, Real* out)
+{
+  // Reduce the entire block in parallel
+  val = Block_Reduction_Add(val);
+
+  // Write block level reduced value to the output scalar atomically
+  if (threadIdx.x == 0) {
+    Atomic_Add_Bits(out, val);
+  }
+}
+// =====================================================================
+
 // =====================================================================
 /*!
  * \brief Find the maximum value in the array.  Make sure to initialize
@@ -303,6 +416,8 @@ __inline__ __device__ void gridReduceMax(Real val, Real* out)
  * \param[in] N The size of the `in` array
  */
 __global__ void kernelReduceMax(Real* in, Real* out, size_t N);
+
+__global__ void Kernel_Reduce_Add(Real* in, Real* out, size_t N);
 // =====================================================================
 }  // namespace reduction_utilities
 #endif  // CUDA
diff --git a/src/utils/reduction_utilities_tests.cu b/src/utils/reduction_utilities_tests.cu
index 5dd18c197..b6653010c 100644
--- a/src/utils/reduction_utilities_tests.cu
+++ b/src/utils/reduction_utilities_tests.cu
@@ -65,6 +65,47 @@ TEST(tALLKernelReduceMax, CorrectInputExpectCorrectOutput)
   // Perform comparison
   testing_utilities::Check_Results(maxValue, dev_max.at(0), "maximum value found");
 }
-// =============================================================================
-// Tests for divergence max reduction
-// =============================================================================
+
+TEST(tALLKernelReduceSum, CorrectInputExpectCorrectOutput)
+{
+  // Launch parameters
+  // =================
+  cuda_utilities::AutomaticLaunchParams static const launch_params(reduction_utilities::Kernel_Reduce_Add);
+
+  // Grid Parameters & testing parameters
+  // ====================================
+  size_t const size = std::pow(64, 3);
+  std::vector<Real> host_grid(size);  // host copy of array to be summed
+  std::vector<Real> host_sum(1);      // variable to store result of sum reduction
+
+  host_sum[0] = 5.0;
+  // Fill grid with ones
+  for (Real& host_data : host_grid) {
+    host_data = 1;
+  }
+
+  // Allocating and copying to device
+  // ================================
+  cuda_utilities::DeviceVector<Real> dev_grid(host_grid.size());
+  dev_grid.cpyHostToDevice(host_grid);
+
+  cuda_utilities::DeviceVector<Real> static dev_sum(1);
+  dev_sum.assign(0, 0);
+
+  // Do the reduction
+  // ================
+  // .data() passes the kernel a pointer
+  hipLaunchKernelGGL(reduction_utilities::Kernel_Reduce_Add, launch_params.numBlocks, launch_params.threadsPerBlock, 0,
+                     0, dev_grid.data(), dev_sum.data(), host_grid.size());
+  cudaDeviceSynchronize();
+  GPU_Error_Check();
+
+  dev_sum.cpyDeviceToHost(host_sum);
+  GPU_Error_Check();
+
+  printf("size: %zu\n", size);
+  printf("sum: %e\n", host_sum[0]);
+
+  // Perform comparison
+  testing_utilities::Check_Results(size, host_sum[0], "sum found");
+}
\ No newline at end of file
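Note on the quantity the new kernels accumulate: the frame-of-reference shift is built around the mass-averaged cloud x-velocity of Shin et al. (2008, eq. 9), v_cl = sum(rho_i * v_x,i * dV) / sum(rho_i * dV), taken over cells whose density exceeds one third of the initial cloud density. The sketch below is a serial, host-side reference for that average. It is not part of the patch; it only illustrates what Cloud_Frame_Update is meant to return and could back a unit test. The function name Reference_Cloud_Velocity, the flattened density/velocity_x arrays, and the cell_volume and threshold arguments are illustrative assumptions, not existing Cholla APIs.

```cpp
// Hypothetical serial reference for the mass-averaged cloud velocity
// (Shin et al. 2008, eq. 9). Not part of the patch; intended only to
// cross-check what Cloud_Tracking_Kernel / Cloud_Frame_Update accumulate.
#include <cstddef>
#include <vector>

using Real = double;  // Cholla's Real is double at PRECISION=2

// density and velocity_x are flattened arrays over the real (non-ghost) cells;
// cell_volume = dx * dy * dz; threshold mirrors (1/3) * density_cloud_init in code units.
Real Reference_Cloud_Velocity(std::vector<Real> const &density, std::vector<Real> const &velocity_x,
                              Real cell_volume, Real threshold)
{
  Real momentum_sum = 0.0;  // sum of rho * v_x * dV over cloud cells
  Real mass_sum     = 0.0;  // sum of rho * dV over cloud cells
  for (std::size_t i = 0; i < density.size(); i++) {
    if (density[i] > threshold) {
      momentum_sum += density[i] * velocity_x[i] * cell_volume;
      mass_sum     += density[i] * cell_volume;
    }
  }
  // Guard against an empty cloud selection
  return (mass_sum > 0.0) ? momentum_sum / mass_sum : 0.0;
}
```

Comparing this reference against the velocity produced by the GPU reduction on the same data would make it straightforward to catch thresholding or reduction-ordering regressions in a test alongside tALLKernelReduceSum.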