diff --git a/Makefile b/Makefile index c444ae4a8..3b615f1d5 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ DIRS := src src/analysis src/chemistry_gpu src/cooling src/cooling_grackle s src/cpu src/global src/gravity src/gravity/paris src/grid src/hydro \ src/integrators src/io src/main.cpp src/main_tests.cpp src/mhd\ src/model src/mpi src/old_cholla src/particles src/reconstruction \ - src/riemann_solvers src/system_tests src/utils src/dust + src/riemann_solvers src/system_tests src/utils src/dust src/cloud_tracking SUFFIX ?= .$(TYPE).$(MACHINE) diff --git a/builds/make.type.dust b/builds/make.type.dust index 24a765302..31629c566 100644 --- a/builds/make.type.dust +++ b/builds/make.type.dust @@ -9,7 +9,7 @@ MPI_GPU ?= DFLAGS += -DCUDA DFLAGS += -DMPI_CHOLLA DFLAGS += -DPRECISION=2 -DFLAGS += -DPLMP +DFLAGS += -DPPMP DFLAGS += -DHLLC DFLAGS += -DDE @@ -36,6 +36,8 @@ DFLAGS += -DCLOUDY_COOLING DFLAGS += -DSLICES DFLAGS += -DPROJECTION +DFLAGS += -DCLOUD_TRACKING + DFLAGS += $(OUTPUT) DFLAGS += -DOUTPUT_ALWAYS diff --git a/builds/make.type.hydro b/builds/make.type.hydro index f34d78172..7d006ac10 100644 --- a/builds/make.type.hydro +++ b/builds/make.type.hydro @@ -11,18 +11,20 @@ DFLAGS += -DSIMPLE #DFLAGS += -DVL # Apply a density and temperature floor -DFLAGS += -DDENSITY_FLOOR +# DFLAGS += -DDENSITY_FLOOR DFLAGS += -DTEMPERATURE_FLOOR # Solve the Gas Internal Energy usisng a Dual Energy Formalism #DFLAGS += -DDE # Apply cooling on the GPU from precomputed tables -#DFLAGS += -DCOOLING_GPU +DFLAGS += -DCOOLING_GPU # Measure the Timing of the different stages #DFLAGS += -DCPU_TIME +#DFLAGS += -DCLOUD_TRACKING + # Select output format # Can also add -DSLICES and -DPROJECTIONS OUTPUT ?= -DOUTPUT -DHDF5 diff --git a/src/cloud_tracking/cloud_tracking.cu b/src/cloud_tracking/cloud_tracking.cu new file mode 100644 index 000000000..4e8acc752 --- /dev/null +++ b/src/cloud_tracking/cloud_tracking.cu @@ -0,0 +1,159 @@ +/*! 
+ * \file cloud_tracking.cu
+ * \author Helena Richie (helenarichie@gmail.com)
+ * \brief Contains the kernels that do frame of reference tracking to track clouds.
+ */
+
+#ifdef CLOUD_TRACKING
+
+  // STL includes
+  #include <math.h>
+
+  #include <cstdio>
+  #include <fstream>
+  #include <vector>
+
+// Local includes
+
+  #include "../cloud_tracking/cloud_tracking.h"
+  #include "../global/global.h"
+  #include "../global/global_cuda.h"
+  #include "../grid/grid3D.h"
+  #include "../grid/grid_enum.h"
+  #include "../utils/DeviceVector.h"
+  #include "../utils/cuda_utilities.h"
+  #include "../utils/gpu.hpp"
+  #include "../utils/hydro_utilities.h"
+  #include "../utils/reduction_utilities.h"
+
+void Cloud_Frame_Update(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz, int n_ghost,
+                        int n_fields, Real dt, Real gamma, Real density_cloud_init, Real *integrand,
+                        Real *density_cloud_tot, Real *mass_cloud_tot)
+{
+  // cuda_utilities::AutomaticLaunchParams static const launchParams(Cloud_Tracking_Kernel)
+
+  int n_cells = nx * ny * nz;
+  int ngrid   = (n_cells + TPB - 1) / TPB;
+  dim3 dim1dGrid(ngrid, 1, 1);
+  dim3 dim1dBlock(TPB, 1, 1);
+
+  // Allocate the device memory
+  cuda_utilities::DeviceVector<Real> static integrand_cloud(1, true);
+  cuda_utilities::DeviceVector<Real> static density_cloud(1, true);
+  cuda_utilities::DeviceVector<Real> static mass_cloud(1, true);
+
+  cuda_utilities::AutomaticLaunchParams static const launchParams(Cloud_Tracking_Kernel);
+
+  // printf("TPB: %d\n", TPB);
+  // printf("ngrid: %d\n", ngrid);
+  // printf("nx, ny, nz: %d %d %d\n", nx, ny, nz);
+  // printf("ngrid: %d\n", n_cells);
+  // printf("dim1dGrid: %d\n", dim1dGrid);
+  // printf("dim1dBlock: %d\n", dim1dBlock);
+  // printf("numBLocks: %d\n", launchParams.numBlocks);
+  // printf("threadsPerBLock: %d\n", launchParams.threadsPerBlock);
+
+  hipLaunchKernelGGL(Cloud_Tracking_Kernel, launchParams.numBlocks, launchParams.threadsPerBlock, 0, 0, dev_conserved,
+                     nx, ny, nz, dx, dy, dz, n_ghost, n_fields, dt, gamma, density_cloud_init, integrand_cloud.data(),
+                     density_cloud.data(), mass_cloud.data());
+  GPU_Error_Check();
+
+  *integrand         = integrand_cloud[0];
+  *density_cloud_tot = density_cloud[0];
+  *mass_cloud_tot    = mass_cloud[0];
+}
+
+__global__ void Cloud_Tracking_Kernel(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz,
+                                      int n_ghost, int n_fields, Real dt, Real gamma, Real density_cloud_init,
+                                      Real *integrand_cloud, Real *density_cloud, Real *mass_cloud)
+{
+  int xid, yid, zid, n_cells;
+  n_cells = nx * ny * nz;
+  __shared__ Real density_stride[TPB];
+  __shared__ Real velocity_x_stride[TPB];
+  __shared__ Real mass_stride[TPB];
+  Real density, velocity_x, mass;
+
+  // Each thread zeroes only its own slot of the shared arrays so that it
+  // cannot race with a thread that has already started accumulating into its slot
+  density_stride[threadIdx.x]    = 0;
+  velocity_x_stride[threadIdx.x] = 0;
+  mass_stride[threadIdx.x]       = 0;
+
+  // Grid stride loop to perform as much of the reduction as possible. The
+  // fact that `id` has type `size_t` is important. I'm not totally sure why,
+  // but setting it to int results in some kind of silent over/underflow issue
+  // even though we're not hitting those kinds of numbers.
+  // Setting it to uint or size_t fixes it.
+  for (size_t id = threadIdx.x + blockIdx.x * blockDim.x; id < n_cells; id += blockDim.x * gridDim.x) {
+    // get a global thread ID
+    cuda_utilities::compute3DIndices(id, nx, ny, xid, yid, zid);
+
+    // threads corresponding to real cells do the calculation
+    if (xid > n_ghost - 1 && xid < nx - n_ghost && yid > n_ghost - 1 && yid < ny - n_ghost && zid > n_ghost - 1 &&
+        zid < nz - n_ghost) {
+      density    = dev_conserved[id + n_cells * grid_enum::density];
+      velocity_x = dev_conserved[id + n_cells * grid_enum::momentum_x] / density;
+      mass       = density * dx * dy * dz;
+
+      if (density > ((1.0 / 3.0) * (density_cloud_init / DENSITY_UNIT))) {
+        // printf("inside if statement: %d %e\n", id, mass);
+        density_stride[threadIdx.x] += density;
+        velocity_x_stride[threadIdx.x] += velocity_x;
+        mass_stride[threadIdx.x] += mass;
+        // Do grid-wide reduction to compute mass-averaged cloud velocity (Shin et al. 2008, eq. 9)
+      }
+    }
+  }
+  __syncthreads();
+
+  reduction_utilities::Grid_Reduction_Add(density_stride[threadIdx.x] * velocity_x_stride[threadIdx.x],
+                                          integrand_cloud);
+  reduction_utilities::Grid_Reduction_Add(density_stride[threadIdx.x], density_cloud);
+  reduction_utilities::Grid_Reduction_Add(mass_stride[threadIdx.x], mass_cloud);
+}
+
+void Update_Grid_Velocities(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt, Real gamma,
+                            Real velocity_cloud, Real density_cloud_tot, Real mass_cloud_tot)
+{
+  // cuda_utilities::AutomaticLaunchParams static const launchParams(Velocity_Update);
+  int n_cells = nx * ny * nz;
+  int ngrid   = (n_cells + TPB - 1) / TPB;
+  dim3 dim1dGrid(ngrid, 1, 1);
+  dim3 dim1dBlock(TPB, 1, 1);
+
+  hipLaunchKernelGGL(Velocity_Update, dim1dGrid, dim1dBlock, 0, 0, dev_conserved, nx, ny, nz, n_ghost, n_fields, dt,
+                     gamma, velocity_cloud, density_cloud_tot, mass_cloud_tot);
+  GPU_Error_Check();
+}
+
+__global__ void Velocity_Update(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt,
+                                Real gamma, Real velocity_cloud, Real density_cloud_tot, Real mass_cloud_tot)
+{
+  // get grid indices
+  int n_cells = nx * ny * nz;
+  int is, ie, js, je, ks, ke;
+  cuda_utilities::Get_Real_Indices(n_ghost, nx, ny, nz, is, ie, js, je, ks, ke);
+  // get a global thread ID
+  int blockId = blockIdx.x + blockIdx.y * gridDim.x;
+  int id      = threadIdx.x + blockId * blockDim.x;
+  int id_z    = id / (nx * ny);
+  int id_y    = (id - id_z * nx * ny) / nx;
+  int id_x    = id - id_z * nx * ny - id_y * nx;
+
+  Real density, momentum_x, velocity_x, energy;
+
+  Real energy_kinetic_cloud = 0.5 * mass_cloud_tot * pow(velocity_cloud, 2);
+
+  // threads corresponding to real cells do the calculation
+  if (id_x >= is && id_x < ie && id_y >= js && id_y < je && id_z >= ks && id_z < ke) {
+    density    = dev_conserved[id + n_cells * grid_enum::density];
+    momentum_x = dev_conserved[id + n_cells * grid_enum::momentum_x];
+    velocity_x = momentum_x / density;
+    energy     = dev_conserved[id + n_cells * grid_enum::Energy];
+    dev_conserved[id + n_cells * grid_enum::momentum_x] = (velocity_x - velocity_cloud) * density;
+    dev_conserved[id + n_cells * grid_enum::Energy]     = energy - energy_kinetic_cloud;
+  }
+}
+
+#endif  // CLOUD_TRACKING
diff --git a/src/cloud_tracking/cloud_tracking.h b/src/cloud_tracking/cloud_tracking.h
new file mode 100644
index 000000000..913fa4031
--- /dev/null
+++ b/src/cloud_tracking/cloud_tracking.h
@@ -0,0 +1,31 @@
+/*!
+ * \file cloud_tracking.h
+ * \author Helena Richie (helenarichie@pitt.edu)
+ * \brief Contains the declarations for the kernels that do frame of reference tracking to track clouds.
+ *
+ */
+#ifdef CLOUD_TRACKING
+  #ifndef CLOUD_TRACKING_CUDA_H
+  #define CLOUD_TRACKING_CUDA_H
+
+  #include <math.h>
+
+  #include "../global/global.h"
+  #include "../utils/gpu.hpp"
+
+void Cloud_Frame_Update(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz, int n_ghost,
+                        int n_fields, Real dt, Real gamma, Real density_cloud_init, Real *integrand,
+                        Real *density_cloud_tot, Real *mass_cloud_tot);
+
+__global__ void Cloud_Tracking_Kernel(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz,
+                                      int n_ghost, int n_fields, Real dt, Real gamma, Real density_cloud_init,
+                                      Real *integrand_cloud, Real *density_cloud, Real *mass_cloud);
+
+void Update_Grid_Velocities(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt, Real gamma,
+                            Real velocity_cloud, Real density_cloud_tot, Real mass_cloud_tot);
+
+__global__ void Velocity_Update(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt,
+                                Real gamma, Real velocity_cloud, Real density_cloud_tot, Real mass_cloud_tot);
+
+  #endif  // CLOUD_TRACKING_CUDA_H
+#endif  // CLOUD_TRACKING
\ No newline at end of file
diff --git a/src/global/global.cpp b/src/global/global.cpp
index 6579d9955..a9ff0384d 100644
--- a/src/global/global.cpp
+++ b/src/global/global.cpp
@@ -101,10 +101,11 @@ char *Trim(char *s)
 }
 
 // NOLINTNEXTLINE(cert-err58-cpp)
-const std::set<const char *> optionalParams = {
-    "flag_delta", "ddelta_dt", "n_delta", "Lz", "Lx", "phi", "theta",
-    "delta", "nzr", "nxr", "H0", "Omega_M", "Omega_L", "Init_redshift",
-    "End_redshift", "tile_length", "n_proc_x", "n_proc_y", "n_proc_z"};
+const std::set<const char *> optionalParams = {"flag_delta", "ddelta_dt", "n_delta", "Lz",
+                                               "Lx", "phi", "theta", "delta",
+                                               "nzr", "nxr", "H0", "Omega_M",
+                                               "Omega_L", "Init_redshift", "End_redshift", "tile_length",
+                                               "n_proc_x", "n_proc_y", "n_proc_z", "density_cloud_init"};
 
 /*! \fn int Is_Param_Valid(char *name);
  * \brief Verifies that a param is valid (even if not needed).
Avoids @@ -453,6 +454,10 @@ void Parse_Param(char *name, char *value, struct Parameters *parms) } else if (strcmp(name, "skewersdir") == 0) { strncpy(parms->skewersdir, value, MAXLEN); #endif +#endif +#ifdef CLOUD_TRACKING + } else if (strcmp(name, "density_cloud_init") == 0) { + parms->density_cloud_init = atof(value); #endif } else if (!Is_Param_Valid(name)) { chprintf("WARNING: %s/%s: Unknown parameter/value pair!\n", name, value); diff --git a/src/global/global.h b/src/global/global.h index 6c9524001..5f3dc36a7 100644 --- a/src/global/global.h +++ b/src/global/global.h @@ -309,7 +309,9 @@ struct Parameters { #ifdef TILED_INITIAL_CONDITIONS Real tile_length; #endif // TILED_INITIAL_CONDITIONS - +#ifdef CLOUD_TRACKING + Real density_cloud_init; +#endif #ifdef SET_MPI_GRID // Set the MPI Processes grid [n_proc_x, n_proc_y, n_proc_z] int n_proc_x; diff --git a/src/grid/cuda_boundaries.cu b/src/grid/cuda_boundaries.cu index baf846d3c..715840322 100644 --- a/src/grid/cuda_boundaries.cu +++ b/src/grid/cuda_boundaries.cu @@ -318,7 +318,7 @@ __global__ void Wind_Boundary_kernel(Real *c_device, int nx, int ny, int nz, int d_0 = n_0 * mu * MP / DENSITY_UNIT; P_0 = n_0 * KB * T_0 / PRESSURE_UNIT; - vx = 100 * TIME_UNIT / KPC; // km/s * (cholla unit conversion) + vx = 1000 * TIME_UNIT / KPC; // km/s * (cholla unit conversion) vy = 0.0; vz = 0.0; diff --git a/src/grid/grid3D.cpp b/src/grid/grid3D.cpp index 93499fea0..fc65279d5 100644 --- a/src/grid/grid3D.cpp +++ b/src/grid/grid3D.cpp @@ -29,6 +29,9 @@ #ifdef CLOUDY_COOL #include "../cooling/load_cloudy_texture.h" // provides Load_Cuda_Textures and Free_Cuda_Textures #endif +#ifdef CLOUD_TRACKING + #include "../cloud_tracking/cloud_tracking.h" +#endif #ifdef PARALLEL_OMP #include "../utils/parallel_omp.h" @@ -464,6 +467,26 @@ void Grid3D::Execute_Hydro_Integrator(void) } else if (H.nx > 1 && H.ny > 1 && H.nz > 1) // 3D { #ifdef CUDA + #ifdef CLOUD_TRACKING + // ==Subtract average cloud velocity from grid== + // Variables to store the mass-averaged cloud velocity and total cloud mass + Real velocity_cloud; + Real integrand, density_cloud_tot, mass_cloud_tot; + Real density; + // Do the grid-wide reduction to get the sum of rho*vx and total density for the entire cloud + Cloud_Frame_Update(C.device, H.nx, H.ny, H.nz, H.dx, H.dy, H.dz, H.n_ghost, H.n_fields, H.dt, gama, + H.density_cloud_init, &integrand, &density_cloud_tot, &mass_cloud_tot); + + // chprintf("Cloud mass = %e\n", mass_cloud_tot); + velocity_cloud = integrand / mass_cloud_tot; + Update_Grid_Velocities(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, H.dt, gama, velocity_cloud, + density_cloud_tot, mass_cloud_tot); + // chprintf("Cloud frame update = %d\n", state); + chprintf("Average cloud velocity = %e\n", velocity_cloud); + chprintf("Integrand = %e\n", integrand); + chprintf("Mass cloud = %e\n", mass_cloud_tot); + #endif // CLOUD_TRACKING + #ifdef VL VL_Algorithm_3D_CUDA(C.device, C.d_Grav_potential, H.nx, H.ny, H.nz, x_off, y_off, z_off, H.n_ghost, H.dx, H.dy, H.dz, H.xbound, H.ybound, H.zbound, H.dt, H.n_fields, H.custom_grav, density_floor, U_floor, @@ -527,6 +550,24 @@ Real Grid3D::Update_Hydro_Grid() Dust_Update(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, H.dt, gama); #endif // DUST + #ifdef CLOUD_TRACKING + // ==Subtract average cloud velocity from grid== + // Variables to store the mass-averaged cloud velocity and total cloud mass + Real velocity_cloud; + Real integrand, density_cloud_tot, mass_cloud_tot; + Real density; + // Do the grid-wide reduction to get the sum of 
rho*vx and total density for the entire cloud + Cloud_Frame_Update(C.device, H.nx, H.ny, H.nz, H.dx, H.dy, H.dz, H.n_ghost, H.n_fields, H.dt, gama, + H.density_cloud_init, &integrand, &density_cloud_tot, &mass_cloud_tot); + velocity_cloud = integrand / mass_cloud_tot; + Update_Grid_Velocities(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, H.dt, gama, velocity_cloud, + density_cloud_tot, mass_cloud_tot); + // chprintf("Cloud frame update = %d\n", state); + chprintf("Average cloud velocity = %e\n", velocity_cloud); + chprintf("Integrand = %e\n", integrand); + chprintf("Mass cloud = %e\n", mass_cloud_tot); + #endif // CLOUD_TRACKING + #endif // CUDA #ifdef CHEMISTRY_GPU @@ -536,6 +577,7 @@ Real Grid3D::Update_Hydro_Grid() Timer.Chemistry.RecordTime(Chem.H.runtime_chemistry_step); non_hydro_elapsed_time += Chem.H.runtime_chemistry_step; #endif + C.HI_density = &C.host[H.n_cells * grid_enum::HI_density]; C.HII_density = &C.host[H.n_cells * grid_enum::HII_density]; C.HeI_density = &C.host[H.n_cells * grid_enum::HeI_density]; diff --git a/src/grid/grid3D.h b/src/grid/grid3D.h index 5354acf0a..554471c6e 100644 --- a/src/grid/grid3D.h +++ b/src/grid/grid3D.h @@ -214,6 +214,10 @@ struct Header { Real min_dt_slow; #endif +#ifdef CLOUD_TRACKING + Real density_cloud_init; +#endif // CLOUD_TRACKING + /*! \var t_wall * \brief Wall time */ Real t_wall; @@ -675,7 +679,7 @@ class Grid3D * gravitational collapse */ void Spherical_Overdensity_3D(); - void Clouds(); + void Clouds(struct Parameters P); void Uniform_Grid(); diff --git a/src/grid/initial_conditions.cpp b/src/grid/initial_conditions.cpp index 30e3eb459..f25694c5b 100644 --- a/src/grid/initial_conditions.cpp +++ b/src/grid/initial_conditions.cpp @@ -68,7 +68,7 @@ void Grid3D::Set_Initial_Conditions(Parameters P) } else if (strcmp(P.init, "Spherical_Overdensity_3D") == 0) { Spherical_Overdensity_3D(); } else if (strcmp(P.init, "Clouds") == 0) { - Clouds(); + Clouds(P); } else if (strcmp(P.init, "Read_Grid") == 0) { #ifndef ONLY_PARTICLES Read_Grid(P); @@ -1310,12 +1310,12 @@ void Grid3D::Spherical_Overdensity_3D() /*! \fn void Clouds() * \brief Bunch of clouds. 
 */
-void Grid3D::Clouds()
+void Grid3D::Clouds(struct Parameters P)
 {
   int i, j, k, id;
   int istart, jstart, kstart, iend, jend, kend;
   Real x_pos, y_pos, z_pos;
-  Real n_bg, n_cl;      // background and cloud number density
+  Real n_bg;            // background number density
   Real rho_bg, rho_cl;  // background and cloud density
   Real vx_bg, vx_cl;    // background and cloud velocity
   Real vy_bg, vy_cl;
@@ -1324,7 +1324,7 @@ void Grid3D::Clouds()
   Real p_bg, p_cl;       // background and cloud pressure
   Real mu = 0.6;         // mean atomic weight
   int N_cl = 1;          // number of clouds
-  Real R_cl = 2.5;       // cloud radius in code units (kpc)
+  Real R_cl = 0.001;     // cloud radius in code units (kpc)
   Real cl_pos[N_cl][3];  // array of cloud positions
   Real r;
 
@@ -1339,17 +1339,19 @@ void Grid3D::Clouds()
 
   // single centered cloud setup
   for (int nn = 0; nn < N_cl; nn++) {
-    cl_pos[nn][0] = 0.5 * H.xdglobal;
+    cl_pos[nn][0] = 0.1 * H.xdglobal;
     cl_pos[nn][1] = 0.5 * H.ydglobal;
     cl_pos[nn][2] = 0.5 * H.zdglobal;
     printf("Cloud positions: %f %f %f\n", cl_pos[nn][0], cl_pos[nn][1], cl_pos[nn][2]);
   }
 
-  n_bg = 1.68e-4;
-  n_cl = 5.4e-2;
+  n_bg   = 1e-2;
+  rho_cl = 1e-24 / DENSITY_UNIT;
+#ifdef CLOUD_TRACKING
+  rho_cl = P.density_cloud_init / DENSITY_UNIT;
+#endif
   rho_bg = n_bg * mu * MP / DENSITY_UNIT;
-  rho_cl = n_cl * mu * MP / DENSITY_UNIT;
-  vx_bg  = 0.0;
+  vx_bg = 1000 * TIME_UNIT / KPC;
   // vx_c = -200*TIME_UNIT/KPC; // convert from km/s to kpc/kyr
   vx_cl  = 0.0;
   vy_bg = vy_cl = 0.0;
diff --git a/src/utils/reduction_utilities.cu b/src/utils/reduction_utilities.cu
index 518572cd2..5b31d0b2a 100644
--- a/src/utils/reduction_utilities.cu
+++ b/src/utils/reduction_utilities.cu
@@ -11,6 +11,7 @@
 // External Includes
 
 // Local Includes
+#include "../utils/DeviceVector.h"
 #include "../utils/reduction_utilities.h"
 
 #ifdef CUDA
@@ -39,5 +40,27 @@ __global__ void kernelReduceMax(Real* in, Real* out, size_t N)
   gridReduceMax(maxVal, out);
 }
 // =====================================================================
+
+// =====================================================================
+__global__ void Kernel_Reduce_Add(Real* in, Real* out, size_t N)
+{
+  __shared__ Real sum_stride[TPB];  // array shared between each block
+
+  // Each thread zeroes only its own slot so it cannot race with threads that
+  // have already started accumulating into theirs
+  sum_stride[threadIdx.x] = 0;
+
+  // Grid stride loop to read global array into shared block-wide array
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
+    sum_stride[threadIdx.x] += in[i];
+  }
+  __syncthreads();
+
+  Grid_Reduction_Add(sum_stride[threadIdx.x], out);
+  if (threadIdx.x == 0) {
+    printf("kernel: %f\n", *out);
+  }
+}
+// =====================================================================
 }  // namespace reduction_utilities
 #endif  // CUDA
\ No newline at end of file
diff --git a/src/utils/reduction_utilities.h b/src/utils/reduction_utilities.h
index e47f72d26..66288dffd 100644
--- a/src/utils/reduction_utilities.h
+++ b/src/utils/reduction_utilities.h
@@ -44,6 +44,24 @@ __inline__ __device__ Real warpReduceMax(Real val)
 }
 // =====================================================================
 
+// =====================================================================
+/*!
+ * \brief Perform a reduction within the warp/wavefront to find the
+ * sum of `val`
+ *
+ * \param[in] val The thread local variable to be summed across
+ * the warp
+ * \return Real The sum of `val` across the warp
+ */
+__inline__ __device__ Real Warp_Reduction_Add(Real val)
+{
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+    val += __shfl_down(val, offset);
+  }
+  return val;
+}
+// =====================================================================
+
 // =====================================================================
 /*!
  * \brief Perform a reduction within the block to find the maximum value
@@ -79,6 +97,43 @@ __inline__ __device__ Real blockReduceMax(Real val)
 
   return val;
 }
+// =====================================================================
+
+// =====================================================================
+/*!
+ * \brief Perform a reduction within the block to find the sum
+ * of `val`
+ *
+ * \param[in] val The thread local variable to be summed across
+ * the block
+ * \return Real The sum of `val` across the block
+ */
+__inline__ __device__ Real Block_Reduction_Add(Real val)
+{
+  // Shared memory for storing the results of each warp-wise partial
+  // reduction
+  __shared__ Real shared[::maxWarpsPerBlock];
+
+  int lane   = threadIdx.x % warpSize;  // thread ID within the warp,
+  int warpId = threadIdx.x / warpSize;  // ID of the warp itself
+
+  val = Warp_Reduction_Add(val);  // Each warp performs partial reduction
+
+  if (lane == 0) {
+    shared[warpId] = val;
+  }  // Write reduced value to shared memory
+
+  __syncthreads();  // Wait for all partial reductions
+
+  // read from shared memory only if that warp existed
+  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
+
+  if (warpId == 0) {
+    val = Warp_Reduction_Add(val);
+  }  // Final reduce within first warp
+
+  return val;
+}
 // =====================================================================
 
 #ifndef O_HIP
@@ -197,6 +252,24 @@ inline __device__ double atomicMaxBits(double* address, double val)
   #endif  // O_HIP
 }
 
+/*!
+ * \brief Perform an atomic reduction to find the sum of `val`
+ *
+ * \param[out] address The pointer to where to store the reduced scalar
+ * value in device memory
+ * \param[in] val The thread local variable to add to the sum across
+ * the grid. Typically this should be a partial reduction that has
+ * already been reduced to the block level
+ */
+inline __device__ double Atomic_Add_Bits(double* address, double val)
+{
+  #ifdef O_HIP
+  return atomicAdd(address, val);
+  #else   // O_HIP
+  return atomicAdd(address, val);
+  #endif
+}
+
 /*!
  * \brief Perform an atomic reduction to find the minimum value of `val`
  *
@@ -263,7 +336,7 @@ inline __device__ double atomicMinBits(double* address, double val)
  * after this function call you cannot use the reduced value in global
  * memory since there is no grid wide sync. You can get around this by
  * either launching a second kernel to do the next steps or by using
- * cooperative groups to perform a grid wide sync. During it's execution
+ * cooperative groups to perform a grid wide sync. During its execution
 * it also calls multiple __synchThreads and so cannot be called from
 * within any kind of thread guard.
 *
@@ -284,6 +357,46 @@ __inline__ __device__ void gridReduceMax(Real val, Real* out)
 }
 // =====================================================================
 
+// =====================================================================
+/*!
+ * \brief Perform a grid-wide reduction to find the sum of `val`
+ *
+ * \details This function performs a reduction to find the sum
+ * across the entire grid. It relies on a
+ * warp-wise reduction using registers followed by a block-wise reduction
+ * using shared memory, and finally a grid-wise reduction using atomics.
+ * As a result the performance of this function is substantially improved
+ * by using as many threads per block as possible and as few blocks as
+ * possible since each block has to perform an atomic operation. To
+ * accomplish this it is recommended that you use the
+ * `AutomaticLaunchParams` functions to get the optimal number of blocks
+ * and threads per block to launch rather than relying on Cholla defaults,
+ * and then use a grid-stride loop within the kernel to make sure the
+ * kernel works with any combination of threads and blocks. Note that
+ * after this function call you cannot use the reduced value in global
+ * memory since there is no grid wide sync. You can get around this by
+ * either launching a second kernel to do the next steps or by using
+ * cooperative groups to perform a grid wide sync. During its execution
+ * it also calls __syncthreads multiple times and so cannot be called from
+ * within any kind of thread guard.
+ *
+ * \param[in] val The thread local variable to sum across
+ * the grid
+ * \param[out] out The pointer to where to store the reduced scalar value
+ * in device memory
+ */
+__inline__ __device__ void Grid_Reduction_Add(Real val, Real* out)
+{
+  // Reduce the entire block in parallel
+  val = Block_Reduction_Add(val);
+
+  // Write block level reduced value to the output scalar atomically
+  if (threadIdx.x == 0) {
+    Atomic_Add_Bits(out, val);
+  }
+}
+// =====================================================================
+
 // =====================================================================
 /*!
  * \brief Find the maximum value in the array.  Make sure to initialize
@@ -303,6 +416,8 @@ __inline__ __device__ void gridReduceMax(Real val, Real* out)
  * \param[in] N The size of the `in` array
  */
 __global__ void kernelReduceMax(Real* in, Real* out, size_t N);
+
+__global__ void Kernel_Reduce_Add(Real* in, Real* out, size_t N);
 // =====================================================================
 }  // namespace reduction_utilities
 #endif  // CUDA
diff --git a/src/utils/reduction_utilities_tests.cu b/src/utils/reduction_utilities_tests.cu
index 5dd18c197..b6653010c 100644
--- a/src/utils/reduction_utilities_tests.cu
+++ b/src/utils/reduction_utilities_tests.cu
@@ -65,6 +65,47 @@ TEST(tALLKernelReduceMax, CorrectInputExpectCorrectOutput)
   // Perform comparison
   testing_utilities::Check_Results(maxValue, dev_max.at(0), "maximum value found");
 }
-// =============================================================================
-// Tests for divergence max reduction
-// =============================================================================
+
+TEST(tALLKernelReduceSum, CorrectInputExpectCorrectOutput)
+{
+  // Launch parameters
+  // =================
+  cuda_utilities::AutomaticLaunchParams static const launch_params(reduction_utilities::Kernel_Reduce_Add);
+
+  // Grid Parameters & testing parameters
+  // ====================================
+  size_t const size = std::pow(64, 3);
+  std::vector<Real> host_grid(size);  // host copy of array to be summed
+  std::vector<Real> host_sum(1);      // variable to store result of sum reduction
+
+  host_sum[0] = 5.0;
+  // Fill grid with ones
+  for (Real& host_data : host_grid) {
+    host_data = 1;
+  }
+
+  // Allocating and copying to device
+  // ================================
+  cuda_utilities::DeviceVector<Real> dev_grid(host_grid.size());
+  dev_grid.cpyHostToDevice(host_grid);
+
+  cuda_utilities::DeviceVector<Real> static dev_sum(1);
+  dev_sum.assign(0, 0);
+
+  // Do the reduction
+  // ================
+  // .data() passes the kernel a pointer
+  hipLaunchKernelGGL(reduction_utilities::Kernel_Reduce_Add, launch_params.numBlocks, launch_params.threadsPerBlock, 0,
+                     0, dev_grid.data(), dev_sum.data(), host_grid.size());
+  cudaDeviceSynchronize();
+  GPU_Error_Check();
+
+  dev_sum.cpyDeviceToHost(host_sum);
+  GPU_Error_Check();
+
+  printf("size: %zu\n", size);
+  printf("sum: %e\n", host_sum[0]);
+
+  // Perform comparison
+  testing_utilities::Check_Results(size, host_sum[0], "sum found");
+}
\ No newline at end of file
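Note on the quantity the new kernels accumulate: the frame-of-reference shift is built around the mass-averaged cloud x-velocity of Shin et al. (2008, eq. 9), v_cl = sum(rho_i * v_x,i * dV) / sum(rho_i * dV), taken over cells whose density exceeds one third of the initial cloud density. The sketch below is a serial, host-side reference for that average. It is not part of the patch; it only illustrates what Cloud_Frame_Update is meant to return and could back a unit test. The function name Reference_Cloud_Velocity, the flattened density/velocity_x arrays, and the cell_volume and threshold arguments are illustrative assumptions, not existing Cholla APIs.

```cpp
// Hypothetical serial reference for the mass-averaged cloud velocity
// (Shin et al. 2008, eq. 9). Not part of the patch; intended only to
// cross-check what Cloud_Tracking_Kernel / Cloud_Frame_Update accumulate.
#include <cstddef>
#include <vector>

using Real = double;  // Cholla's Real is double at PRECISION=2

// density and velocity_x are flattened arrays over the real (non-ghost) cells;
// cell_volume = dx * dy * dz; threshold mirrors (1/3) * density_cloud_init in code units.
Real Reference_Cloud_Velocity(std::vector<Real> const &density, std::vector<Real> const &velocity_x,
                              Real cell_volume, Real threshold)
{
  Real momentum_sum = 0.0;  // sum of rho * v_x * dV over cloud cells
  Real mass_sum     = 0.0;  // sum of rho * dV over cloud cells
  for (std::size_t i = 0; i < density.size(); i++) {
    if (density[i] > threshold) {
      momentum_sum += density[i] * velocity_x[i] * cell_volume;
      mass_sum     += density[i] * cell_volume;
    }
  }
  // Guard against an empty cloud selection
  return (mass_sum > 0.0) ? momentum_sum / mass_sum : 0.0;
}
```

Comparing this reference against the velocity produced by the GPU reduction on the same data would make it straightforward to catch thresholding or reduction-ordering regressions in a test alongside tALLKernelReduceSum.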