diff --git a/.gitmodules b/.gitmodules
index c9a26c699..e69de29bb 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "cholla-tests-data"]
-  path = cholla-tests-data
-  url = https://github.com/cholla-hydro/cholla-tests-data.git
diff --git a/Makefile b/Makefile
index 7ed600340..31e8c6380 100644
--- a/Makefile
+++ b/Makefile
@@ -11,9 +11,9 @@ CUDA_ARCH ?= sm_70

 DIRS := src src/analysis src/chemistry_gpu src/cooling src/cooling_grackle src/cosmology \
         src/cpu src/global src/gravity src/gravity/paris src/grid src/hydro \
-        src/integrators src/io src/main.cpp src/main_tests.cpp src/mhd\
+        src/integrators src/io src/main.cpp src/main_tests.cpp src/mhd \
         src/model src/mpi src/old_cholla src/particles src/reconstruction \
-        src/riemann_solvers src/system_tests src/utils src/dust src/cluster \
+        src/riemann_solvers src/system_tests src/utils src/dust src/cluster src/cloud_tracking \

 SUFFIX ?= .$(TYPE).$(MACHINE)
diff --git a/builds/make.type.dust b/builds/make.type.dust
index 1669a4077..a039eff9b 100644
--- a/builds/make.type.dust
+++ b/builds/make.type.dust
@@ -8,7 +8,7 @@ MPI_GPU ?=
 DFLAGS += -DMPI_CHOLLA
 DFLAGS += -DPRECISION=2
-DFLAGS += -DPLMC
+DFLAGS += -DPPMP
 DFLAGS += -DHLLC

 DFLAGS += -DDE
@@ -24,10 +24,11 @@ DFLAGS += -DSCALAR_FLOOR

 # Define dust macro
 DFLAGS += -DDUST
+DFLAGS += -DOUTFLOW_ANALYSIS
+# DFLAGS += -DCLOUD_TRACKING

 # Apply the cooling in the GPU from precomputed tables
 DFLAGS += -DCOOLING_GPU
-DFLAGS += -DCLOUDY_COOLING

 #Measure the Timing of the different stages
 #DFLAGS += -DCPU_TIME
@@ -37,10 +38,10 @@ DFLAGS += -DPROJECTION

 DFLAGS += $(OUTPUT)

-DFLAGS += -DOUTPUT_ALWAYS
+# DFLAGS += -DOUTPUT_ALWAYS

 #Select if the Hydro Conserved data will reside in the GPU
 #and the MPI transfers are done from the GPU
 #If not specified, MPI_GPU is off by default
 #This is set in the system make.host file
-DFLAGS += $(MPI_GPU)
\ No newline at end of file
+DFLAGS += $(MPI_GPU)
diff --git a/builds/make.type.hydro b/builds/make.type.hydro
index 9e9b1d77c..cc8bbfbe6 100644
--- a/builds/make.type.hydro
+++ b/builds/make.type.hydro
@@ -13,19 +13,26 @@ DFLAGS += -DVL
 endif

 # Apply a density and temperature floor
-DFLAGS += -DDENSITY_FLOOR
-DFLAGS += -DTEMPERATURE_FLOOR
+#DFLAGS += -DDENSITY_FLOOR
+#DFLAGS += -DTEMPERATURE_FLOOR

 # Solve the Gas Internal Energy using a Dual Energy Formalism
-#DFLAGS += -DDE
+DFLAGS += -DDE

 # Apply cooling on the GPU from precomputed tables
 #DFLAGS += -DCOOLING_GPU

+DFLAGS += -DAVERAGE_SLOW_CELLS
+
+# Apply reference frame shift to track cloud
+DFLAGS += -DCLOUD_TRACKING

 # Measure the Timing of the different stages
 #DFLAGS += -DCPU_TIME

 # Select output format
-# Can also add -DSLICES and -DPROJECTIONS
-OUTPUT ?= -DOUTPUT -DHDF5
+# Can also add -DSLICES and -DPROJECTION
+OUTPUT ?= -DOUTPUT -DHDF5
 DFLAGS += $(OUTPUT)
+
+DFLAGS += -DSLICES
+DFLAGS += -DPROJECTION
\ No newline at end of file
diff --git a/builds/make.type.m82 b/builds/make.type.m82
index f5c4ef54b..b3996715f 100644
--- a/builds/make.type.m82
+++ b/builds/make.type.m82
@@ -2,7 +2,9 @@
 #-- separated output flag so that it can be overridden in target-specific
 # for make check
-OUTPUT ?= -DOUTPUT -DHDF5 -DSLICES -DPROJECTION -DROTATED_PROJECTION
+# OUTPUT ?= -DOUTPUT -DHDF5 -DSLICES -DPROJECTION -DROTATED_PROJECTION
+
+OUTPUT ?= -DOUTPUT -DHDF5 -DSLICES

 MPI_GPU ?=

@@ -19,10 +21,14 @@ DFLAGS += -DSTATIC_GRAV

 # EVOLVE SCALAR (INJECTED BY SUPERNOVA)
 DFLAGS += -DSCALAR
-DFLAGS += -DBASIC_SCALAR
+# DFLAGS += -DBASIC_SCALAR
+
+DFLAGS += -DDUST
+# DFLAGS += -DOUTFLOW_ANALYSIS
+DFLAGS += -DSCALAR_FLOOR

 # MW_MODEL vs m82 (default)
-DFLAGS += -DMW_MODEL
+# DFLAGS += -DMW_MODEL

 ifeq ($(findstring cosmology,$(TYPE)),cosmology)
 DFLAGS += -DSIMPLE
diff --git a/builds/make.type.particles b/builds/make.type.particles
index f718f975b..2951f628f 100644
--- a/builds/make.type.particles
+++ b/builds/make.type.particles
@@ -11,11 +11,12 @@ DFLAGS += -DPARTICLES_GPU

 #Solve only an N-Body Simulation (No Hydro, DM Only simulation)
-# DFLAGS += -DONLY_PARTICLES
+DFLAGS += -DONLY_PARTICLES

 # Track Particle IDs and write them to the output files
 DFLAGS += -DPARTICLE_IDS

+DFLAGS += -DCLOUD_TRACKING

 # Track Particle Ages ( Stellar Populations )
diff --git a/src/analysis/feedback_analysis.cpp b/src/analysis/feedback_analysis.cpp
index 3dab7b6da..5fe0e7543 100644
--- a/src/analysis/feedback_analysis.cpp
+++ b/src/analysis/feedback_analysis.cpp
@@ -87,7 +87,7 @@ void FeedbackAnalysis::Compute_Gas_Velocity_Dispersion(Grid3D& G)
 #ifdef MPI_CHOLLA
   MPI_Allreduce(&partial_mass, &total_mass, 1, MPI_CHREAL, MPI_SUM, world);
 #else
-  total_mass = partial_mass;
+  total_mass   = partial_mass;
 #endif

   for (k = G.H.n_ghost; k < G.H.nz - G.H.n_ghost; k++) {
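// A minimal sketch of the compile-time gating pattern the make.type.* files
// above rely on: each -D macro appended to DFLAGS switches a code path on at
// build time. CLOUD_TRACKING and OUTFLOW_ANALYSIS are the real macros added
// above; the printf bodies are illustrative stand-ins for the real hooks.
#include <cstdio>

int main()
{
#ifdef CLOUD_TRACKING
  std::printf("frame-shift cloud tracking compiled in\n");  // only with -DCLOUD_TRACKING
#endif
#ifdef OUTFLOW_ANALYSIS
  std::printf("outflow/mass analysis compiled in\n");  // only with -DOUTFLOW_ANALYSIS
#endif
  std::printf("base hydro always compiled\n");
  return 0;
}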
diff --git a/src/analysis/outflow_analysis.cu b/src/analysis/outflow_analysis.cu
new file mode 100644
index 000000000..18ea297e8
--- /dev/null
+++ b/src/analysis/outflow_analysis.cu
@@ -0,0 +1,241 @@
+#ifdef DUST
+  #ifdef OUTFLOW_ANALYSIS
+    // STL includes
+    #include
+    #include
+
+    #include
+    #include
+    #include
+
+    #include "../analysis/outflow_analysis.h"
+    #include "../grid/grid_enum.h"
+    #include "../utils/DeviceVector.h"
+    #include "../utils/cuda_utilities.h"
+    #include "../utils/hydro_utilities.h"
+    #include "../utils/reduction_utilities.h"
+
+void Outflow_Analysis(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz, int n_ghost,
+                      int n_fields, Real *mass_cloud, Real *mass_dust, Real density_cloud_init)
+{
+  cuda_utilities::AutomaticLaunchParams static const launchParams(Outflow_Analysis_Kernel);
+
+  cuda_utilities::DeviceVector<Real> dev_mass_cloud(1);
+  cuda_utilities::DeviceVector<Real> dev_mass_dust(1);
+
+  hipLaunchKernelGGL(Outflow_Analysis_Kernel, launchParams.get_numBlocks(), launchParams.get_threadsPerBlock(), 0, 0,
+                     dev_conserved, nx, ny, nz, dx, dy, dz, n_ghost, n_fields, dev_mass_cloud.data(),
+                     dev_mass_dust.data(), density_cloud_init);
+  cudaDeviceSynchronize();
+
+  *mass_cloud = dev_mass_cloud[0];
+  *mass_dust  = dev_mass_dust[0];
+}
+
+__global__ void Outflow_Analysis_Kernel(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz,
+                                        int n_ghost, int n_fields, Real *mass_cloud, Real *mass_dust,
+                                        Real density_cloud_init)
+{
+  int xid, yid, zid, n_cells;
+  n_cells = nx * ny * nz;
+
+  Real density_gas;
+  Real mass_cloud_stride = 0.0;
+  Real density_dust;
+  Real mass_dust_stride = 0.0;
+
+  for (size_t id = threadIdx.x + blockIdx.x * blockDim.x; id < n_cells; id += blockDim.x * gridDim.x) {
+    cuda_utilities::compute3DIndices(id, nx, ny, xid, yid, zid);
+    // grid cells
+    if (xid > n_ghost - 1 && xid < nx - n_ghost && yid > n_ghost - 1 && yid < ny - n_ghost && zid > n_ghost - 1 &&
+        zid < nz - n_ghost) {
+      density_gas  = dev_conserved[id + n_cells * grid_enum::density];
+      density_dust = dev_conserved[id + n_cells * grid_enum::dust_density];
+
+      mass_dust_stride += density_dust * dx * dy * dz;
+
+      if ((density_gas * DENSITY_UNIT) >= (density_cloud_init / 30)) {
+        mass_cloud_stride += density_gas * dx * dy * dz;
+      }
+    }
+  }
+
+  __syncthreads();
+
+  reduction_utilities::Grid_Reduce_Add(mass_cloud_stride, mass_cloud);
+  reduction_utilities::Grid_Reduce_Add(mass_dust_stride, mass_dust);
+
+  /*
+  // int id, xid, yid, zid, gid;
+
+  // calculate ghost cell ID and i,j,k in GPU grid
+  int id = threadIdx.x + blockIdx.x * blockDim.x;
+
+  int isize, jsize;
+
+  // +x boundary first
+  isize = n_ghost;
+  jsize = ny;
+
+  // not true i,j,k but relative i,j,k in the GPU grid
+  zid = id / (isize * jsize);
+  yid = (id - zid * isize * jsize) / isize;
+  xid = id - zid * isize * jsize - yid * isize;
+
+  // map thread id to ghost cell id
+  xid += nx - n_ghost;  // +x boundary
+  int gid = xid + yid * nx + zid * nx * ny;
+
+  // boundary cells
+  // -x boundary
+  if (xid == n_ghost && xid < nx && yid < ny && zid < nz) {
+    velocity_x  = dev_conserved[gid + n_cells * grid_enum::momentum_x] / density_gas;
+    density_gas = dev_conserved[gid + n_cells * grid_enum::density];
+  #ifdef DUST
+    density_dust = dev_conserved[gid + n_cells * grid_enum::dust_density];
+  #endif
+    if (velocity_x < 0) {
+      if ((density_gas * DENSITY_UNIT) >= (density_cloud_init / 3)) {
+        printf("-x %e\n", density_gas * DENSITY_UNIT);
+        mass_cloud_bndry_stride += density_gas * dx * dy * dz;
+        rate_cloud_stride += density_gas * std::abs(velocity_x) * dx * dx;
+      }
+  #ifdef DUST
+      mass_dust_bndry_stride += density_dust * dx * dy * dz;
+      rate_dust_stride += density_dust * std::abs(velocity_x) * dx * dx;
+  #endif  // DUST
+    }
+  }
+  // +x boundary
+  if (xid == nx - n_ghost && xid < nx && yid < ny && zid < nz) {
+    velocity_x  = dev_conserved[gid + n_cells * grid_enum::momentum_x] / density_gas;
+    density_gas = dev_conserved[gid + n_cells * grid_enum::density];
+  #ifdef DUST
+    density_dust = dev_conserved[gid + n_cells * grid_enum::dust_density];
+  #endif
+    if (velocity_x > 0) {
+      if ((density_gas * DENSITY_UNIT) >= (density_cloud_init / 3)) {
+        printf("+x %e\n", density_gas * DENSITY_UNIT);
+        mass_cloud_bndry_stride += density_gas * dx * dy * dz;
+        rate_cloud_stride += density_gas * std::abs(velocity_x) * dx * dx;
+      }
+  #ifdef DUST
+      mass_dust_bndry_stride += density_dust * dx * dy * dz;
+      rate_dust_stride += density_dust * std::abs(velocity_x) * dx * dx;
+  #endif  // DUST
+    }
+  }
+
+  // +y boundary next
+  isize = nx;
+  jsize = n_ghost;
+  // ksize = nz;
+
+  // not true i,j,k but relative i,j,k
+  zid = id / (isize * jsize);
+  yid = (id - zid * isize * jsize) / isize;
+  xid = id - zid * isize * jsize - yid * isize;
+
+  // map thread id to ghost cell id
+  yid += ny - n_ghost;
+  gid = xid + yid * nx + zid * nx * ny;
+
+  // -y boundary
+  if (xid < nx && yid == n_ghost && yid < ny && zid < nz) {
+    velocity_y  = dev_conserved[gid + n_cells * grid_enum::momentum_y] / density_gas;
+    density_gas = dev_conserved[gid + n_cells * grid_enum::density];
+  #ifdef DUST
+    density_dust = dev_conserved[gid + n_cells * grid_enum::dust_density];
+  #endif
+    if (velocity_y < 0) {
+      if ((density_gas * DENSITY_UNIT) >= (density_cloud_init / 3)) {
+        // printf("-y cloud %e\n", density_gas * DENSITY_UNIT);
+        printf("yid: %d\n", yid);
+        mass_cloud_bndry_stride += density_gas * dx * dy * dz;
+        rate_cloud_stride += density_gas * std::abs(velocity_y) * dx * dx;
+      }
+  #ifdef DUST
+      // printf("-y dust %e\n", density_dust * DENSITY_UNIT);
+      mass_dust_bndry_stride += density_dust * dx * dy * dz;
+      rate_dust_stride += density_dust * std::abs(velocity_y) * dx * dx;
+  #endif  // DUST
+    }
+  }
+  // +y boundary
+  if (xid < nx && yid == ny - n_ghost && yid < ny && zid < nz) {
+    velocity_y  = dev_conserved[gid + n_cells * grid_enum::momentum_y] / density_gas;
+    density_gas = dev_conserved[gid + n_cells * grid_enum::density];
+  #ifdef DUST
+    density_dust = dev_conserved[gid + n_cells * grid_enum::dust_density];
+  #endif
+    if (velocity_y > 0) {
+      if ((density_gas * DENSITY_UNIT) >= (density_cloud_init / 3)) {
+        // printf("+y %e\n", density_gas * DENSITY_UNIT);
+        printf("yid: %d\n", yid);
+        printf("yid: %d\n", ny_real);
+        mass_cloud_bndry_stride += density_gas * dx * dy * dz;
+        rate_cloud_stride += density_gas * std::abs(velocity_y) * dx * dx;
+      }
+  #ifdef DUST
+      mass_dust_bndry_stride += density_dust * dx * dy * dz;
+      rate_dust_stride += density_dust * std::abs(velocity_y) * dx * dx;
+  #endif  // DUST
+    }
+  }
+
+  isize = nx;
+  jsize = ny;
+  // ksize = n_ghost;
+
+  // not true i,j,k but relative i,j,k
+  zid = id / (isize * jsize);
+  yid = (id - zid * isize * jsize) / isize;
+  xid = id - zid * isize * jsize - yid * isize;
+
+  // map thread id to ghost cell id
+  zid += nz - n_ghost;
+  gid = xid + yid * nx + zid * nx * ny;
+
+  // -z boundary
+  if (xid < nx && yid < ny && zid == n_ghost && zid < nz) {
+    velocity_z  = dev_conserved[gid + n_cells * grid_enum::momentum_z] / density_gas;
+    density_gas = dev_conserved[gid + n_cells * grid_enum::density];
+  #ifdef DUST
+    density_dust = dev_conserved[gid + n_cells * grid_enum::dust_density];
+  #endif
+    if (velocity_z < 0) {
+      if ((density_gas * DENSITY_UNIT) >= (density_cloud_init / 3)) {
+        printf("-z %e\n", density_gas * DENSITY_UNIT);
+        mass_cloud_bndry_stride += density_gas * dx * dy * dz;
+        rate_cloud_stride += density_gas * std::abs(velocity_z) * dx * dx;
+      }
+  #ifdef DUST
+      mass_dust_bndry_stride += density_dust * dx * dy * dz;
+      rate_dust_stride += density_dust * std::abs(velocity_z) * dx * dx;
+  #endif  // DUST
+    }
+  }
+  // +z boundary
+  if (xid < nx && yid < ny && zid == nz - n_ghost && zid < nz) {
+    velocity_z  = dev_conserved[gid + n_cells * grid_enum::momentum_z] / density_gas;
+    density_gas = dev_conserved[gid + n_cells * grid_enum::density];
+  #ifdef DUST
+    density_dust = dev_conserved[gid + n_cells * grid_enum::dust_density];
+  #endif
+    if (velocity_z > 0) {
+      if ((density_gas * DENSITY_UNIT) >= (density_cloud_init / 3)) {
+        printf("+z %e\n", density_gas * DENSITY_UNIT);
+        mass_cloud_bndry_stride += density_gas * dx * dy * dz;
+        rate_cloud_stride += density_gas * std::abs(velocity_z) * dx * dx;
+      }
+  #ifdef DUST
+      mass_dust_bndry_stride += density_dust * dx * dy * dz;
+      rate_dust_stride += density_dust * std::abs(velocity_z) * dx * dx;
+  #endif  // DUST
+    }
+  }
+  */
+}
+
+  #endif  // OUTFLOW_ANALYSIS
+#endif  // DUST
diff --git a/src/analysis/outflow_analysis.h b/src/analysis/outflow_analysis.h
new file mode 100644
index 000000000..1f945d4f5
--- /dev/null
+++ b/src/analysis/outflow_analysis.h
@@ -0,0 +1,15 @@
+#ifdef OUTFLOW_ANALYSIS
+  #ifndef OUTFLOW_ANALYSIS_CUDA_H
+    #define OUTFLOW_ANALYSIS_CUDA_H
+
+    #include "../global/global.h"
+    #include "../utils/gpu.hpp"
+
+void Outflow_Analysis(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz, int n_ghost,
+                      int n_fields, Real *mass_cloud, Real *mass_dust, Real density_cloud_init);
+
+__global__ void Outflow_Analysis_Kernel(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz,
+                                        int n_ghost, int n_fields, Real *mass_cloud, Real *mass_dust,
+                                        Real density_cloud_init);
+  #endif  // OUTFLOW_ANALYSIS_CUDA_H
+#endif  // OUTFLOW_ANALYSIS
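// A self-contained sketch of the pattern Outflow_Analysis_Kernel uses: a
// grid-stride loop accumulates a per-thread partial sum, then the partials are
// combined into one global total. Assumptions: a plain atomicAdd stands in for
// Cholla's reduction_utilities block/grid reduction (same result, simpler), and
// double atomicAdd needs sm_60+, which the Makefile's sm_70 default satisfies.
#include <cstdio>

#include <cuda_runtime.h>

__global__ void Mass_Sum_Kernel(const double *density, int n_cells, double cell_volume, double *total_mass)
{
  double partial = 0.0;
  // grid-stride loop: covers all cells for any launch size
  for (int id = threadIdx.x + blockIdx.x * blockDim.x; id < n_cells; id += blockDim.x * gridDim.x) {
    partial += density[id] * cell_volume;
  }
  atomicAdd(total_mass, partial);  // combine per-thread partials into the global total
}

int main()
{
  int const n_cells = 1 << 20;
  double *density, *total;
  cudaMallocManaged(&density, n_cells * sizeof(double));
  cudaMallocManaged(&total, sizeof(double));
  for (int i = 0; i < n_cells; i++) {
    density[i] = 1.0;
  }
  *total = 0.0;
  Mass_Sum_Kernel<<<256, 256>>>(density, n_cells, 1.0, total);
  cudaDeviceSynchronize();
  std::printf("total mass = %.0f (expect %d)\n", *total, n_cells);
  cudaFree(density);
  cudaFree(total);
  return 0;
}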
diff --git a/src/chemistry_gpu/chemistry_functions.cpp b/src/chemistry_gpu/chemistry_functions.cpp
index 7999a6d55..65e3af691 100644
--- a/src/chemistry_gpu/chemistry_functions.cpp
+++ b/src/chemistry_gpu/chemistry_functions.cpp
@@ -228,7 +228,7 @@ void Grid3D::Update_Chemistry()
 #ifdef COSMOLOGY
   Chem.H.current_z = Cosmo.current_z;
 #else
-  Chem.H.current_z = 0;
+  Chem.H.current_z   = 0;
 #endif

   Do_Chemistry_Update(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, H.dt, Chem.H);
diff --git a/src/cloud_tracking/cloud_tracking.cu b/src/cloud_tracking/cloud_tracking.cu
new file mode 100644
index 000000000..424fafb3a
--- /dev/null
+++ b/src/cloud_tracking/cloud_tracking.cu
@@ -0,0 +1,125 @@
+#ifdef CLOUD_TRACKING
+
+  // STL includes
+  #include
+
+  #include
+  #include
+  #include
+
+  #include "../cloud_tracking/cloud_tracking.h"
+  #include "../global/global.h"
+  #include "../global/global_cuda.h"
+  #include "../grid/grid3D.h"
+  #include "../grid/grid_enum.h"
+  #include "../utils/DeviceVector.h"
+  #include "../utils/cuda_utilities.h"
+  #include "../utils/gpu.hpp"
+  #include "../utils/hydro_utilities.h"
+  #include "../utils/reduction_utilities.h"
+
+void Cloud_Velocity_Reduction(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz, int n_ghost,
+                              int n_fields, Real density_cloud_init, Real density_wind_init, Real *mass_cloud,
+                              Real *integrand_cloud)
+{
+  cuda_utilities::AutomaticLaunchParams static const launchParams(Cloud_Reduction_Kernel);
+
+  cuda_utilities::DeviceVector<Real> dev_mass_cloud(1, true);
+  cuda_utilities::DeviceVector<Real> dev_integrand_cloud(1, true);
+
+  // Initialize host vectors to copy results to
+  std::vector<Real> host_mass_cloud{0};
+  std::vector<Real> host_integrand_cloud{0};
+
+  // .data() gets device vector pointers
+  hipLaunchKernelGGL(Cloud_Reduction_Kernel, launchParams.numBlocks, launchParams.threadsPerBlock, 0, 0, dev_conserved,
+                     nx, ny, nz, dx, dy, dz, n_ghost, n_fields, density_cloud_init, density_wind_init,
+                     dev_mass_cloud.data(), dev_integrand_cloud.data());
+  cudaDeviceSynchronize();
+
+  // Copy result of reductions from device to host
+  dev_mass_cloud.cpyDeviceToHost(host_mass_cloud);
+  dev_integrand_cloud.cpyDeviceToHost(host_integrand_cloud);
+
+  *mass_cloud      = host_mass_cloud[0];
+  *integrand_cloud = host_integrand_cloud[0];
+}
+
+void Update_Grid_Frame(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields,
+                       Real velocity_x_cloud_avg)
+{
+  cuda_utilities::AutomaticLaunchParams static const launchParams(Frame_Shift_Kernel);
+
+  hipLaunchKernelGGL(Frame_Shift_Kernel, launchParams.numBlocks, launchParams.threadsPerBlock, 0, 0, dev_conserved, nx,
+                     ny, nz, n_ghost, n_fields, velocity_x_cloud_avg);
+  cudaDeviceSynchronize();
+}
+
+__global__ void Cloud_Reduction_Kernel(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz,
+                                       int n_ghost, int n_fields, Real density_cloud_init, Real density_wind_init,
+                                       Real *mass_cloud, Real *integrand_cloud)
+{
+  int xid, yid, zid, n_cells;
+  n_cells = nx * ny * nz;
+
+  Real integrand_stride = 0.0;
+  Real mass_stride      = 0.0;
+
+  Real density, velocity_x, mass;
+
+  // Grid stride loop
+  for (size_t id = threadIdx.x + blockIdx.x * blockDim.x; id < n_cells; id += blockDim.x * gridDim.x) {
+    cuda_utilities::compute3DIndices(id, nx, ny, xid, yid, zid);
+
+    if (xid > n_ghost - 1 && xid < nx - n_ghost && yid > n_ghost - 1 && yid < ny - n_ghost && zid > n_ghost - 1 &&
+        zid < nz - n_ghost) {
+      density    = dev_conserved[id + n_cells * grid_enum::density];
+      velocity_x = dev_conserved[id + n_cells * grid_enum::momentum_x] / density;
+      mass       = density * dx * dy * dz;
+      // if ((density * DENSITY_UNIT) >= (pow(density_cloud_init * density_wind_init, 0.5))) {
+      if ((density * DENSITY_UNIT) >= (density_cloud_init / pow(1e3, 0.5))) {
+        mass_stride += mass;
+        // (Shin et al. (2008) eq. 9)
+        integrand_stride += velocity_x * density * dx * dy * dz;
+      }
+    }
+  }
+  __syncthreads();
+
+  reduction_utilities::Grid_Reduce_Add(mass_stride, mass_cloud);
+  reduction_utilities::Grid_Reduce_Add(integrand_stride, integrand_cloud);
+}
+
+__global__ void Frame_Shift_Kernel(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields,
+                                   Real velocity_x_cloud_avg)
+{
+  int xid, yid, zid, n_cells;
+  n_cells = nx * ny * nz;
+
+  Real density, velocity_x, velocity_y, velocity_z, energy, energy_internal;
+
+  for (size_t id = threadIdx.x + blockIdx.x * blockDim.x; id < n_cells; id += blockDim.x * gridDim.x) {
+    cuda_utilities::compute3DIndices(id, nx, ny, xid, yid, zid);
+
+    if (xid > n_ghost - 1 && xid < nx - n_ghost && yid > n_ghost - 1 && yid < ny - n_ghost && zid > n_ghost - 1 &&
+        zid < nz - n_ghost) {
+      density    = dev_conserved[id + n_cells * grid_enum::density];
+      velocity_x = dev_conserved[id + n_cells * grid_enum::momentum_x] / density;
+      velocity_y = dev_conserved[id + n_cells * grid_enum::momentum_y] / density;
+      velocity_z = dev_conserved[id + n_cells * grid_enum::momentum_z] / density;
+      energy     = dev_conserved[id + n_cells * grid_enum::Energy];
+
+      energy_internal = energy - 0.5 * density * (pow(velocity_x, 2) + pow(velocity_y, 2) + pow(velocity_z, 2));
+
+      // Apply frame of reference shift
+      dev_conserved[id + n_cells * grid_enum::momentum_x] = (velocity_x - velocity_x_cloud_avg) * density;
+      dev_conserved[id + n_cells * grid_enum::Energy] =
+          energy_internal +
+          0.5 * density * (pow(velocity_x - velocity_x_cloud_avg, 2) + pow(velocity_y, 2) + pow(velocity_z, 2));
+    }
+  }
+  __syncthreads();
+}
+
+#endif  // CLOUD_TRACKING
\ No newline at end of file
diff --git a/src/cloud_tracking/cloud_tracking.h b/src/cloud_tracking/cloud_tracking.h
new file mode 100644
index 000000000..5a563db52
--- /dev/null
+++ b/src/cloud_tracking/cloud_tracking.h
@@ -0,0 +1,23 @@
+#ifdef CLOUD_TRACKING
+  #ifndef CLOUD_TRACKING_CUDA_H
+    #define CLOUD_TRACKING_CUDA_H
+
+    #include "../global/global.h"
+    #include "../utils/gpu.hpp"
+
+void Cloud_Velocity_Reduction(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz, int n_ghost,
+                              int n_fields, Real density_cloud_init, Real density_wind_init, Real *mass_cloud,
+                              Real *integrand_cloud);
+
+__global__ void Cloud_Reduction_Kernel(Real *dev_conserved, int nx, int ny, int nz, Real dx, Real dy, Real dz,
+                                       int n_ghost, int n_fields, Real density_cloud_init, Real density_wind_init,
+                                       Real *mass_cloud, Real *integrand_cloud);
+
+void Update_Grid_Frame(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields,
+                       Real velocity_x_cloud_avg);
+
+__global__ void Frame_Shift_Kernel(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields,
+                                   Real velocity_x_cloud_avg);
+
+  #endif  // CLOUD_TRACKING_CUDA_H
+#endif  // CLOUD_TRACKING
\ No newline at end of file
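// A sketch of the transform Frame_Shift_Kernel applies, in plain C++ so the
// energy bookkeeping is easy to check: subtracting the cloud's mean x-velocity
// must leave the internal energy untouched, so the total energy is rebuilt from
// the preserved internal part plus the new kinetic part. Cell and shift_frame
// are illustrative names, not Cholla API.
#include <cstdio>

struct Cell {
  double d, mx, my, mz, E;  // conserved: density, momenta, total energy
};

void shift_frame(Cell &c, double v_shift)
{
  double const vx = c.mx / c.d, vy = c.my / c.d, vz = c.mz / c.d;
  double const e_internal = c.E - 0.5 * c.d * (vx * vx + vy * vy + vz * vz);
  double const vx_new = vx - v_shift;  // move into the frame co-moving with the cloud
  c.mx = c.d * vx_new;
  c.E  = e_internal + 0.5 * c.d * (vx_new * vx_new + vy * vy + vz * vz);
}

int main()
{
  Cell c{1.0, 2.0, 0.0, 0.0, 5.0};  // vx = 2, internal energy = 3
  shift_frame(c, 2.0);
  std::printf("mx = %.1f, E = %.1f\n", c.mx, c.E);  // mx = 0.0, E = 3.0 (kinetic term gone)
  return 0;
}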
diff --git a/src/cooling/cooling_cuda.cu b/src/cooling/cooling_cuda.cu
index b5c74ec7c..4c24b657e 100644
--- a/src/cooling/cooling_cuda.cu
+++ b/src/cooling/cooling_cuda.cu
@@ -164,7 +164,7 @@ __global__ void cooling_kernel(Real *dev_conserved, int nx, int ny, int nz, int
 #ifdef CLOUDY_COOL
     cool = Cloudy_cool(n, T, coolTexObj, heatTexObj);
 #else
-    cool = CIE_cool(n, T);
+    cool       = CIE_cool(n, T);
     // printf("%d %d %d %e %e %e\n", xid, yid, zid, n, T, cool);
 #endif

diff --git a/src/cooling_grackle/cool_grackle.cpp b/src/cooling_grackle/cool_grackle.cpp
index a7f5c36cb..c5f2a8078 100644
--- a/src/cooling_grackle/cool_grackle.cpp
+++ b/src/cooling_grackle/cool_grackle.cpp
@@ -89,7 +89,7 @@ void Cool_GK::Initialize(struct Parameters *P, Cosmology &Cosmo)
   data->metal_cooling = 1;  // metal cooling off
 #else
   chprintf("WARNING: Metal Cooling is Off. \n");
-  data->metal_cooling = 0;  // metal cooling off
+  data->metal_cooling   = 0;  // metal cooling off
 #endif

 #ifdef PARALLEL_OMP
diff --git a/src/dust/dust_cuda.cu b/src/dust/dust_cuda.cu
index 8b72facdf..f5cb6b7b7 100644
--- a/src/dust/dust_cuda.cu
+++ b/src/dust/dust_cuda.cu
@@ -9,6 +9,7 @@
 #ifdef DUST

 // STL includes
+  #include
   #include
   #include

@@ -21,24 +22,34 @@
   #include "../global/global_cuda.h"
   #include "../grid/grid3D.h"
   #include "../grid/grid_enum.h"
+  #include "../utils/DeviceVector.h"
   #include "../utils/cuda_utilities.h"
   #include "../utils/gpu.hpp"
   #include "../utils/hydro_utilities.h"
+  #include "../utils/reduction_utilities.h"

-void Dust_Update(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt, Real gamma,
-                 Real grain_radius)
+void Dust_Update(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx, Real dy, Real dz,
+                 Real dt, Real gamma, Real grain_radius, Real *mass_hot, Real *mass_mixed)
 {
   int n_cells = nx * ny * nz;
   int ngrid   = (n_cells + TPB - 1) / TPB;
   dim3 dim1dGrid(ngrid, 1, 1);
   dim3 dim1dBlock(TPB, 1, 1);
-  hipLaunchKernelGGL(Dust_Kernel, dim1dGrid, dim1dBlock, 0, 0, dev_conserved, nx, ny, nz, n_ghost, n_fields, dt, gamma,
-                     grain_radius);
+
+  cuda_utilities::DeviceVector<Real> dev_mass_mixed(1, true);
+  cuda_utilities::DeviceVector<Real> dev_mass_hot(1, true);
+
+  hipLaunchKernelGGL(Dust_Kernel, dim1dGrid, dim1dBlock, 0, 0, dev_conserved, nx, ny, nz, n_ghost, n_fields, dx, dy,
+                     dz, dt, gamma, grain_radius, dev_mass_mixed.data(), dev_mass_hot.data());
   GPU_Error_Check();
+  cudaDeviceSynchronize();
+
+  *mass_mixed = dev_mass_mixed[0];
+  *mass_hot   = dev_mass_hot[0];
 }

-__global__ void Dust_Kernel(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt,
-                            Real gamma, Real grain_radius)
+__global__ void Dust_Kernel(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx, Real dy,
+                            Real dz, Real dt, Real gamma, Real grain_radius, Real *mass_hot, Real *mass_mixed)
 {
   // get grid indices
   int n_cells = nx * ny * nz;
@@ -54,11 +65,13 @@ __global__ void Dust_Kernel(Real *dev_conserved, int nx, int ny, int nz, int n_g
   // define physics variables
   Real density_gas, density_dust;  // fluid mass densities
   Real number_density;             // gas number density
-  Real mu = 0.6;                   // mean molecular weight
+  Real mu              = 0.6;      // mean molecular weight
+  Real sputtered_hot   = 0;
+  Real sputtered_mixed = 0;  // mixed and hot-phase sputtered dust masses

   // define integration variables
   Real dd_dt;          // instantaneous rate of change in dust density
-  Real dd;             // change in dust density at current timestep
+  Real dd     = 0;     // change in dust density at current timestep
   Real dd_max = 0.01;  // allowable percentage of dust density increase
   Real dt_sub;         // refined timestep

@@ -110,8 +123,18 @@ __global__ void Dust_Kernel(Real *dev_conserved, int nx, int ny, int nz, int n_g
     // update dust density
     density_dust += dd;

+    if (temperature >= 1e6) {
+      sputtered_hot += fabs(dd * dx * dy * dz);
+    } else {
+      sputtered_mixed += fabs(dd * dx * dy * dz);
+    }
+
     dev_conserved[id + n_cells * grid_enum::dust_density] = density_dust;
   }
+  __syncthreads();
+
+  reduction_utilities::Grid_Reduce_Add(sputtered_hot, mass_hot);
+  reduction_utilities::Grid_Reduce_Add(sputtered_mixed, mass_mixed);
 }

 // McKinnon et al. (2017) sputtering timescale
diff --git a/src/dust/dust_cuda.h b/src/dust/dust_cuda.h
index 212901e8a..3b52cf96f 100644
--- a/src/dust/dust_cuda.h
+++ b/src/dust/dust_cuda.h
@@ -27,8 +27,8 @@
  * \param[in] dt Simulation timestep
  * \param[in] gamma Specific heat ratio
  */
-void Dust_Update(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt, Real gamma,
-                 Real grain_radius);
+void Dust_Update(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx, Real dy, Real dz,
+                 Real dt, Real gamma, Real grain_radius, Real *mass_hot, Real *mass_mixed);

 /*!
  * \brief Compute the change in dust density for a cell and update its value in dev_conserved.
@@ -43,8 +43,8 @@
  * \param[in] dt Simulation timestep
  * \param[in] gamma Specific heat ratio
  */
-__global__ void Dust_Kernel(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt,
-                            Real gamma, Real grain_radius);
+__global__ void Dust_Kernel(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dx, Real dy,
+                            Real dz, Real dt, Real gamma, Real grain_radius, Real *mass_hot, Real *mass_mixed);

 /*!
  * \brief Compute the sputtering timescale based on a cell's density and temperature.
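// A sketch of the McKinnon et al. (2017) thermal sputtering model the updated
// dust kernel integrates: tau_sp = 0.17 Gyr * (a / 0.1 um) * (1e-27 g cm^-3 / rho)
// * ((T_0 / T)^omega + 1), with T_0 = 2e6 K and omega = 2.5, and the dust density
// decaying as d(rho_dust)/dt = -rho_dust / (tau_sp / 3). Constants are the
// published fit values; the solver's exact normalization lives in dust_cuda.cu
// and should be taken from there.
#include <cmath>
#include <cstdio>

double tau_sputtering_gyr(double grain_radius, double density_gas, double temperature)
{
  // grain_radius in units of 0.1 um, density_gas in g cm^-3, temperature in K
  double const T_0 = 2.0e6, omega = 2.5, rho_ref = 1.0e-27;
  return 0.17 * grain_radius * (rho_ref / density_gas) * (std::pow(T_0 / temperature, omega) + 1.0);
}

int main()
{
  double const mu = 0.6, MP = 1.672622e-24;  // mean molecular weight, proton mass [g]
  double const n = 1e-2, T = 1e7;            // hot, dilute wind gas
  double const rho = n * mu * MP;            // mass density [g cm^-3]
  double const tau_sp = tau_sputtering_gyr(1.0, rho, T);

  double rho_dust = 1.0, dt = 1e-3;                // arbitrary density units, Gyr
  rho_dust += dt * (-rho_dust / (tau_sp / 3.0));   // one explicit update step
  std::printf("tau_sp = %e Gyr, rho_dust = %f\n", tau_sp, rho_dust);
  return 0;
}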
diff --git a/src/global/global.cpp b/src/global/global.cpp
index 39d4fd178..cabf95b0e 100644
--- a/src/global/global.cpp
+++ b/src/global/global.cpp
@@ -101,10 +101,27 @@ char *Trim(char *s)
 }

 // NOLINTNEXTLINE(cert-err58-cpp)
-const std::set<const char *> optionalParams = {
-    "flag_delta",   "ddelta_dt",   "n_delta",  "Lz",      "Lx",      "phi",           "theta",
-    "delta",        "nzr",         "nxr",      "H0",      "Omega_M", "Omega_L",       "Init_redshift",
-    "End_redshift", "tile_length", "n_proc_x", "n_proc_y", "n_proc_z"};
+const std::set<const char *> optionalParams = {"flag_delta",
+                                               "ddelta_dt",
+                                               "n_delta",
+                                               "Lz",
+                                               "Lx",
+                                               "phi",
+                                               "theta",
+                                               "delta",
+                                               "nzr",
+                                               "nxr",
+                                               "H0",
+                                               "Omega_M",
+                                               "Omega_L",
+                                               "Init_redshift",
+                                               "End_redshift",
+                                               "tile_length",
+                                               "n_proc_x",
+                                               "n_proc_y",
+                                               "n_proc_z",
+                                               "density_cloud_init",
+                                               "density_wind_init"};

 /*! \fn int Is_Param_Valid(char *name);
  * \brief Verifies that a param is valid (even if not needed). Avoids
@@ -495,10 +512,19 @@ void Parse_Param(char *name, char *value, struct Parameters *parms)
   } else if (strcmp(name, "supernova_rcl") == 0) {
     parms->supernova_rcl = atof(value);
 #endif
+#ifdef CLOUD_TRACKING
+  } else if (strcmp(name, "density_wind_init") == 0) {
+    parms->density_wind_init = atof(value);
+#endif
+#if defined(OUTFLOW_ANALYSIS) || defined(CLOUD_TRACKING)
+  } else if (strcmp(name, "density_cloud_init") == 0) {
+    parms->density_cloud_init = atof(value);
+#endif
 #ifdef SCALAR
   #ifdef DUST
   } else if (strcmp(name, "grain_radius") == 0) {
-    parms->grain_radius = atoi(value);
+    parms->grain_radius = atof(value);
+    chprintf("Grain radius: %e\n", parms->grain_radius);
   #endif
 #endif
   } else if (!Is_Param_Valid(name)) {
diff --git a/src/global/global.h b/src/global/global.h
index 74aa5c718..28e3a3672 100644
--- a/src/global/global.h
+++ b/src/global/global.h
@@ -309,6 +309,12 @@ struct Parameters {
   char scale_outputs_file[MAXLEN];  // File for the scale_factor output values
                                     // for cosmological simulations
 #endif  // COSMOLOGY
+#ifdef CLOUD_TRACKING
+  Real density_wind_init;
+#endif
+#if defined(CLOUD_TRACKING) || defined(OUTFLOW_ANALYSIS)
+  Real density_cloud_init;
+#endif
 #ifdef TILED_INITIAL_CONDITIONS
   Real tile_length;
 #endif  // TILED_INITIAL_CONDITIONS
diff --git a/src/gravity/gravity_functions.cpp b/src/gravity/gravity_functions.cpp
index 744f55825..8ed5fb590 100644
--- a/src/gravity/gravity_functions.cpp
+++ b/src/gravity/gravity_functions.cpp
@@ -134,7 +134,7 @@ void Grid3D::set_dt_Gravity()
     dt_particles = Calc_Particles_dt();
     dt_particles = fmin(dt_particles, Particles.max_dt);
 #ifdef ONLY_PARTICLES
-    dt_min = dt_particles;
+    dt_min       = dt_particles;
     chprintf(" dt_particles: %f \n", dt_particles);
 #else
     chprintf(" dt_hydro: %f   dt_particles: %f \n", dt_hydro, dt_particles);
@@ -208,7 +208,7 @@ Real Grav3D::Get_Average_Density()
 #ifdef MPI_CHOLLA
   dens_avrg_all = ReduceRealAvg(dens_mean);
 #else
-  dens_avrg_all = dens_mean;
+  dens_avrg_all    = dens_mean;
 #endif

   dens_avrg = dens_avrg_all;
@@ -527,8 +527,8 @@ void Grid3D::Compute_Gravitational_Potential(struct Parameters *P)
   input_density    = Grav.F.density_d;
   output_potential = Grav.F.potential_d;
 #else
-  input_density = Grav.F.density_h;
-  output_potential = Grav.F.potential_h;
+  input_density    = Grav.F.density_h;
+  output_potential = Grav.F.potential_h;
 #endif

 #ifdef SOR
diff --git a/src/gravity/gravity_functions_gpu.cu b/src/gravity/gravity_functions_gpu.cu
index b92d19084..15de64a95 100644
--- a/src/gravity/gravity_functions_gpu.cu
+++ b/src/gravity/gravity_functions_gpu.cu
@@ -127,7 +127,7 @@ void Grid3D::Copy_Hydro_Density_to_Gravity_GPU()
 #ifdef COSMOLOGY
   cosmo_rho_0_gas = Cosmo.rho_0_gas;
 #else
-  cosmo_rho_0_gas = 1.0;
+  cosmo_rho_0_gas  = 1.0;
 #endif

   // Copy the density from the device array to the Poisson input density array
@@ -261,7 +261,7 @@ void Grid3D::Extrapolate_Grav_Potential_GPU()
 #ifdef COSMOLOGY
   cosmo_factor = Cosmo.current_a * Cosmo.current_a / Cosmo.phi_0_gas;
 #else
-  cosmo_factor = 1.0;
+  cosmo_factor  = 1.0;
 #endif

   // set values for GPU kernels
diff --git a/src/grid/boundary_conditions.cpp b/src/grid/boundary_conditions.cpp
index d9201fe8b..d7a1be2bc 100644
--- a/src/grid/boundary_conditions.cpp
+++ b/src/grid/boundary_conditions.cpp
@@ -534,8 +534,13 @@ void Grid3D::Wind_Boundary()
   z_off = nz_local_start;
 #endif

+#ifndef CLOUD_TRACKING
   Wind_Boundary_CUDA(C.device, H.nx, H.ny, H.nz, H.n_cells, H.n_ghost, x_off, y_off, z_off, H.dx, H.dy, H.dz, H.xbound,
-                     H.ybound, H.zbound, gama, H.t);
+                     H.ybound, H.zbound, gama, H.t, 0.0, 0.0);
+#else  /*CLOUD_TRACKING*/
+  Wind_Boundary_CUDA(C.device, H.nx, H.ny, H.nz, H.n_cells, H.n_ghost, x_off, y_off, z_off, H.dx, H.dy, H.dz, H.xbound,
+                     H.ybound, H.zbound, gama, H.t, H.velocity_x_cloud_avg, H.density_wind_init);
+#endif /*CLOUD_TRACKING*/
 }

 /*! \fn void Noh_Boundary()
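// A stripped-down sketch of the path the two new runtime parameters take: a
// "name=value" line from the parameter file is matched with strcmp in
// Parse_Param and stored on struct Parameters with atof. Only the two names
// this patch registers are handled here; everything else is illustrative.
#include <cstdio>
#include <cstdlib>
#include <cstring>

struct Parameters {
  double density_cloud_init;  // cgs, per this patch's usage (divided by DENSITY_UNIT later)
  double density_wind_init;
};

void parse_param(const char *name, const char *value, Parameters *parms)
{
  if (strcmp(name, "density_cloud_init") == 0) {
    parms->density_cloud_init = atof(value);
  } else if (strcmp(name, "density_wind_init") == 0) {
    parms->density_wind_init = atof(value);
  }
}

int main()
{
  Parameters p{};
  parse_param("density_cloud_init", "1.0e-24", &p);
  parse_param("density_wind_init", "1.0e-26", &p);
  std::printf("cloud: %e, wind: %e\n", p.density_cloud_init, p.density_wind_init);
  return 0;
}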
diff --git a/src/grid/cuda_boundaries.cu b/src/grid/cuda_boundaries.cu
index baf846d3c..4429b317f 100644
--- a/src/grid/cuda_boundaries.cu
+++ b/src/grid/cuda_boundaries.cu
@@ -304,23 +304,26 @@ __device__ int FindIndex(int ig, int nx, int flag, int face, int n_ghost, Real *
 __global__ void Wind_Boundary_kernel(Real *c_device, int nx, int ny, int nz, int n_cells, int n_ghost, int x_off,
                                      int y_off, int z_off, Real dx, Real dy, Real dz, Real xbound, Real ybound,
-                                     Real zbound, Real gamma, Real t)
+                                     Real zbound, Real gamma, Real t, Real velocity_x_cloud_avg,
+                                     Real density_wind_init)
 {
   int id, xid, yid, zid, gid;
-  Real n_0, T_0;
   Real mu = 0.6;
-  Real vx, vy, vz, d_0, P_0;

-  n_0 = 1e-2;  // same value as n_bg in cloud initial condition function (cm^-3)
-  T_0 = 3e6;   // same value as T_bg in cloud initial condition function (K)
+  Real density, velocity_x, velocity_y, velocity_z, pressure, number_density;
+
+  number_density   = 1e-2;  // same value as n_bg in cloud initial condition function (cm^-3)
+  Real temperature = 3e7;   // same value as T_bg in cloud initial condition function (K)

   // same values as rho_bg and p_bg in cloud initial condition function
-  d_0 = n_0 * mu * MP / DENSITY_UNIT;
-  P_0 = n_0 * KB * T_0 / PRESSURE_UNIT;
+  density  = number_density * mu * MP / DENSITY_UNIT;
+  pressure = number_density * KB * temperature / PRESSURE_UNIT;
+#ifdef CLOUD_TRACKING
+  density = density_wind_init / DENSITY_UNIT;
+#endif

-  vx = 100 * TIME_UNIT / KPC;  // km/s * (cholla unit conversion)
-  vy = 0.0;
-  vz = 0.0;
+  velocity_x = 1000 * TIME_UNIT / KPC;  // km/s * (cholla unit conversion)
+  velocity_y = 0.0;
+  velocity_z = 0.0;

   // calculate ghost cell ID and i,j,k in GPU grid
   id = threadIdx.x + blockIdx.x * blockDim.x;
@@ -332,13 +335,29 @@ __global__ void Wind_Boundary_kernel(Real *c_device, int nx, int ny, int nz, int
   xid += 0;  // -x boundary
   gid = xid + yid * nx + zid * nx * ny;

-  if (xid <= n_ghost && xid < nx && yid < ny && zid < nz) {
+  if (xid < n_ghost && xid < nx && yid < ny && zid < nz) {
     // set conserved variables
-    c_device[gid] = d_0;
-    c_device[gid + 1 * n_cells] = vx * d_0;
-    c_device[gid + 2 * n_cells] = vy * d_0;
-    c_device[gid + 3 * n_cells] = vz * d_0;
-    c_device[gid + 4 * n_cells] = P_0 / (gamma - 1.0) + 0.5 * d_0 * (vx * vx + vy * vy + vz * vz);
+#ifndef CLOUD_TRACKING
+    c_device[gid + n_cells * grid_enum::density]    = density;
+    c_device[gid + n_cells * grid_enum::momentum_x] = velocity_x * density;
+    c_device[gid + n_cells * grid_enum::momentum_y] = velocity_y * density;
+    c_device[gid + n_cells * grid_enum::momentum_z] = velocity_z * density;
+    c_device[gid + n_cells * grid_enum::Energy] =
+        pressure / (gamma - 1.0) +
+        0.5 * density * (velocity_x * velocity_x + velocity_y * velocity_y + velocity_z * velocity_z);
+#else   // CLOUD_TRACKING: apply the frame shift to the inflowing wind
+    c_device[gid + n_cells * grid_enum::density]    = density;
+    c_device[gid + n_cells * grid_enum::momentum_x] = (velocity_x - velocity_x_cloud_avg) * density;
+    c_device[gid + n_cells * grid_enum::momentum_y] = velocity_y * density;
+    c_device[gid + n_cells * grid_enum::momentum_z] = velocity_z * density;
+    c_device[gid + n_cells * grid_enum::Energy] =
+        pressure / (gamma - 1.0) +
+        0.5 * density * (pow(velocity_x - velocity_x_cloud_avg, 2) + pow(velocity_y, 2) + pow(velocity_z, 2));
+#endif  // CLOUD_TRACKING
+#ifdef SCALAR
+  #ifdef DUST
+    c_device[gid + n_cells * grid_enum::dust_density] = 0.0;
+  #endif
+#endif
   }
   __syncthreads();
 }
@@ -514,7 +533,8 @@ __global__ void Noh_Boundary_kernel(Real *c_device, int nx, int ny, int nz, int
 }

 void Wind_Boundary_CUDA(Real *c_device, int nx, int ny, int nz, int n_cells, int n_ghost, int x_off, int y_off,
-                        int z_off, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound, Real gamma,
-                        Real t)
+                        int z_off, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound, Real gamma, Real t,
+                        Real velocity_x_cloud_avg, Real density_wind_init)
 {
   // determine the size of the grid to launch
   // need at least as many threads as the largest boundary face
@@ -529,7 +549,8 @@ void Wind_Boundary_CUDA(Real *c_device, int nx, int ny, int nz, int n_cells, int
   // launch the boundary kernel
   hipLaunchKernelGGL(Wind_Boundary_kernel, dim1dGrid, dim1dBlock, 0, 0, c_device, nx, ny, nz, n_cells, n_ghost, x_off,
-                     y_off, z_off, dx, dy, dz, xbound, ybound, zbound, gamma, t);
+                     y_off, z_off, dx, dy, dz, xbound, ybound, zbound, gamma, t, velocity_x_cloud_avg,
+                     density_wind_init);
 }

 void Noh_Boundary_CUDA(Real *c_device, int nx, int ny, int nz, int n_cells, int n_ghost, int x_off, int y_off,
@@ -549,4 +570,4 @@ void Noh_Boundary_CUDA(Real *c_device, int nx, int ny, int nz, int n_cells, int
   // launch the boundary kernel
   hipLaunchKernelGGL(Noh_Boundary_kernel, dim1dGrid, dim1dBlock, 0, 0, c_device, nx, ny, nz, n_cells, n_ghost, x_off,
                      y_off, z_off, dx, dy, dz, xbound, ybound, zbound, gamma, t);
-}
\ No newline at end of file
+}
diff --git a/src/grid/cuda_boundaries.h b/src/grid/cuda_boundaries.h
index bbf0a5ab8..da38eee11 100644
--- a/src/grid/cuda_boundaries.h
+++ b/src/grid/cuda_boundaries.h
@@ -16,8 +16,8 @@ void SetGhostCells(Real* c_head, int nx, int ny, int nz, int n_fields, int n_cel
                    int jsize, int ksize, int imin, int jmin, int kmin, int dir);

 void Wind_Boundary_CUDA(Real* c_device, int nx, int ny, int nz, int n_cells, int n_ghost, int x_off, int y_off,
-                        int z_off, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound, Real gamma,
-                        Real t);
+                        int z_off, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound, Real gamma, Real t,
+                        Real velocity_x_cloud_avg, Real density_wind_init);

 void Noh_Boundary_CUDA(Real* c_device, int nx, int ny, int nz, int n_cells, int n_ghost, int x_off, int y_off,
                        int z_off, Real dx, Real dy, Real dz, Real xbound, Real ybound, Real zbound, Real gamma, Real t);
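// A sketch of the unit conversions Wind_Boundary_kernel performs: cgs number
// density and temperature become code-unit mass density and pressure, and the
// 1000 km/s wind speed becomes kpc/kyr. The unit constants follow Cholla's
// kpc / kyr / M_sun system as defined in global.h; confirm against the source
// before reusing the values.
#include <cstdio>

int main()
{
  double const MP = 1.672622e-24, KB = 1.380658e-16;  // proton mass [g], Boltzmann constant [erg/K]
  double const LENGTH_UNIT = 3.08567758e21;           // kpc in cm
  double const TIME_UNIT   = 3.15569e10;              // kyr in s
  double const MASS_UNIT   = 1.98855e33;              // M_sun in g

  double const DENSITY_UNIT  = MASS_UNIT / (LENGTH_UNIT * LENGTH_UNIT * LENGTH_UNIT);
  double const VELOCITY_UNIT = LENGTH_UNIT / TIME_UNIT;
  double const PRESSURE_UNIT = DENSITY_UNIT * VELOCITY_UNIT * VELOCITY_UNIT;

  double const mu = 0.6, n = 1e-2, T = 3e7;  // wind values used in the kernel
  double const density  = n * mu * MP / DENSITY_UNIT;
  double const pressure = n * KB * T / PRESSURE_UNIT;
  double const vx       = 1000.0 * 1.0e5 / VELOCITY_UNIT;  // 1000 km/s -> kpc/kyr

  std::printf("d = %e, P = %e, vx = %e (code units)\n", density, pressure, vx);
  return 0;
}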
diff --git a/src/grid/grid3D.cpp b/src/grid/grid3D.cpp
index ffb446665..215a053ab 100644
--- a/src/grid/grid3D.cpp
+++ b/src/grid/grid3D.cpp
@@ -38,10 +38,18 @@
   #include "../cooling/cooling_cuda.h"  // provides Cooling_Update
 #endif

+#ifdef CLOUD_TRACKING
+  #include "../cloud_tracking/cloud_tracking.h"
+#endif
+
 #ifdef DUST
   #include "../dust/dust_cuda.h"  // provides Dust_Update
 #endif

+#ifdef OUTFLOW_ANALYSIS
+  #include "../analysis/outflow_analysis.h"  // provides Outflow_Analysis
+#endif
+
 /*! \fn Grid3D(void)
  *  \brief Constructor for the Grid. */
 Grid3D::Grid3D(void)
@@ -157,8 +165,16 @@ void Grid3D::Initialize(struct Parameters *P)
   C_cfl = 0.3;

 #ifdef AVERAGE_SLOW_CELLS
-  H.min_dt_slow = 1e-100;  // Initialize the minumum dt to a tiny number
-#endif  // AVERAGE_SLOW_CELLS
+  H.min_dt_slow = 0.005;  // Threshold dt below which cells are treated as slow and averaged
+#endif  // AVERAGE_SLOW_CELLS
+
+#ifdef CLOUD_TRACKING
+  H.density_wind_init = P->density_wind_init;
+#endif
+
+#if defined(CLOUD_TRACKING) || defined(OUTFLOW_ANALYSIS)
+  H.density_cloud_init = P->density_cloud_init;
+#endif

 #ifndef MPI_CHOLLA
@@ -523,8 +539,74 @@ Real Grid3D::Update_Hydro_Grid()

 #ifdef DUST
   // ==Apply dust from dust/dust_cuda.h==
-  Dust_Update(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, H.dt, gama, H.grain_radius);
-#endif  // DUST
+  Real mass_mixed, mass_hot;
+  Real mass_mixed_tot, mass_hot_tot;
+  Dust_Update(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, H.dx, H.dy, H.dz, H.dt, gama, H.grain_radius,
+              &mass_mixed, &mass_hot);
+  #ifdef MPI_CHOLLA
+  MPI_Barrier(world);
+  MPI_Allreduce(&mass_mixed, &mass_mixed_tot, 1, MPI_CHREAL, MPI_SUM, world);
+  MPI_Allreduce(&mass_hot, &mass_hot_tot, 1, MPI_CHREAL, MPI_SUM, world);
+  #else
+  mass_mixed_tot = mass_mixed;
+  mass_hot_tot   = mass_hot;
+  #endif  // MPI_CHOLLA
+  chprintf("** Mixed sputtered mass: %e  Hot sputtered mass: %e \n", mass_mixed_tot, mass_hot_tot);
+#endif  // DUST
+
+#ifdef CLOUD_TRACKING
+  Real mass_cloud_tracked, integrand_cloud, velocity_x_cloud_avg;
+  Real integrand_reduced, mass_reduced;
+  // Do the grid-wide reduction to get the sum of rho*vx*V and the total mass for the entire cloud
+  Cloud_Velocity_Reduction(C.device, H.nx, H.ny, H.nz, H.dx, H.dy, H.dz, H.n_ghost, H.n_fields, H.density_cloud_init,
+                           H.density_wind_init, &mass_cloud_tracked, &integrand_cloud);
+
+  #ifdef MPI_CHOLLA
+  MPI_Allreduce(&integrand_cloud, &integrand_reduced, 1, MPI_CHREAL, MPI_SUM, world);
+  MPI_Allreduce(&mass_cloud_tracked, &mass_reduced, 1, MPI_CHREAL, MPI_SUM, world);
+  #else
+  integrand_reduced = integrand_cloud;
+  mass_reduced      = mass_cloud_tracked;
+  #endif  // MPI_CHOLLA
+
+  // Calculate the mass-averaged x-velocity (Shin et al. (2008) eq. 9)
+  if ((integrand_reduced == 0) or (mass_reduced == 0)) {
+    velocity_x_cloud_avg = 0;
+  } else {
+    velocity_x_cloud_avg = integrand_reduced / mass_reduced;
+  }
+
+  // Update the cumulative reference frame shift
+  H.velocity_x_cloud_avg += velocity_x_cloud_avg;
+
+  // printf("cumulative velocity: %e\n", H.velocity_x_cloud_avg);
+
+  chprintf("Average cloud velocity = %e km/s\n", velocity_x_cloud_avg * KPC / TIME_UNIT);
+  chprintf("Mass = %e M_sun\n", mass_reduced);
+
+  #ifdef MPI_CHOLLA
+  MPI_Barrier(world);
+  #endif
+  // Subtract this timestep's reference frame shift off from the entire grid
+  Update_Grid_Frame(C.device, H.nx, H.ny, H.nz, H.n_ghost, H.n_fields, velocity_x_cloud_avg);
+
+#endif  // CLOUD_TRACKING
+
+#ifdef DUST
+  #ifdef OUTFLOW_ANALYSIS
+  Real mass_cloud = 0, mass_dust = 0;
+
+  Outflow_Analysis(C.device, H.nx, H.ny, H.nz, H.dx, H.dy, H.dz, H.n_ghost, H.n_fields, &mass_cloud, &mass_dust,
+                   H.density_cloud_init);
+
+  Real arr_unreduced[2] = {mass_cloud, mass_dust};
+  Real arr_reduced[2];
+    #ifdef MPI_CHOLLA
+  MPI_Barrier(world);
+  MPI_Allreduce(&arr_unreduced, &arr_reduced, 2, MPI_CHREAL, MPI_SUM, world);
+    #else
+  arr_reduced[0] = arr_unreduced[0];
+  arr_reduced[1] = arr_unreduced[1];
+    #endif  // MPI_CHOLLA
+
+  chprintf("@@ Cloud mass: %e  Dust mass: %e \n", arr_reduced[0], arr_reduced[1]);
+  #endif  // OUTFLOW_ANALYSIS
+#endif  // DUST

 #ifdef CHEMISTRY_GPU
   // Update the H and He ionization fractions and apply cooling and photoheating
@@ -533,6 +615,7 @@ Real Grid3D::Update_Hydro_Grid()
     Timer.Chemistry.RecordTime(Chem.H.runtime_chemistry_step);
     non_hydro_elapsed_time += Chem.H.runtime_chemistry_step;
 #endif
+
   C.HI_density  = &C.host[H.n_cells * grid_enum::HI_density];
   C.HII_density = &C.host[H.n_cells * grid_enum::HII_density];
   C.HeI_density = &C.host[H.n_cells * grid_enum::HeI_density];
diff --git a/src/grid/grid3D.h b/src/grid/grid3D.h
index e248f6490..4ca127180 100644
--- a/src/grid/grid3D.h
+++ b/src/grid/grid3D.h
@@ -214,6 +214,14 @@ struct Header {
   Real min_dt_slow;
 #endif

+#ifdef CLOUD_TRACKING
+  Real density_wind_init;
+  Real velocity_x_cloud_avg = 0;
+#endif  // CLOUD_TRACKING
+
+#if defined(CLOUD_TRACKING) || defined(OUTFLOW_ANALYSIS)
+  Real density_cloud_init;
+#endif
   /*! \var t_wall
    *  \brief Wall time */
   Real t_wall;
@@ -518,6 +526,10 @@ class Grid3D
    *  \brief Write xy, xz, and yz slices of all data to a file. */
   void Write_Slices_HDF5(hid_t file_id);

+  /*! \fn void Write_Edges_HDF5(hid_t file_id)
+   *  \brief Write the simulation boundaries. */
+  void Write_Edges_HDF5(hid_t file_id);
+
 #endif

   /*! \fn void Read_Grid(struct Parameters P)
@@ -681,7 +693,7 @@ class Grid3D
    *  gravitational collapse */
   void Spherical_Overdensity_3D();

-  void Clouds();
+  void Clouds(struct Parameters P);

   void Uniform_Grid();
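// A serial sketch of the mass-weighted cloud velocity used above (Shin et al.
// 2008, eq. 9): <v_x> = sum(rho * v_x * dV) / sum(rho * dV) over cells above the
// tracking density threshold. The solver computes the two sums with a GPU
// reduction followed by MPI_Allreduce; the toy cells below are illustrative.
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
  // {density, x-velocity} pairs; cell volume dV = 1 throughout
  std::vector<std::pair<double, double>> cells = {{10.0, 0.5}, {12.0, 0.3}, {0.01, 50.0}};
  double const threshold = 1.0;  // only dense (cloud) cells contribute

  double integrand = 0.0, mass = 0.0;
  for (auto const &[rho, vx] : cells) {
    if (rho >= threshold) {
      integrand += rho * vx;  // rho * v_x * dV
      mass += rho;            // rho * dV
    }
  }
  double const v_avg = (mass == 0.0) ? 0.0 : integrand / mass;  // guard empty reductions
  std::printf("<v_x> = %f\n", v_avg);  // (10*0.5 + 12*0.3) / 22 = 0.3909...
  return 0;
}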
diff --git a/src/grid/initial_conditions.cpp b/src/grid/initial_conditions.cpp
index af558be8f..d6da8df19 100644
--- a/src/grid/initial_conditions.cpp
+++ b/src/grid/initial_conditions.cpp
@@ -68,7 +68,7 @@ void Grid3D::Set_Initial_Conditions(Parameters P)
   } else if (strcmp(P.init, "Spherical_Overdensity_3D") == 0) {
     Spherical_Overdensity_3D();
   } else if (strcmp(P.init, "Clouds") == 0) {
-    Clouds();
+    Clouds(P);
   } else if (strcmp(P.init, "Read_Grid") == 0) {
 #ifndef ONLY_PARTICLES
     Read_Grid(P);
@@ -1310,7 +1310,7 @@ void Grid3D::Spherical_Overdensity_3D()

 /*! \fn void Clouds()
  *  \brief Bunch of clouds. */
-void Grid3D::Clouds()
+void Grid3D::Clouds(struct Parameters P)
 {
   int i, j, k, id;
   int istart, jstart, kstart, iend, jend, kend;
@@ -1324,7 +1324,7 @@ void Grid3D::Clouds()
   Real p_bg, p_cl;     // background and cloud pressure
   Real mu = 0.6;       // mean atomic weight
   int N_cl = 1;        // number of clouds
-  Real R_cl = 2.5;     // cloud radius in code units (kpc)
+  Real R_cl = 0.1;     // cloud radius in code units (kpc)
   Real cl_pos[N_cl][3];  // array of cloud positions
   Real r;

@@ -1339,22 +1339,28 @@ void Grid3D::Clouds()
   // single centered cloud setup
   for (int nn = 0; nn < N_cl; nn++) {
-    cl_pos[nn][0] = 0.5 * H.xdglobal;
+    cl_pos[nn][0] = 0.0375 * H.xdglobal;
     cl_pos[nn][1] = 0.5 * H.ydglobal;
     cl_pos[nn][2] = 0.5 * H.zdglobal;
     printf("Cloud positions: %f %f %f\n", cl_pos[nn][0], cl_pos[nn][1], cl_pos[nn][2]);
   }

-  n_bg = 1.68e-4;
-  n_cl = 5.4e-2;
+  n_bg   = 1.0e-2;
+  n_cl   = 10;
   rho_bg = n_bg * mu * MP / DENSITY_UNIT;
   rho_cl = n_cl * mu * MP / DENSITY_UNIT;
-  vx_bg = 0.0;
+#ifdef CLOUD_TRACKING
+  rho_cl = P.density_cloud_init / DENSITY_UNIT;
+  rho_bg = P.density_wind_init / DENSITY_UNIT;
+  printf("Cloud initial density: %e\n", P.density_cloud_init);
+  printf("Wind initial density: %e\n", P.density_wind_init);
+#endif
+  vx_bg = 1000 * TIME_UNIT / KPC;
   // vx_c = -200*TIME_UNIT/KPC; // convert from km/s to kpc/kyr
-  vx_cl = 0.0;
+  vx_cl = 0 * TIME_UNIT / KPC;
   vy_bg = vy_cl = 0.0;
   vz_bg = vz_cl = 0.0;
-  T_bg = 3e6;
+  T_bg  = 3e7;
   T_cl  = 1e4;
   p_bg  = n_bg * KB * T_bg / PRESSURE_UNIT;
   p_cl  = p_bg;
@@ -1399,7 +1405,7 @@ void Grid3D::Clouds()
 #ifdef DUST
         C.host[id + H.n_cells * grid_enum::dust_density] = 0.0;
 #endif
-#endif
+#endif  // SCALAR
         // add clouds
         for (int nn = 0; nn < N_cl; nn++) {
           r = sqrt((x_pos - cl_pos[nn][0]) * (x_pos - cl_pos[nn][0]) +
@@ -1417,6 +1423,9 @@ void Grid3D::Clouds()
 #ifdef SCALAR
   #ifdef DUST
             C.host[id + H.n_cells * grid_enum::dust_density] = rho_cl * 1e-2;
+            if (isnan(C.host[id + H.n_cells * grid_enum::dust_density])) {
+              printf("there's a nan in IC %d %d %d\n", i, j, k);
+            }
   #endif  // DUST
 #endif  // SCALAR
           }
@@ -1482,18 +1491,18 @@ void Grid3D::Zeldovich_Pancake(struct Parameters P)
   Real H0, h, Omega_M, rho_0, G, z_zeldovich, z_init, x_center, T_init, k_x;

   chprintf("Setting Zeldovich Pancake initial conditions...\n");
-  H0 = P.H0;
-  h = H0 / 100;
+  H0      = P.H0;
+  h       = H0 / 100;
   Omega_M = P.Omega_M;

   chprintf(" h = %f \n", h);
   chprintf(" Omega_M = %f \n", Omega_M);

   H0 /= 1000;  //[km/s / kpc]
-  G = G_COSMO;
-  rho_0 = 3 * H0 * H0 / (8 * M_PI * G) * Omega_M / h / h;
+  G           = G_COSMO;
+  rho_0       = 3 * H0 * H0 / (8 * M_PI * G) * Omega_M / h / h;
   z_zeldovich = 1;
-  z_init = P.Init_redshift;
+  z_init      = P.Init_redshift;
   chprintf(" rho_0 = %f \n", rho_0);
   chprintf(" z_init = %f \n", z_init);
   chprintf(" z_zeldovich = %f \n", z_zeldovich);
@@ -1553,17 +1562,17 @@ void Grid3D::Zeldovich_Pancake(struct Parameters P)
         index = (int(x_pos / H.dx) + 0) % 256;
         // index = ( index + 16 ) % 256;
         dens = ics_values[0 * nPoints + index];
-        vel = ics_values[1 * nPoints + index];
-        E = ics_values[2 * nPoints + index];
-        U = ics_values[3 * nPoints + index];
+        vel  = ics_values[1 * nPoints + index];
+        E    = ics_values[2 * nPoints + index];
+        U    = ics_values[3 * nPoints + index];
         //
         //
         // chprintf( "%f \n", vel );
-        C.density[id] = dens;
+        C.density[id]    = dens;
         C.momentum_x[id] = dens * vel;
         C.momentum_y[id] = 0;
         C.momentum_z[id] = 0;
-        C.Energy[id] = E;
+        C.Energy[id]     = E;

 #ifdef DE
         C.GasEnergy[id] = U;
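// A sketch of the pressure-equilibrium setup Clouds() uses: the cloud is
// assigned the background pressure (p_cl = p_bg), so the density contrast fixes
// the implied cloud temperature. Values mirror the initial conditions above
// (n_bg = 1e-2 cm^-3, T_bg = 3e7 K, n_cl = 10 cm^-3); note the implied
// equilibrium temperature is 3e4 K, while the file's nominal T_cl is 1e4 K.
#include <cstdio>

int main()
{
  double const KB = 1.380658e-16;            // Boltzmann constant [erg/K]
  double const n_bg = 1.0e-2, T_bg = 3.0e7;  // wind number density [cm^-3], temperature [K]
  double const n_cl = 10.0;                  // cloud number density [cm^-3]

  double const p_bg = n_bg * KB * T_bg;      // cgs pressure
  double const p_cl = p_bg;                  // pressure equilibrium
  double const T_cl = p_cl / (n_cl * KB);    // implied cloud temperature

  std::printf("p = %e erg cm^-3, T_cl = %e K\n", p_cl, T_cl);
  return 0;
}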
diff --git a/src/integrators/VL_3D_cuda.cu b/src/integrators/VL_3D_cuda.cu
index 1d30cc0e2..96f2f6a5d 100644
--- a/src/integrators/VL_3D_cuda.cu
+++ b/src/integrators/VL_3D_cuda.cu
@@ -122,7 +122,7 @@ void VL_Algorithm_3D_CUDA(Real *d_conserved, Real *d_grav_potential, int nx, int
 #if defined(GRAVITY)
   dev_grav_potential = d_grav_potential;
 #else   // not GRAVITY
-  dev_grav_potential = NULL;
+  dev_grav_potential   = NULL;
 #endif  // GRAVITY

   // If memory is single allocated: memory_allocated becomes true and
diff --git a/src/io/io.cpp b/src/io/io.cpp
index 3408bbce7..e4d9a4769 100644
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -155,6 +155,12 @@ void Write_Data(Grid3D &G, struct Parameters P, int nfile)
   }
 #endif /*SLICES*/

+#ifdef OUTFLOW_ANALYSIS
+  if (nfile % P.n_slice == 0) {
+    Output_Edges(G, P, nfile);
+  }
+#endif /*OUTFLOW_ANALYSIS*/
+
 #ifdef PARTICLES
   if (nfile % P.n_particle == 0) {
     G.WriteData_Particles(P, nfile);
@@ -552,6 +558,50 @@ void Output_Slices(Grid3D &G, struct Parameters P, int nfile)
 #endif  // HDF5
 }

+/* Output the grid data on the simulation boundary (edge) faces. */
+void Output_Edges(Grid3D &G, struct Parameters P, int nfile)
+{
+#ifdef HDF5
+  hid_t file_id;
+  herr_t status;
+
+  // create the filename
+  std::string filename(P.outdir);
+  filename += std::to_string(nfile);
+  filename += "_edges.h5";
+
+  #ifdef MPI_CHOLLA
+  filename += "." + std::to_string(procID);
+  #endif /*MPI_CHOLLA*/
+
+  // Create a new file
+  file_id = H5Fcreate(filename.data(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+
+  // Write header (file attributes)
+  G.Write_Header_HDF5(file_id);
+
+  // Write the edge data of all variables to the output file
+  G.Write_Edges_HDF5(file_id);
+
+  // Close the file
+  status = H5Fclose(file_id);
+
+  #ifdef MPI_CHOLLA
+  if (status < 0) {
+    printf("Output_Edges: File write failed. ProcID: %d\n", procID);
+    chexit(-1);
+  }
+  #else   // MPI_CHOLLA is not defined
+  if (status < 0) {
+    printf("Output_Edges: File write failed.\n");
+    exit(-1);
+  }
+  #endif  // MPI_CHOLLA
+#else   // HDF5 is not defined
+  printf("Output_Edges only defined for hdf5 writes.\n");
+#endif  // HDF5
+}
+
 /*! \fn void Write_Header_Text(FILE *fp)
  *  \brief Write some relevant header info to a text output file. */
 void Grid3D::Write_Header_Text(FILE *fp)
@@ -1375,12 +1425,12 @@ void Grid3D::Write_Grid_HDF5(hid_t file_id)
 #ifdef OUTPUT_METALS
   output_metals = true;
 #else   // not OUTPUT_METALS
-  output_metals = false;
+  output_metals    = false;
 #endif  // OUTPUT_METALS
 #ifdef OUTPUT_ELECTRONS
   output_electrons = true;
 #else   // not OUTPUT_ELECTRONS
-  output_electrons = false;
+  output_electrons   = false;
 #endif  // OUTPUT_ELECTRONS
 #ifdef OUTPUT_FULL_IONIZATION
   output_full_ionization = true;
@@ -1570,7 +1620,7 @@ void Grid3D::Write_Projection_HDF5(hid_t file_id)
         Real const mx = C.momentum_x[id];
         Real const my = C.momentum_y[id];
         Real const mz = C.momentum_z[id];
-        Real const E = C.Energy[id];
+        Real const E  = C.Energy[id];

 #ifdef MHD
         auto const [magnetic_x, magnetic_y, magnetic_z] =
@@ -1794,7 +1844,7 @@ void Grid3D::Write_Rotated_Projection_HDF5(hid_t file_id)
         Real const mx = C.momentum_x[id];
         Real const my = C.momentum_y[id];
         Real const mz = C.momentum_z[id];
-        Real const E = C.Energy[id];
+        Real const E  = C.Energy[id];

 #ifdef MHD
         auto const [magnetic_x, magnetic_y, magnetic_z] =
@@ -1858,7 +1908,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
   Real *dataset_buffer_GE;
 #endif
 #ifdef SCALAR
-  Real *dataset_buffer_scalar;
+  #ifdef BASIC_SCALAR
+  Real *dataset_buffer_basic_scalar;
+  #endif
+  #ifdef DUST
+  Real *dataset_buffer_dust;
+  #endif
 #endif
   herr_t status;
   int xslice, yslice, zslice;
@@ -1898,8 +1953,13 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
     dataset_buffer_GE = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
 #endif
 #ifdef SCALAR
-    dataset_buffer_scalar = (Real *)malloc(NSCALARS * H.nx_real * H.ny_real * sizeof(Real));
-  #endif
+  #ifdef BASIC_SCALAR
+    dataset_buffer_basic_scalar = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+  #endif
+  #ifdef DUST
+    dataset_buffer_dust = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+  #endif
+  #endif

     // Copy the xy slices to the memory buffers
     for (j = 0; j < H.ny_real; j++) {
@@ -1940,9 +2000,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
           dataset_buffer_GE[buf_id] = C.GasEnergy[id];
 #endif
 #ifdef SCALAR
-          for (int ii = 0; ii < NSCALARS; ii++) {
-            dataset_buffer_scalar[buf_id + ii * H.nx * H.ny] = C.scalar[id + ii * H.n_cells];
-          }
+  #ifdef BASIC_SCALAR
+          dataset_buffer_basic_scalar[buf_id] = C.basic_scalar[id];
+  #endif
+  #ifdef DUST
+          dataset_buffer_dust[buf_id] = C.dust_density[id];
+  #endif
 #endif
 #ifdef MPI_CHOLLA
         }
@@ -1962,9 +2025,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
           dataset_buffer_GE[buf_id] = 0;
 #endif
 #ifdef SCALAR
-          for (int ii = 0; ii < NSCALARS; ii++) {
-            dataset_buffer_scalar[buf_id + ii * H.nx * H.ny] = 0;
-          }
+  #ifdef BASIC_SCALAR
+          dataset_buffer_basic_scalar[buf_id] = 0;
+  #endif
+  #ifdef DUST
+          dataset_buffer_dust[buf_id] = 0;
+  #endif
 #endif
         }
 #endif  // MPI_CHOLLA
@@ -1986,7 +2052,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
     status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_GE, "/GE_xy");
 #endif
 #ifdef SCALAR
-    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_scalar, "/scalar_xy");
+  #ifdef BASIC_SCALAR
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_basic_scalar, "/basic_scalar_xy");
+  #endif
+  #ifdef DUST
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_dust, "/d_dust_xy");
+  #endif
 #endif
     // Free the dataspace id
     status = H5Sclose(dataspace_id);
@@ -2001,7 +2072,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
     free(dataset_buffer_GE);
 #endif
 #ifdef SCALAR
-    free(dataset_buffer_scalar);
+  #ifdef BASIC_SCALAR
+    free(dataset_buffer_basic_scalar);
+  #endif
+  #ifdef DUST
+    free(dataset_buffer_dust);
+  #endif
 #endif

     // Create the xz data space for the datasets
@@ -2024,7 +2100,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
     dataset_buffer_GE = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real));
 #endif
 #ifdef SCALAR
-    dataset_buffer_scalar = (Real *)malloc(NSCALARS * H.nx_real * H.nz_real * sizeof(Real));
+  #ifdef BASIC_SCALAR
+    dataset_buffer_basic_scalar = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real));
+  #endif
+  #ifdef DUST
+    dataset_buffer_dust = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real));
+  #endif
 #endif

     // Copy the xz slices to the memory buffers
@@ -2066,9 +2147,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
           dataset_buffer_GE[buf_id] = C.GasEnergy[id];
 #endif
 #ifdef SCALAR
-          for (int ii = 0; ii < NSCALARS; ii++) {
-            dataset_buffer_scalar[buf_id + ii * H.nx * H.nz] = C.scalar[id + ii * H.n_cells];
-          }
+  #ifdef BASIC_SCALAR
+          dataset_buffer_basic_scalar[buf_id] = C.basic_scalar[id];
+  #endif
+  #ifdef DUST
+          dataset_buffer_dust[buf_id] = C.dust_density[id];
+  #endif
 #endif
 #ifdef MPI_CHOLLA
         }
@@ -2088,9 +2172,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
           dataset_buffer_GE[buf_id] = 0;
 #endif
 #ifdef SCALAR
-          for (int ii = 0; ii < NSCALARS; ii++) {
-            dataset_buffer_scalar[buf_id + ii * H.nx * H.nz] = 0;
-          }
+  #ifdef BASIC_SCALAR
+          dataset_buffer_basic_scalar[buf_id] = 0;
+  #endif
+  #ifdef DUST
+          dataset_buffer_dust[buf_id] = 0;
+  #endif
 #endif
         }
 #endif  // MPI_CHOLLA
@@ -2112,7 +2199,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
     status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_GE, "/GE_xz");
 #endif
 #ifdef SCALAR
-    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_scalar, "/scalar_xz");
+  #ifdef BASIC_SCALAR
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_basic_scalar, "/basic_scalar_xz");
+  #endif
+  #ifdef DUST
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_dust, "/d_dust_xz");
+  #endif
 #endif

     // Free the dataspace id
@@ -2128,7 +2220,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
     free(dataset_buffer_GE);
 #endif
 #ifdef SCALAR
-    free(dataset_buffer_scalar);
+  #ifdef BASIC_SCALAR
+    free(dataset_buffer_basic_scalar);
+  #endif
+  #ifdef DUST
+    free(dataset_buffer_dust);
+  #endif
 #endif

     // Create the yz data space for the datasets
@@ -2151,7 +2248,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
     dataset_buffer_GE = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
 #endif
 #ifdef SCALAR
-    dataset_buffer_scalar = (Real *)malloc(NSCALARS * H.ny_real * H.nz_real * sizeof(Real));
+  #ifdef BASIC_SCALAR
+    dataset_buffer_basic_scalar = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+  #endif
+  #ifdef DUST
+    dataset_buffer_dust = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+  #endif
 #endif

     // Copy the yz slices to the memory buffers
@@ -2192,9 +2294,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
          dataset_buffer_GE[buf_id] = C.GasEnergy[id];
 #endif
 #ifdef SCALAR
-          for (int ii = 0; ii < NSCALARS; ii++) {
-            dataset_buffer_scalar[buf_id + ii * H.ny * H.nz] = C.scalar[id + ii * H.n_cells];
-          }
+  #ifdef DUST
+          dataset_buffer_dust[buf_id] = C.dust_density[id];
+  #endif
+  #ifdef BASIC_SCALAR
+          dataset_buffer_basic_scalar[buf_id] = C.basic_scalar[id];
+  #endif
 #endif
 #ifdef MPI_CHOLLA
         }
@@ -2214,9 +2319,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
          dataset_buffer_GE[buf_id] = 0;
 #endif
 #ifdef SCALAR
-          for (int ii = 0; ii < NSCALARS; ii++) {
-            dataset_buffer_scalar[buf_id + ii * H.ny * H.nz] = 0;
-          }
+  #ifdef BASIC_SCALAR
+          dataset_buffer_basic_scalar[buf_id] = 0;
+  #endif
+  #ifdef DUST
+          dataset_buffer_dust[buf_id] = 0;
+  #endif
 #endif
         }
 #endif  // MPI_CHOLLA
@@ -2238,7 +2346,12 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
     status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_GE, "/GE_yz");
 #endif
 #ifdef SCALAR
-    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_scalar, "/scalar_yz");
+  #ifdef BASIC_SCALAR
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_basic_scalar, "/basic_scalar_yz");
+  #endif
+  #ifdef DUST
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_dust, "/d_dust_yz");
+  #endif
 #endif

     // Free the dataspace id
@@ -2254,74 +2367,668 @@ void Grid3D::Write_Slices_HDF5(hid_t file_id)
     free(dataset_buffer_GE);
 #endif
 #ifdef SCALAR
-    free(dataset_buffer_scalar);
+  #ifdef BASIC_SCALAR
+    free(dataset_buffer_basic_scalar);
+  #endif
+  #ifdef DUST
+    free(dataset_buffer_dust);
+  #endif
 #endif
   } else {
     printf("Slice write only works for 3D data.\n");
   }
 }
-#endif  // HDF5

-/*! \fn void Read_Grid(struct Parameters P)
- *  \brief Read in grid data from an output file. */
-void Grid3D::Read_Grid(struct Parameters P)
+void Grid3D::Write_Edges_HDF5(hid_t file_id)
 {
-  ScopedTimer timer("Read_Grid");
-  int nfile = P.nfile;  // output step you want to read from
-
-  // create the filename to read from
-  // assumes your data is in the outdir specified in the input file
-  // strcpy(filename, P.outdir);
-  // Changed to read initial conditions from indir
-  std::string filename(P.indir);
-  filename += std::to_string(P.nfile);
-  char sbuffer[1024];
-
-#if defined BINARY
-  filename += ".bin";
-#elif defined HDF5
-  filename += ".h5";
-#endif  // BINARY or HDF5
-// for now assumes you will run on the same number of processors
-#ifdef MPI_CHOLLA
-  #ifdef TILED_INITIAL_CONDITIONS
-  sprintf(sbuffer, "%sics_%dMpc_%d.h5", P.indir, (int)P.tile_length / 1000,
-          H.nx_real);  // Everyone reads the same file
-  filename = sbuffer;
-  #else   // TILED_INITIAL_CONDITIONS is not defined
+ std::to_string(procID);
-  #endif  // TILED_INITIAL_CONDITIONS
-#endif  // MPI_CHOLLA
-
-#if defined BINARY
-  FILE *fp;
-  // open the file
-  fp = fopen(filename, "r");
-  if (!fp) {
-    printf("Unable to open input file.\n");
-    exit(0);
-  }
-
-  // read in grid data
-  Read_Grid_Binary(fp);
-
-  // close the file
-  fclose(fp);
-
-#elif defined HDF5
-  hid_t file_id;
+  int i, j, k, id, buf_id;
+  hid_t dataspace_id;
+  Real *dataset_buffer_d;
+  Real *dataset_buffer_mx;
+  Real *dataset_buffer_my;
+  Real *dataset_buffer_mz;
+  Real *dataset_buffer_E;
+  #ifdef DE
+  Real *dataset_buffer_GE;
+  #endif
+  #ifdef SCALAR
+  Real *dataset_buffer_scalar;
+  #endif
  herr_t status;
+  int minus_xslice, minus_yslice, minus_zslice, plus_xslice, plus_yslice, plus_zslice;
+  minus_xslice = 0;
+  minus_yslice = 0;
+  minus_zslice = 0;
+  // last valid plane of the local grid; H.nx (etc.) would index one past the end
+  plus_xslice = H.nx - 1;
+  plus_yslice = H.ny - 1;
+  plus_zslice = H.nz - 1;
+  #ifdef MPI_CHOLLA
+  minus_xslice = 0;
+  minus_yslice = 0;
+  minus_zslice = 0;
+  plus_xslice = nx_global - 1;
+  plus_yslice = ny_global - 1;
+  plus_zslice = nz_global - 1;
+  #endif
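+  // Face-plane bookkeeping: minus_* and plus_* select the first and last planes
+  // along each axis. With MPI they are global real-cell indices that the copy
+  // loops below remap into each rank's ghost-padded local frame; in a serial
+  // build they index the local array (ghost zones included) directly.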
+  // 3D
+  if (H.nx > 1 && H.ny > 1 && H.nz > 1) {
+    int nx_dset = H.nx_real;
+    int ny_dset = H.ny_real;
+    int nz_dset = H.nz_real;
+    hsize_t dims[2];
+    ////////////////////////////////////////////////////////////
+    // Create the -xy data space for the datasets
+    dims[0] = nx_dset;
+    dims[1] = ny_dset;
+    dataspace_id = H5Screate_simple(2, dims, NULL);
-
-  // open the file
-  file_id = H5Fopen(filename.data(), H5F_ACC_RDONLY, H5P_DEFAULT);
-  if (file_id < 0) {
-    std::cout << "Unable to open input file: " << filename << std::endl;
-    exit(0);
-  }
-
-  // read in grid data
-  Read_Grid_HDF5(file_id, P);
+    // Allocate memory for the -xy slices
+    dataset_buffer_d = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+    dataset_buffer_mx = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+    dataset_buffer_my = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+    dataset_buffer_mz = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+    dataset_buffer_E = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+  #ifdef DE
+    dataset_buffer_GE = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+  #endif
+  #ifdef SCALAR
+    dataset_buffer_scalar = (Real *)malloc(NSCALARS * H.nx_real * H.ny_real * sizeof(Real));
+  #endif
+    // Copy the -xy slices to the memory buffers
+    for (j = 0; j < H.ny_real; j++) {
+      for (i = 0; i < H.nx_real; i++) {
+        id = cuda_utilities::compute1DIndex(i + H.n_ghost, j + H.n_ghost, minus_zslice, H.nx, H.ny);
+        buf_id = j + i * H.ny_real;
+  #ifdef MPI_CHOLLA
+        // When there are multiple processes, check whether this slice is in
+        // your domain
+        if (minus_zslice >= nz_local_start && minus_zslice < nz_local_start + nz_local) {
+          id = cuda_utilities::compute1DIndex(i + H.n_ghost, j + H.n_ghost, minus_zslice - nz_local_start + H.n_ghost,
+                                              H.nx, H.ny);
+  #endif  // MPI_CHOLLA
+          dataset_buffer_d[buf_id] = C.density[id];
+          dataset_buffer_mx[buf_id] = C.momentum_x[id];
+          dataset_buffer_my[buf_id] = C.momentum_y[id];
+          dataset_buffer_mz[buf_id] = C.momentum_z[id];
+          dataset_buffer_E[buf_id] = C.Energy[id];
+  #ifdef DE
+          dataset_buffer_GE[buf_id] = C.GasEnergy[id];
+  #endif
+  #ifdef SCALAR
+          for (int ii = 0; ii < NSCALARS; ii++) {
+            dataset_buffer_scalar[buf_id + ii * H.nx * H.ny] = C.scalar[id + ii * H.n_cells];
+          }
+  #endif
+  #ifdef MPI_CHOLLA
+        }
+        // if the slice isn't in your domain, just write out zeros
+        else {
+          dataset_buffer_d[buf_id] = 0;
+          dataset_buffer_mx[buf_id] = 0;
+          dataset_buffer_my[buf_id] = 0;
+          dataset_buffer_mz[buf_id] = 0;
+          dataset_buffer_E[buf_id] = 0;
+  #ifdef DE
+          dataset_buffer_GE[buf_id] = 0;
+  #endif
+  #ifdef SCALAR
+          for (int ii = 0; ii < NSCALARS; ii++) {
+            dataset_buffer_scalar[buf_id + ii * H.nx * H.ny] = 0;
+          }
+  #endif
+        }
+  #endif  // MPI_CHOLLA
+      }
+    }
+    // Write out the xy datasets for each variable
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_d, "/d_minus_xy");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_mx, "/mx_minus_xy");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_my, "/my_minus_xy");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_mz, "/mz_minus_xy");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_E, "/E_minus_xy");
+  #ifdef DE
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_GE, "/GE_minus_xy");
+  #endif
+  #ifdef SCALAR
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_scalar, "/scalar_minus_xy");
+  #endif
+    // Free the dataspace id
+    status = H5Sclose(dataspace_id);
+
+    // free the dataset buffers
+    free(dataset_buffer_d);
+    free(dataset_buffer_mx);
+    free(dataset_buffer_my);
+    free(dataset_buffer_mz);
+    free(dataset_buffer_E);
+  #ifdef DE
+    free(dataset_buffer_GE);
+  #endif
+  #ifdef SCALAR
+    free(dataset_buffer_scalar);
+  #endif
+    ////////////////////////////////////////////////////////////
+    // Create the +xy data space for the datasets
+    dims[0] = nx_dset;
+    dims[1] = ny_dset;
+    dataspace_id = H5Screate_simple(2, dims, NULL);
+
+    // Allocate memory for the +xy slices
+    dataset_buffer_d = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+    dataset_buffer_mx = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+    dataset_buffer_my = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+    dataset_buffer_mz = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+    dataset_buffer_E = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+  #ifdef DE
+    dataset_buffer_GE = (Real *)malloc(H.nx_real * H.ny_real * sizeof(Real));
+  #endif
+  #ifdef SCALAR
+    dataset_buffer_scalar = (Real *)malloc(NSCALARS * H.nx_real * H.ny_real * sizeof(Real));
+  #endif
+    // Copy the +xy slices to the memory buffers
+    for (j = 0; j < H.ny_real; j++) {
+      for (i = 0; i < H.nx_real; i++) {
+        id = cuda_utilities::compute1DIndex(i + H.n_ghost, j + H.n_ghost, plus_zslice, H.nx, H.ny);
+        buf_id = j + i * H.ny_real;
+  #ifdef MPI_CHOLLA
+        // When there are multiple processes, check whether this slice is in
+        // your domain
+        if (plus_zslice >= nz_local_start && plus_zslice < nz_local_start + nz_local) {
+          id = cuda_utilities::compute1DIndex(i + H.n_ghost, j + H.n_ghost, plus_zslice - nz_local_start + H.n_ghost,
+                                              H.nx, H.ny);
+  #endif  // MPI_CHOLLA
+          dataset_buffer_d[buf_id] = C.density[id];
+          dataset_buffer_mx[buf_id] = C.momentum_x[id];
+          dataset_buffer_my[buf_id] = C.momentum_y[id];
+          dataset_buffer_mz[buf_id] = C.momentum_z[id];
+          dataset_buffer_E[buf_id] = C.Energy[id];
+  #ifdef DE
+          dataset_buffer_GE[buf_id] = C.GasEnergy[id];
+  #endif
+  #ifdef SCALAR
+          for (int ii = 0; ii < NSCALARS; ii++) {
+            dataset_buffer_scalar[buf_id + ii * H.nx * H.ny] = C.scalar[id + ii * H.n_cells];
+          }
+  #endif
+  #ifdef MPI_CHOLLA
+        }
+        // if the slice isn't in your domain, just write out zeros
+        else {
+          dataset_buffer_d[buf_id] = 0;
+          dataset_buffer_mx[buf_id] = 0;
+          dataset_buffer_my[buf_id] = 0;
+          dataset_buffer_mz[buf_id] = 0;
+          dataset_buffer_E[buf_id] = 0;
+  #ifdef DE
+          dataset_buffer_GE[buf_id] = 0;
+  #endif
+  #ifdef SCALAR
+          for (int ii = 0; ii < NSCALARS; ii++) {
+            dataset_buffer_scalar[buf_id + ii * H.nx * H.ny] = 0;
+          }
+  #endif
+        }
+  #endif  // MPI_CHOLLA
+      }
+    }
+    // Write out the xy
datasets for each variable + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_d, "/d_plus_xy"); + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_mx, "/mx_plus_xy"); + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_my, "/my_plus_xy"); + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_mz, "/mz_plus_xy"); + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_E, "/E_plus_xy"); + #ifdef DE + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_GE, "/GE_plus_xy"); + #endif + #ifdef SCALAR + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_scalar, "/scalar_plus_xy"); + #endif + // Free the dataspace id + status = H5Sclose(dataspace_id); + + // free the dataset buffers + free(dataset_buffer_d); + free(dataset_buffer_mx); + free(dataset_buffer_my); + free(dataset_buffer_mz); + free(dataset_buffer_E); + #ifdef DE + free(dataset_buffer_GE); + #endif + #ifdef SCALAR + free(dataset_buffer_scalar); + #endif + //////////////////////////////////////////////////////////// + + // Create the -xz data space for the datasets + dims[0] = nx_dset; + dims[1] = nz_dset; + dataspace_id = H5Screate_simple(2, dims, NULL); + + // allocate the memory for the -xz slices + dataset_buffer_d = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + dataset_buffer_mx = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + dataset_buffer_my = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + dataset_buffer_mz = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + dataset_buffer_E = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + #ifdef DE + dataset_buffer_GE = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + #endif + #ifdef SCALAR + dataset_buffer_scalar = (Real *)malloc(NSCALARS * H.nx_real * H.nz_real * sizeof(Real)); + #endif + + // Copy the xz slices to the memory buffers + for (k = 0; k < H.nz_real; k++) { + for (i = 0; i < H.nx_real; i++) { + id = cuda_utilities::compute1DIndex(i + H.n_ghost, minus_yslice, k + H.n_ghost, H.nx, H.ny); + buf_id = k + i * H.nz_real; + #ifdef MPI_CHOLLA + // When there are multiple processes, check whether this slice is in + // your domain + if (minus_yslice >= ny_local_start && minus_yslice < ny_local_start + ny_local) { + id = cuda_utilities::compute1DIndex(i + H.n_ghost, minus_yslice - ny_local_start + H.n_ghost, k + H.n_ghost, + H.nx, H.ny); + #endif // MPI_CHOLLA + dataset_buffer_d[buf_id] = C.density[id]; + dataset_buffer_mx[buf_id] = C.momentum_x[id]; + dataset_buffer_my[buf_id] = C.momentum_y[id]; + dataset_buffer_mz[buf_id] = C.momentum_z[id]; + dataset_buffer_E[buf_id] = C.Energy[id]; + #ifdef DE + dataset_buffer_GE[buf_id] = C.GasEnergy[id]; + #endif + #ifdef SCALAR + for (int ii = 0; ii < NSCALARS; ii++) { + dataset_buffer_scalar[buf_id + ii * H.nx * H.nz] = C.scalar[id + ii * H.n_cells]; + } + #endif + #ifdef MPI_CHOLLA + } + // if the slice isn't in your domain, just write out zeros + else { + dataset_buffer_d[buf_id] = 0; + dataset_buffer_mx[buf_id] = 0; + dataset_buffer_my[buf_id] = 0; + dataset_buffer_mz[buf_id] = 0; + dataset_buffer_E[buf_id] = 0; + #ifdef DE + dataset_buffer_GE[buf_id] = 0; + #endif + #ifdef SCALAR + for (int ii = 0; ii < NSCALARS; ii++) { + dataset_buffer_scalar[buf_id + ii * H.nx * H.nz] = 0; + } + #endif + } + #endif // MPI_CHOLLA + } + } + // Write out the xz datasets for each variable + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_d, "/d_minus_xz"); + status = Write_HDF5_Dataset(file_id, 
dataspace_id, dataset_buffer_mx, "/mx_minus_xz"); + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_my, "/my_minus_xz"); + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_mz, "/mz_minus_xz"); + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_E, "/E_minus_xz"); + #ifdef DE + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_GE, "/GE_minus_xz"); + #endif + #ifdef SCALAR + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_scalar, "/scalar_minus_xz"); + #endif + + // Free the dataspace id + status = H5Sclose(dataspace_id); + + // free the dataset buffers + free(dataset_buffer_d); + free(dataset_buffer_mx); + free(dataset_buffer_my); + free(dataset_buffer_mz); + free(dataset_buffer_E); + #ifdef DE + free(dataset_buffer_GE); + #endif + #ifdef SCALAR + free(dataset_buffer_scalar); + #endif + //////////////////////////////////////////////////////////// + // Create the +xz data space for the datasets + dims[0] = nx_dset; + dims[1] = nz_dset; + dataspace_id = H5Screate_simple(2, dims, NULL); + + // allocate the memory for the +xz slices + dataset_buffer_d = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + dataset_buffer_mx = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + dataset_buffer_my = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + dataset_buffer_mz = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + dataset_buffer_E = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + #ifdef DE + dataset_buffer_GE = (Real *)malloc(H.nx_real * H.nz_real * sizeof(Real)); + #endif + #ifdef SCALAR + dataset_buffer_scalar = (Real *)malloc(NSCALARS * H.nx_real * H.nz_real * sizeof(Real)); + #endif + + // Copy the xz slices to the memory buffers + for (k = 0; k < H.nz_real; k++) { + for (i = 0; i < H.nx_real; i++) { + id = cuda_utilities::compute1DIndex(i + H.n_ghost, plus_yslice, k + H.n_ghost, H.nx, H.ny); + buf_id = k + i * H.nz_real; + #ifdef MPI_CHOLLA + // When there are multiple processes, check whether this slice is in + // your domain + if (plus_yslice >= ny_local_start && plus_yslice < ny_local_start + ny_local) { + id = cuda_utilities::compute1DIndex(i + H.n_ghost, plus_yslice - ny_local_start + H.n_ghost, k + H.n_ghost, + H.nx, H.ny); + #endif // MPI_CHOLLA + dataset_buffer_d[buf_id] = C.density[id]; + dataset_buffer_mx[buf_id] = C.momentum_x[id]; + dataset_buffer_my[buf_id] = C.momentum_y[id]; + dataset_buffer_mz[buf_id] = C.momentum_z[id]; + dataset_buffer_E[buf_id] = C.Energy[id]; + #ifdef DE + dataset_buffer_GE[buf_id] = C.GasEnergy[id]; + #endif + #ifdef SCALAR + for (int ii = 0; ii < NSCALARS; ii++) { + dataset_buffer_scalar[buf_id + ii * H.nx * H.nz] = C.scalar[id + ii * H.n_cells]; + } + #endif + #ifdef MPI_CHOLLA + } + // if the slice isn't in your domain, just write out zeros + else { + dataset_buffer_d[buf_id] = 0; + dataset_buffer_mx[buf_id] = 0; + dataset_buffer_my[buf_id] = 0; + dataset_buffer_mz[buf_id] = 0; + dataset_buffer_E[buf_id] = 0; + #ifdef DE + dataset_buffer_GE[buf_id] = 0; + #endif + #ifdef SCALAR + for (int ii = 0; ii < NSCALARS; ii++) { + dataset_buffer_scalar[buf_id + ii * H.nx * H.nz] = 0; + } + #endif + } + #endif // MPI_CHOLLA + } + } + // Write out the xz datasets for each variable + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_d, "/d_plus_xz"); + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_mx, "/mx_plus_xz"); + status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_my, "/my_plus_xz"); + status = 
Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_mz, "/mz_plus_xz");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_E, "/E_plus_xz");
+  #ifdef DE
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_GE, "/GE_plus_xz");
+  #endif
+  #ifdef SCALAR
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_scalar, "/scalar_plus_xz");
+  #endif
+
+    // Free the dataspace id
+    status = H5Sclose(dataspace_id);
+
+    // free the dataset buffers
+    free(dataset_buffer_d);
+    free(dataset_buffer_mx);
+    free(dataset_buffer_my);
+    free(dataset_buffer_mz);
+    free(dataset_buffer_E);
+  #ifdef DE
+    free(dataset_buffer_GE);
+  #endif
+  #ifdef SCALAR
+    free(dataset_buffer_scalar);
+  #endif
+    ////////////////////////////////////////////////////////////
+    // Create the -yz data space for the datasets
+    dims[0] = ny_dset;
+    dims[1] = nz_dset;
+    dataspace_id = H5Screate_simple(2, dims, NULL);
+
+    // allocate the memory for the -yz slices
+    dataset_buffer_d = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+    dataset_buffer_mx = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+    dataset_buffer_my = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+    dataset_buffer_mz = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+    dataset_buffer_E = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+  #ifdef DE
+    dataset_buffer_GE = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+  #endif
+  #ifdef SCALAR
+    dataset_buffer_scalar = (Real *)malloc(NSCALARS * H.ny_real * H.nz_real * sizeof(Real));
+  #endif
+
+    // Copy the yz slices to the memory buffers
+    for (k = 0; k < H.nz_real; k++) {
+      for (j = 0; j < H.ny_real; j++) {
+        id = cuda_utilities::compute1DIndex(minus_xslice, j + H.n_ghost, k + H.n_ghost, H.nx, H.ny);
+        buf_id = k + j * H.nz_real;
+  #ifdef MPI_CHOLLA
+        // When there are multiple processes, check whether this slice is in
+        // your domain
+        if (minus_xslice >= nx_local_start && minus_xslice < nx_local_start + nx_local) {
+          id = cuda_utilities::compute1DIndex(minus_xslice - nx_local_start + H.n_ghost, j + H.n_ghost, k + H.n_ghost,
+                                              H.nx, H.ny);
+  #endif  // MPI_CHOLLA
+          dataset_buffer_d[buf_id] = C.density[id];
+          dataset_buffer_mx[buf_id] = C.momentum_x[id];
+          dataset_buffer_my[buf_id] = C.momentum_y[id];
+          dataset_buffer_mz[buf_id] = C.momentum_z[id];
+          dataset_buffer_E[buf_id] = C.Energy[id];
+  #ifdef DE
+          dataset_buffer_GE[buf_id] = C.GasEnergy[id];
+  #endif
+  #ifdef SCALAR
+          for (int ii = 0; ii < NSCALARS; ii++) {
+            dataset_buffer_scalar[buf_id + ii * H.ny * H.nz] = C.scalar[id + ii * H.n_cells];
+          }
+  #endif
+  #ifdef MPI_CHOLLA
+        }
+        // if the slice isn't in your domain, just write out zeros
+        else {
+          dataset_buffer_d[buf_id] = 0;
+          dataset_buffer_mx[buf_id] = 0;
+          dataset_buffer_my[buf_id] = 0;
+          dataset_buffer_mz[buf_id] = 0;
+          dataset_buffer_E[buf_id] = 0;
+  #ifdef DE
+          dataset_buffer_GE[buf_id] = 0;
+  #endif
+  #ifdef SCALAR
+          for (int ii = 0; ii < NSCALARS; ii++) {
+            dataset_buffer_scalar[buf_id + ii * H.ny * H.nz] = 0;
+          }
+  #endif
+        }
+  #endif  // MPI_CHOLLA
+      }
+    }
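+    // Note that each rank writes only its local patch of every edge plane to
+    // its own output file; ranks whose subdomain does not contain a given
+    // plane write zeros for their patch, so the full plane is reassembled in
+    // post-processing by stitching the per-rank outputs at their global offsets.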
+    // Write out the yz datasets for each variable
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_d, "/d_minus_yz");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_mx, "/mx_minus_yz");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_my, "/my_minus_yz");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_mz, "/mz_minus_yz");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_E, "/E_minus_yz");
+  #ifdef DE
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_GE, "/GE_minus_yz");
+  #endif
+  #ifdef SCALAR
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_scalar, "/scalar_minus_yz");
+  #endif
+
+    // Free the dataspace id
+    status = H5Sclose(dataspace_id);
+
+    // free the dataset buffers
+    free(dataset_buffer_d);
+    free(dataset_buffer_mx);
+    free(dataset_buffer_my);
+    free(dataset_buffer_mz);
+    free(dataset_buffer_E);
+  #ifdef DE
+    free(dataset_buffer_GE);
+  #endif
+  #ifdef SCALAR
+    free(dataset_buffer_scalar);
+  #endif
+    ////////////////////////////////////////////////////////////
+    // Create the +yz data space for the datasets
+    dims[0] = ny_dset;
+    dims[1] = nz_dset;
+    dataspace_id = H5Screate_simple(2, dims, NULL);
+
+    // allocate the memory for the +yz slices
+    dataset_buffer_d = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+    dataset_buffer_mx = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+    dataset_buffer_my = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+    dataset_buffer_mz = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+    dataset_buffer_E = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+  #ifdef DE
+    dataset_buffer_GE = (Real *)malloc(H.ny_real * H.nz_real * sizeof(Real));
+  #endif
+  #ifdef SCALAR
+    dataset_buffer_scalar = (Real *)malloc(NSCALARS * H.ny_real * H.nz_real * sizeof(Real));
+  #endif
+
+    // Copy the yz slices to the memory buffers
+    for (k = 0; k < H.nz_real; k++) {
+      for (j = 0; j < H.ny_real; j++) {
+        id = cuda_utilities::compute1DIndex(plus_xslice, j + H.n_ghost, k + H.n_ghost, H.nx, H.ny);
+        buf_id = k + j * H.nz_real;
+  #ifdef MPI_CHOLLA
+        // When there are multiple processes, check whether this slice is in
+        // your domain
+        if (plus_xslice >= nx_local_start && plus_xslice < nx_local_start + nx_local) {
+          id = cuda_utilities::compute1DIndex(plus_xslice - nx_local_start + H.n_ghost, j + H.n_ghost, k + H.n_ghost,
+                                              H.nx, H.ny);
+  #endif  // MPI_CHOLLA
+          dataset_buffer_d[buf_id] = C.density[id];
+          dataset_buffer_mx[buf_id] = C.momentum_x[id];
+          dataset_buffer_my[buf_id] = C.momentum_y[id];
+          dataset_buffer_mz[buf_id] = C.momentum_z[id];
+          dataset_buffer_E[buf_id] = C.Energy[id];
+  #ifdef DE
+          dataset_buffer_GE[buf_id] = C.GasEnergy[id];
+  #endif
+  #ifdef SCALAR
+          for (int ii = 0; ii < NSCALARS; ii++) {
+            dataset_buffer_scalar[buf_id + ii * H.ny * H.nz] = C.scalar[id + ii * H.n_cells];
+          }
+  #endif
+  #ifdef MPI_CHOLLA
+        }
+        // if the slice isn't in your domain, just write out zeros
+        else {
+          dataset_buffer_d[buf_id] = 0;
+          dataset_buffer_mx[buf_id] = 0;
+          dataset_buffer_my[buf_id] = 0;
+          dataset_buffer_mz[buf_id] = 0;
+          dataset_buffer_E[buf_id] = 0;
+  #ifdef DE
+          dataset_buffer_GE[buf_id] = 0;
+  #endif
+  #ifdef SCALAR
+          for (int ii = 0; ii < NSCALARS; ii++) {
+            dataset_buffer_scalar[buf_id + ii * H.ny * H.nz] = 0;
+          }
+  #endif
+        }
+  #endif  // MPI_CHOLLA
+      }
+    }
+    // Write out the yz datasets for each variable
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_d, "/d_plus_yz");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_mx, "/mx_plus_yz");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_my, "/my_plus_yz");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_mz, "/mz_plus_yz");
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_E, "/E_plus_yz");
+  #ifdef DE
+    status = Write_HDF5_Dataset(file_id, dataspace_id, dataset_buffer_GE, "/GE_plus_yz");
+  #endif
+  #ifdef SCALAR
+    status = Write_HDF5_Dataset(file_id,
dataspace_id, dataset_buffer_scalar, "/scalar_plus_yz"); + #endif + + // Free the dataspace id + status = H5Sclose(dataspace_id); + + // free the dataset buffers + free(dataset_buffer_d); + free(dataset_buffer_mx); + free(dataset_buffer_my); + free(dataset_buffer_mz); + free(dataset_buffer_E); + #ifdef DE + free(dataset_buffer_GE); + #endif + #ifdef SCALAR + free(dataset_buffer_scalar); + #endif + } else { + printf("Slice write only works for 3D data.\n"); + } +} +#endif // HDF5 + +/*! \fn void Read_Grid(struct Parameters P) + * \brief Read in grid data from an output file. */ +void Grid3D::Read_Grid(struct Parameters P) +{ + ScopedTimer timer("Read_Grid"); + int nfile = P.nfile; // output step you want to read from + + // create the filename to read from + // assumes your data is in the outdir specified in the input file + // strcpy(filename, P.outdir); + // Changed to read initial conditions from indir + std::string filename(P.indir); + filename += std::to_string(P.nfile); + char sbuffer[1024]; + +#if defined BINARY + filename += ".bin"; +#elif defined HDF5 + filename += ".h5"; +#endif // BINARY or HDF5 +// for now assumes you will run on the same number of processors +#ifdef MPI_CHOLLA + #ifdef TILED_INITIAL_CONDITIONS + sprintf(sbuffer, "%sics_%dMpc_%d.h5", P.indir, (int)P.tile_length / 1000, + H.nx_real); // Everyone reads the same file + filename = sbuffer; + #else // TILED_INITIAL_CONDITIONS is not defined + filename += "." + std::to_string(procID); + #endif // TILED_INITIAL_CONDITIONS +#endif // MPI_CHOLLA + +#if defined BINARY + FILE *fp; + // open the file + fp = fopen(filename, "r"); + if (!fp) { + printf("Unable to open input file.\n"); + exit(0); + } + + // read in grid data + Read_Grid_Binary(fp); + + // close the file + fclose(fp); + +#elif defined HDF5 + hid_t file_id; + herr_t status; + + // open the file + file_id = H5Fopen(filename.data(), H5F_ACC_RDONLY, H5P_DEFAULT); + if (file_id < 0) { + std::cout << "Unable to open input file: " << filename << std::endl; + exit(0); + } + + // read in grid data + Read_Grid_HDF5(file_id, P); // close the file status = H5Fclose(file_id); @@ -2635,6 +3342,9 @@ void Grid3D::Read_Grid_HDF5(hid_t file_id, struct Parameters P) // Free the dataset id status = H5Dclose(dataset_id); + #ifdef CLOUD_TRACKING + // H.velocity_x_cloud_avg = 1.606867e-01 / (KPC / TIME_UNIT); + #endif mean_l = 0; min_l = 1e65; max_l = -1; diff --git a/src/io/io.h b/src/io/io.h index d8f6ca8ca..952b32b39 100644 --- a/src/io/io.h +++ b/src/io/io.h @@ -25,6 +25,8 @@ void Output_Rotated_Projected_Data(Grid3D& G, struct Parameters P, int nfile); /* Output xy, xz, and yz slices of the grid data to file. 
*/ void Output_Slices(Grid3D& G, struct Parameters P, int nfile); +void Output_Edges(Grid3D& G, struct Parameters P, int nfile); + /* MPI-safe printf routine */ int chprintf(const char* __restrict sdata, ...); diff --git a/src/model/disk_ICs.cpp b/src/model/disk_ICs.cpp index d68f9e0c4..54f805751 100644 --- a/src/model/disk_ICs.cpp +++ b/src/model/disk_ICs.cpp @@ -839,6 +839,21 @@ void Grid3D::Disk_3D(Parameters p) Real *rho_halo = (Real *)calloc(nr, sizeof(Real)); Real *r_halo = (Real *)calloc(nr, sizeof(Real)); +#ifdef SCALAR + #ifdef DUST + // set entire grid to zero for dust density + for (k = H.n_ghost; k < H.nz - H.n_ghost; k++) { + for (j = H.n_ghost; j < H.ny - H.n_ghost; j++) { + for (i = H.n_ghost; i < H.nx - H.n_ghost; i++) { + // get cell index + id = i + j * H.nx + k * H.nx * H.ny; + C.dust_density[id] = 0.0; + } + } + } + #endif +#endif + ////////////////////////////////////////////// ////////////////////////////////////////////// // Produce a look up table for a hydrostatic hot halo @@ -846,7 +861,6 @@ void Grid3D::Disk_3D(Parameters p) ////////////////////////////////////////////// Hydrostatic_Ray_Analytical_D3D(rho_halo, r_halo, hdp, dr, nr); chprintf("Hot halo lookup table generated...\n"); - ////////////////////////////////////////////// ////////////////////////////////////////////// // Add a disk component @@ -862,7 +876,7 @@ void Grid3D::Disk_3D(Parameters p) // get the centered x, y, and z positions k = H.n_ghost + H.ny; Get_Position(i, j, k, &x_pos, &y_pos, &z_pos); - + // cylindrical radius r = sqrt(x_pos * x_pos + y_pos * y_pos); @@ -894,8 +908,13 @@ void Grid3D::Disk_3D(Parameters p) // store internal energy in Energy array C.Energy[id] = P / (gama - 1.0); - #ifdef BASIC_SCALAR - C.scalar[id] = 1.0 * C.density[id]; + #ifdef SCALAR + #ifdef BASIC_SCALAR + C.basic_scalar[id] = 1.0 * C.density[id]; + #endif + #ifdef DUST + C.dust_density[id] = C.density[id] * 1e-2; + #endif #endif } } @@ -1050,10 +1069,12 @@ void Grid3D::Disk_3D(Parameters p) C.Energy[id] += P / (gama - 1.0); // add a passive scalar +#ifdef SCALAR #ifdef BASIC_SCALAR - c = fmax(C.scalar[id] / C.density[id], 0.1); - C.scalar[id] = c * C.density[id]; + c = fmax(C.basic_scalar[id] / C.density[id], 0.1); + C.basic_scalar[id] = c * C.density[id]; #endif +#endif } } } diff --git a/src/particles/io_particles.cpp b/src/particles/io_particles.cpp index 3743dd21a..f4c8e1b7a 100644 --- a/src/particles/io_particles.cpp +++ b/src/particles/io_particles.cpp @@ -445,12 +445,12 @@ void Particles3D::Load_Particles_Data_HDF5(hid_t file_id, int nfile, struct Para Real vy_max_g = vy_max; Real vz_max_g = vz_max; - Real px_min_g = px_min; - Real py_min_g = py_min; - Real pz_min_g = pz_min; - Real vx_min_g = vx_min; - Real vy_min_g = vy_min; - Real vz_min_g = vz_min; + Real px_min_g = px_min; + Real py_min_g = py_min; + Real pz_min_g = pz_min; + Real vx_min_g = vx_min; + Real vy_min_g = vy_min; + Real vz_min_g = vz_min; #endif // MPI_CHOLLA // Print initial Statistics @@ -563,7 +563,7 @@ void Grid3D::Write_Particles_Data_HDF5(hid_t file_id) #ifdef MPI_CHOLLA N_particles_total = ReducePartIntSum(Particles.n_local); #else - N_particles_total = Particles.n_local; + N_particles_total = Particles.n_local; #endif // Print the total particles when saving the particles data diff --git a/src/particles/particles_3D.cpp b/src/particles/particles_3D.cpp index 6417e4136..87a2be8e5 100644 --- a/src/particles/particles_3D.cpp +++ b/src/particles/particles_3D.cpp @@ -157,12 +157,12 @@ void Particles3D::Initialize(struct Parameters *P, 
Grav3D &Grav, Real xbound, Re G.boundary_type_z0 = P->zlg_bcnd; G.boundary_type_z1 = P->zug_bcnd; #else - G.boundary_type_x0 = P->xl_bcnd; - G.boundary_type_x1 = P->xu_bcnd; - G.boundary_type_y0 = P->yl_bcnd; - G.boundary_type_y1 = P->yu_bcnd; - G.boundary_type_z0 = P->zl_bcnd; - G.boundary_type_z1 = P->zu_bcnd; + G.boundary_type_x0 = P->xl_bcnd; + G.boundary_type_x1 = P->xu_bcnd; + G.boundary_type_y0 = P->yl_bcnd; + G.boundary_type_y1 = P->yu_bcnd; + G.boundary_type_z0 = P->zl_bcnd; + G.boundary_type_z1 = P->zu_bcnd; #endif #ifdef PARTICLES_GPU @@ -211,7 +211,7 @@ void Particles3D::Initialize(struct Parameters *P, Grav3D &Grav, Real xbound, Re #ifdef MPI_CHOLLA n_total_initial = ReducePartIntSum(n_local); #else - n_total_initial = n_local; + n_total_initial = n_local; #endif chprintf("Particles Initialized: \n n_local: %lu \n", n_local); diff --git a/src/particles/particles_boundaries_cpu.cpp b/src/particles/particles_boundaries_cpu.cpp index 27470befe..772153534 100644 --- a/src/particles/particles_boundaries_cpu.cpp +++ b/src/particles/particles_boundaries_cpu.cpp @@ -433,13 +433,13 @@ void Particles3D::Unload_Particles_from_Buffer_CPU(int direction, int side, Real offset_extra += 1; pId = recv_buffer[offset_extra]; #else - pId = 0; + pId = 0; #endif #ifdef PARTICLE_AGE offset_extra += 1; pAge = recv_buffer[offset_extra]; #else - pAge = 0.0; + pAge = 0.0; #endif offset_buff += N_DATA_PER_PARTICLE_TRANSFER; diff --git a/src/reconstruction/plmp_cuda.cu b/src/reconstruction/plmp_cuda.cu index e8cfa0d09..4e7417268 100644 --- a/src/reconstruction/plmp_cuda.cu +++ b/src/reconstruction/plmp_cuda.cu @@ -119,7 +119,7 @@ __global__ void PLMP_cuda(Real *dev_conserved, Real *dev_bounds_L, Real *dev_bou dge = dev_conserved[(n_fields - 1) * n_cells + id]; p_i = hydro_utilities::Get_Pressure_From_DE(E, E - E_kin, dge, gamma); #else - p_i = (dev_conserved[4 * n_cells + id] - 0.5 * d_i * (vx_i * vx_i + vy_i * vy_i + vz_i * vz_i)) * (gamma - 1.0); + p_i = (dev_conserved[4 * n_cells + id] - 0.5 * d_i * (vx_i * vx_i + vy_i * vy_i + vz_i * vz_i)) * (gamma - 1.0); #endif // PRESSURE_DE p_i = fmax(p_i, (Real)TINY_NUMBER); #ifdef SCALAR diff --git a/src/reconstruction/ppmc_cuda_tests.cu b/src/reconstruction/ppmc_cuda_tests.cu index 9e9b11140..c1319ea58 100644 --- a/src/reconstruction/ppmc_cuda_tests.cu +++ b/src/reconstruction/ppmc_cuda_tests.cu @@ -139,7 +139,7 @@ TEST(tALLPpmcVLReconstructor, CorrectInputExpectCorrectOutput) #ifdef MHD size_t const n_fields = 8; #else // not MHD - size_t const n_fields = 5; + size_t const n_fields = 5; #endif // MHD // Setup host grid. 
Fill host grid with random values and randomly assign maximum value diff --git a/src/reconstruction/ppmp_cuda.cu b/src/reconstruction/ppmp_cuda.cu index 2038f215a..be125cd0c 100644 --- a/src/reconstruction/ppmp_cuda.cu +++ b/src/reconstruction/ppmp_cuda.cu @@ -166,7 +166,7 @@ __global__ void PPMP_cuda(Real *dev_conserved, Real *dev_bounds_L, Real *dev_bou dge = dev_conserved[(n_fields - 1) * n_cells + id]; p_i = hydro_utilities::Get_Pressure_From_DE(E, E - E_kin, dge, gamma); #else - p_i = (dev_conserved[4 * n_cells + id] - 0.5 * d_i * (vx_i * vx_i + vy_i * vy_i + vz_i * vz_i)) * (gamma - 1.0); + p_i = (dev_conserved[4 * n_cells + id] - 0.5 * d_i * (vx_i * vx_i + vy_i * vy_i + vz_i * vz_i)) * (gamma - 1.0); #endif // PRESSURE_DE p_i = fmax(p_i, (Real)TINY_NUMBER); #ifdef DE diff --git a/src/reconstruction/reconstruction_tests.cu b/src/reconstruction/reconstruction_tests.cu index 6c4574951..3c749b23a 100644 --- a/src/reconstruction/reconstruction_tests.cu +++ b/src/reconstruction/reconstruction_tests.cu @@ -575,9 +575,13 @@ TEST(tALLReconstructionWriteData, CorrectInputExpectCorrectOutput) { // Set up test and mock up grid #ifdef MHD - reconstruction::Primitive interface{1, 2, 3, 4, 5, 6, 7, 8}; + reconstruction::Primitive interface { + 1, 2, 3, 4, 5, 6, 7, 8 + }; #else // MHD - reconstruction::Primitive interface{6, 7, 8, 9, 10}; + reconstruction::Primitive interface { + 6, 7, 8, 9, 10 + }; #endif // MHD size_t const nx = 3, ny = 3, nz = 3; size_t const n_cells = nx * ny * nz; diff --git a/src/system_tests/hydro_system_tests.cpp b/src/system_tests/hydro_system_tests.cpp index e40f1353d..7f21b0875 100644 --- a/src/system_tests/hydro_system_tests.cpp +++ b/src/system_tests/hydro_system_tests.cpp @@ -56,8 +56,8 @@ TEST_P(tHYDROtMHDSYSTEMSodShockTubeParameterizedMpi, CorrectInputExpectCorrectOu double const maxAllowedL1Error = 7.0E-3; double const maxAllowedError = 4.6E-2; #else - double const maxAllowedL1Error = 9.4E-5; - double const maxAllowedError = 6.4E-4; + double const maxAllowedL1Error = 9.4E-5; + double const maxAllowedError = 6.4E-4; #endif // MHD sodTest.numMpiRanks = GetParam(); diff --git a/src/utils/reduction_utilities.cu b/src/utils/reduction_utilities.cu index 6434f560b..349da4ee6 100644 --- a/src/utils/reduction_utilities.cu +++ b/src/utils/reduction_utilities.cu @@ -38,4 +38,18 @@ __global__ void kernelReduceMax(Real* in, Real* out, size_t N) gridReduceMax(maxVal, out); } // ===================================================================== + +__global__ void Kernel_Reduce_Add(Real* in, Real* out, size_t N) +{ + Real sum_stride = 0; + + // Each thread strides through the grid to perform a partial reduction + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { + sum_stride += in[i]; + } + __syncthreads(); + + // Perform grid-wide reduction + Grid_Reduce_Add(sum_stride, out); +} } // namespace reduction_utilities diff --git a/src/utils/reduction_utilities.h b/src/utils/reduction_utilities.h index 99191d8c5..982d8df9c 100644 --- a/src/utils/reduction_utilities.h +++ b/src/utils/reduction_utilities.h @@ -43,6 +43,14 @@ __inline__ __device__ Real warpReduceMax(Real val) } // ===================================================================== +__inline__ __device__ Real Warp_Reduce_Add(Real val) +{ + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + val += __shfl_down(val, offset); + } + return val; +} + // ===================================================================== /*! 
* \brief Perform a reduction within the block to find the maximum value @@ -80,6 +88,33 @@ __inline__ __device__ Real blockReduceMax(Real val) } // ===================================================================== +__inline__ __device__ Real Block_Reduce_Add(Real val) +{ + // Shared memory for storing the results of each warp-wise partial + // reduction + __shared__ Real shared[::maxWarpsPerBlock]; + + int lane = threadIdx.x % warpSize; // thread ID within the warp, + int warpId = threadIdx.x / warpSize; // ID of the warp itself + + val = Warp_Reduce_Add(val); // Each warp performs partial reduction + + if (lane == 0) { + shared[warpId] = val; + } // Write reduced value to shared memory + + __syncthreads(); // Wait for all partial reductions + + // read from shared memory only if that warp existed + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; + + if (warpId == 0) { + val = Warp_Reduce_Add(val); + } // Final reduce within first warp + + return val; +} + #ifndef O_HIP // ===================================================================== // This section handles the atomics. It is complicated because CUDA @@ -235,6 +270,28 @@ inline __device__ double atomicMinBits(double* address, double val) } // ===================================================================== +// ===================================================================== +inline __device__ float Atomic_Add_Bits(float* address, float val) +{ +#ifdef O_HIP + return atomicAdd(address, val); +#else // O_HIP + return atomicAdd(address, val); +#endif +} +// ===================================================================== + +// ===================================================================== +inline __device__ double Atomic_Add_Bits(double* address, double val) +{ +#ifdef O_HIP + return atomicAdd(address, val); +#else // O_HIP + return atomicAdd(address, val); +#endif +} +// ===================================================================== + // ===================================================================== /*! * \brief Perform a reduction within the grid to find the maximum value @@ -283,6 +340,19 @@ __inline__ __device__ void gridReduceMax(Real val, Real* out) } // ===================================================================== +// ===================================================================== +__inline__ __device__ void Grid_Reduce_Add(Real val, Real* out) +{ + // Reduce the entire block in parallel + val = Block_Reduce_Add(val); + + // Write block level reduced value to the output scalar atomically + if (threadIdx.x == 0) { + Atomic_Add_Bits(out, val); + } +} +// ===================================================================== + // ===================================================================== /*! * \brief Find the maximum value in the array. 
Make sure to initialize
@@ -302,5 +372,7 @@ __inline__ __device__ void gridReduceMax(Real val, Real* out)
  * \param[in] N The size of the `in` array
  */
 __global__ void kernelReduceMax(Real* in, Real* out, size_t N);
+
+__global__ void Kernel_Reduce_Add(Real* in, Real* out, size_t N);
 // =====================================================================
 }  // namespace reduction_utilities
diff --git a/src/utils/reduction_utilities_tests.cu b/src/utils/reduction_utilities_tests.cu
index 528de22a7..cd1eb5379 100644
--- a/src/utils/reduction_utilities_tests.cu
+++ b/src/utils/reduction_utilities_tests.cu
@@ -35,8 +35,7 @@ TEST(tALLKernelReduceMax, CorrectInputExpectCorrectOutput)
   // ====================================
   size_t const gridSize = 64;
   size_t const size = std::pow(gridSize, 3);
-  ;
-  Real const maxValue = 4;
+  Real const maxValue = 4;
   std::vector<Real> host_grid(size);

   // Fill grid with random values and assign maximum value
@@ -68,3 +67,41 @@
 // =============================================================================
 // Tests for divergence max reduction
 // =============================================================================
+
+TEST(tALLKernelReduceSum, CorrectInputExpectCorrectOutput)
+{
+  // Launch parameters
+  // =================
+  cuda_utilities::AutomaticLaunchParams static const launch_params(reduction_utilities::Kernel_Reduce_Add);
+
+  // Grid Parameters & testing parameters
+  // ====================================
+  size_t const size = std::pow(64, 3);
+  std::vector<Real> host_grid(size);  // host copy of the array to be summed
+  std::vector<Real> host_sum(1);      // holds the result of the sum reduction
+
+  host_sum[0] = 0.0;
+  // Fill grid with ones
+  for (Real& host_data : host_grid) {
+    host_data = 1;
+  }
+  // Allocating and copying to device
+  // ================================
+  cuda_utilities::DeviceVector<Real> dev_grid(host_grid.size());
+  dev_grid.cpyHostToDevice(host_grid);
+
+  cuda_utilities::DeviceVector<Real> static dev_sum(1);
+  dev_sum.assign(0, 0);
+
+  // Do the reduction
+  // ================
+  // .data() passes a raw device pointer to the kernel
+  hipLaunchKernelGGL(reduction_utilities::Kernel_Reduce_Add, launch_params.numBlocks, launch_params.threadsPerBlock, 0,
+                     0, dev_grid.data(), dev_sum.data(), host_grid.size());
+  cudaDeviceSynchronize();
+
+  dev_sum.cpyDeviceToHost(host_sum);
+
+  // Perform comparison
+  testing_utilities::Check_Results(size, host_sum[0], "sum found");
+}
\ No newline at end of file
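A note on the two Atomic_Add_Bits overloads above: both preprocessor branches are identical because, unlike the min/max reductions, addition needs no bit-reinterpretation trick. atomicAdd is provided natively for float on all supported hardware, and for double by the HIP API and by NVIDIA devices of compute capability 6.0 and newer. If support for older (pre-Pascal) NVIDIA GPUs were ever required, the standard compare-and-swap fallback from the CUDA C Programming Guide could be substituted. The sketch below is illustrative only; the name atomicAdd_CAS is hypothetical and not part of this patch:

  // Sketch: software double-precision atomic add for devices without native
  // support (compute capability < 6.0), adapted from the CUDA C Programming
  // Guide. Not part of this patch.
  __device__ double atomicAdd_CAS(double *address, double val)
  {
    unsigned long long int *address_as_ull = (unsigned long long int *)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
      assumed = old;
      // Add in double precision, then attempt to swap the new bit pattern in;
      // loop again if another thread updated the value first.
      old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
  }

One caveat worth keeping in mind for tALLKernelReduceSum: atomic floating-point addition makes the summation order nondeterministic across blocks, so general inputs are reproducible only to rounding error. The all-ones test grid sidesteps this, since every partial sum up to 64^3 = 262144 is exactly representable in double precision, which is why comparing against the exact count is safe here.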