diff --git a/exec/compressible_stag/GNUmakefile b/exec/compressible_stag/GNUmakefile
index 24dfa949..123e6ad1 100644
--- a/exec/compressible_stag/GNUmakefile
+++ b/exec/compressible_stag/GNUmakefile
@@ -14,7 +14,20 @@ MAX_SPEC = 8
 MAX_REAC = 5
 USE_PARTICLES = FALSE
-DO_TURB = FALSE
+DO_TURB = FALSE
+
+USE_HEFFTE_FFTW   = FALSE
+USE_HEFFTE_CUFFT  = FALSE
+USE_HEFFTE_ROCFFT = FALSE
+USE_DISTRIBUTED_FFT = TRUE
+
+ifeq ($(USE_HEFFTE_FFTW),TRUE)
+  HEFFTE_HOME ?= ../../../heffte/
+else ifeq ($(USE_HEFFTE_CUFFT),TRUE)
+  HEFFTE_HOME ?= ../../../heffte-org/build_aware/
+else ifeq ($(USE_HEFFTE_ROCFFT),TRUE)
+  HEFFTE_HOME ?= ../../../heffte-org/build_noaware/
+endif

 include $(AMREX_HOME)/Tools/GNUMake/Make.defs

@@ -41,23 +54,35 @@ include ../../src_rng/Make.package
 VPATH_LOCATIONS   += ../../src_rng/
 INCLUDE_LOCATIONS += ../../src_rng/
-
 include ../../src_common/Make.package
 VPATH_LOCATIONS   += ../../src_common/
 INCLUDE_LOCATIONS += ../../src_common/

+#ifeq ($(USE_HEFFTE_FFTW),TRUE)
+#  include $(HEFFTE_HOME)/src/Make.package
+#else ifeq ($(USE_HEFFTE_CUFFT),TRUE)
+#  include $(HEFFTE_HOME)/src/Make.package
+#else ifeq ($(USE_HEFFTE_ROCFFT),TRUE)
+#  include $(HEFFTE_HOME)/src/Make.package
+#endif
+
+ifeq ($(USE_HEFFTE_FFTW),TRUE)
+  DEFINES   += -DHEFFTE_FFTW
+  LIBRARIES += -L$(FFTW_DIR) -lfftw3_mpi -lfftw3 -lfftw3f
+else ifeq ($(USE_HEFFTE_CUFFT),TRUE)
+  DEFINES += -DHEFFTE_CUFFT
+else ifeq ($(USE_HEFFTE_ROCFFT),TRUE)
+  DEFINES += -DHEFFTE_ROCFFT
+endif
+
 include $(AMREX_HOME)/Src/Base/Make.package

 include ../../src_analysis/Make.package
 VPATH_LOCATIONS   += ../../src_analysis/
 INCLUDE_LOCATIONS += ../../src_analysis/
-
+
 include $(AMREX_HOME)/Tools/GNUMake/Make.rules

-ifeq ($(findstring cgpu, $(HOST)), cgpu)
-  CXXFLAGS += $(FFTW)
-endif
-
 ifeq ($(USE_CUDA),TRUE)
   LIBRARIES += -lcufft
 else ifeq ($(USE_HIP),TRUE)
@@ -66,7 +91,7 @@ else ifeq ($(USE_HIP),TRUE)
   LIBRARY_LOCATIONS += $(ROC_PATH)/rocfft/lib
   LIBRARIES += -L$(ROC_PATH)/rocfft/lib -lrocfft
 else
-  LIBRARIES += -L$(FFTW_DIR) -lfftw3_mpi -lfftw3
+  LIBRARIES += -L$(FFTW_DIR) -lfftw3_mpi -lfftw3 -lfftw3f
 endif

 ifeq ($(DO_TURB), TRUE)
diff --git a/exec/compressible_stag/SPECTRAL_FILTER/GNUmakefile b/exec/compressible_stag/SPECTRAL_FILTER/GNUmakefile
new file mode 100644
index 00000000..052b3851
--- /dev/null
+++ b/exec/compressible_stag/SPECTRAL_FILTER/GNUmakefile
@@ -0,0 +1,69 @@
+AMREX_HOME ?= ../../../../amrex/
+
+DEBUG        = FALSE
+USE_MPI      = TRUE
+USE_OMP      = FALSE
+USE_CUDA     = FALSE
+USE_HIP      = FALSE
+COMP         = gnu
+DIM          = 3
+TINY_PROFILE = FALSE
+
+USE_HEFFTE_FFTW   = FALSE
+USE_HEFFTE_CUFFT  = FALSE
+USE_HEFFTE_ROCFFT = FALSE
+
+ifeq ($(USE_HEFFTE_FFTW),TRUE)
+  HEFFTE_HOME ?= ../../../../heffte/
+else ifeq ($(USE_HEFFTE_CUFFT),TRUE)
+  HEFFTE_HOME ?= ../../../../heffte-org/build_aware/
+else ifeq ($(USE_HEFFTE_ROCFFT),TRUE)
+  HEFFTE_HOME ?= ../../../../heffte/
+endif
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+VPATH_LOCATIONS   += .
+INCLUDE_LOCATIONS += .
+
+#ifeq ($(USE_HEFFTE_FFTW),TRUE)
+#  include $(HEFFTE_HOME)/src/Make.package
+#else ifeq ($(USE_HEFFTE_CUFFT),TRUE)
+#  include $(HEFFTE_HOME)/src/Make.package
+#else ifeq ($(USE_HEFFTE_ROCFFT),TRUE)
+#  include $(HEFFTE_HOME)/src/Make.package
+#endif
+
+include ./Make.package
+ifeq ($(USE_HEFFTE_FFTW),TRUE)
+  DEFINES   += -DHEFFTE_FFTW
+  LIBRARIES += -L$(FFTW_DIR) -lfftw3_mpi -lfftw3 -lfftw3f
+else ifeq ($(USE_HEFFTE_CUFFT),TRUE)
+  DEFINES += -DHEFFTE_CUFFT
+  VPATH_LOCATIONS   += $(HEFFTE_HOME)/include
+  INCLUDE_LOCATIONS += $(HEFFTE_HOME)/include
+  LIBRARY_LOCATIONS += $(HEFFTE_HOME)/lib
+  LIBRARIES += -lheffte
+else ifeq ($(USE_HEFFTE_ROCFFT),TRUE)
+  DEFINES += -DHEFFTE_ROCFFT
+endif
+
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
+
+ifeq ($(USE_CUDA),TRUE)
+  LIBRARIES += -lcufft
+else ifeq ($(USE_HIP),TRUE)
+  # Use rocFFT.  ROC_PATH is defined in amrex
+  INCLUDE_LOCATIONS += $(ROC_PATH)/rocfft/include
+  LIBRARY_LOCATIONS += $(ROC_PATH)/rocfft/lib
+  LIBRARIES += -L$(ROC_PATH)/rocfft/lib -lrocfft
+else
+  LIBRARIES += -L$(FFTW_DIR) -lfftw3_mpi -lfftw3 -lfftw3f
+endif
+
+ifeq ($(DO_TURB), TRUE)
+  DEFINES += -DTURB
+endif
+
diff --git a/exec/compressible_stag/SPECTRAL_FILTER/Make.package b/exec/compressible_stag/SPECTRAL_FILTER/Make.package
new file mode 100644
index 00000000..e0391922
--- /dev/null
+++ b/exec/compressible_stag/SPECTRAL_FILTER/Make.package
@@ -0,0 +1,5 @@
+CEXE_sources += main.cpp
+CEXE_sources += main_driver.cpp
+CEXE_sources += spectral_functions.cpp
+
+CEXE_headers += spectral_functions.H
diff --git a/exec/compressible_stag/SPECTRAL_FILTER/build_frontier.sh b/exec/compressible_stag/SPECTRAL_FILTER/build_frontier.sh
new file mode 100755
index 00000000..dcb05e97
--- /dev/null
+++ b/exec/compressible_stag/SPECTRAL_FILTER/build_frontier.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/bash
+
+## load necessary modules
+module load craype-accel-amd-gfx90a
+module load amd-mixed
+#module load rocm/5.2.0  # waiting for 5.6 for next bump
+module load cray-mpich/8.1.23
+module load cce/15.0.0  # must be loaded after rocm
+
+# GPU-aware MPI
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+# optimize HIP compilation for MI250X
+export AMREX_AMD_ARCH=gfx90a
+
+# compiler environment hints
+##export CC=$(which hipcc)
+##export CXX=$(which hipcc)
+##export FC=$(which ftn)
+##export CFLAGS="-I${ROCM_PATH}/include"
+##export CXXFLAGS="-I${ROCM_PATH}/include -Wno-pass-failed"
+##export LDFLAGS="-L${ROCM_PATH}/lib -lamdhip64 ${PE_MPICH_GTL_DIR_amd_gfx90a} -lmpi_gtl_hsa"
+export LDFLAGS="-L${MPICH_DIR}/lib -lmpi ${CRAY_XPMEM_POST_LINK_OPTS} -lxpmem ${PE_MPICH_GTL_DIR_amd_gfx90a} ${PE_MPICH_GTL_LIBS_amd_gfx90a}"
+export CXXFLAGS="-I${MPICH_DIR}/include"
+export HIPFLAGS="--amdgpu-target=gfx90a"
+
+make -j10 USE_HIP=TRUE USE_HEFFTE_ROCFFT=TRUE USE_ASSERTION=TRUE
diff --git a/exec/compressible_stag/SPECTRAL_FILTER/build_perlmutter.sh b/exec/compressible_stag/SPECTRAL_FILTER/build_perlmutter.sh
new file mode 100755
index 00000000..e3bd5aac
--- /dev/null
+++ b/exec/compressible_stag/SPECTRAL_FILTER/build_perlmutter.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/bash
+
+# required dependencies
+module load cray-fftw
+module load cmake
+module load cudatoolkit
+
+module list
+
+# necessary to use CUDA-Aware MPI and run a job
+export CRAY_ACCEL_TARGET=nvidia80
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+# optimize CUDA compilation for A100
+export AMREX_CUDA_ARCH=8.0
+
+# optimize CPU microarchitecture for AMD EPYC 3rd Gen (Milan/Zen3)
+# note: the cc/CC/ftn wrappers below add those
CXXFLAGS="-march=znver3" +export CFLAGS="-march=znver3" + +# compiler environment hints +export CC=cc +export CXX=CC +export FC=ftn +export CUDACXX=$(which nvcc) +export CUDAHOSTCXX=CC + +make -j10 USE_CUDA=TRUE USE_HEFFTE_CUFFT=TRUE USE_ASSERTION=TRUE MAX_SPEC=2 diff --git a/exec/compressible_stag/SPECTRAL_FILTER/main.cpp b/exec/compressible_stag/SPECTRAL_FILTER/main.cpp new file mode 100644 index 00000000..95c149e5 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/main.cpp @@ -0,0 +1,28 @@ +#include +//#include + +// function declaration +void main_driver (const char* argv); + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc,argv); + +// this specific part has been moved to Flagellum/main_driver.cpp +// { +// amrex::ParmParse pp("particles"); +//#ifdef AMREX_USE_GPU +// bool particles_do_tiling = true; +//#else +// bool particles_do_tiling = false; +//#endif +// pp.queryAdd("do_tiling", particles_do_tiling); +// } + + // argv[1] contains the name of the inputs file entered at the command line + main_driver(argv[1]); + + amrex::Finalize(); + + return 0; +} diff --git a/exec/compressible_stag/SPECTRAL_FILTER/main_driver.cpp b/exec/compressible_stag/SPECTRAL_FILTER/main_driver.cpp new file mode 100644 index 00000000..7db4b16d --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/main_driver.cpp @@ -0,0 +1,450 @@ +#include "spectral_functions.H" +#include +#include +#include "AMReX_ParmParse.H" + +#include "chrono" + +using namespace std::chrono; +using namespace amrex; + +// argv contains the name of the inputs file entered at the command line +void main_driver(const char* argv) +{ + BL_PROFILE_VAR("main_driver()",main_driver); + + amrex::Vector nodal_flag_dir; + amrex::IntVect nodal_flag_x; + amrex::IntVect nodal_flag_y; + amrex::IntVect nodal_flag_z; + nodal_flag_dir.resize(3); + + for (int i=0; i<3; ++i) { + nodal_flag_x[i] = int(i==0); + nodal_flag_y[i] = int(i==1); + nodal_flag_z[i] = int(i==2); + AMREX_D_TERM(nodal_flag_dir[0][i] = nodal_flag_x[i];, + nodal_flag_dir[1][i] = nodal_flag_y[i];, + nodal_flag_dir[2][i] = nodal_flag_z[i];); + } + + // timer + Real ts1 = ParallelDescriptor::second(); + + std::string inputs_file = argv; + + ParmParse pp; + amrex::Vector temp_real(3,0.); + amrex::Vector temp_int (3,0 ); + + amrex::Vector max_grid_size(3,1 ); + amrex::Vector n_cells(3,0 ); + amrex::Vector prob_lo(3,0 ); + amrex::Vector prob_hi(3,0 ); + + if (pp.queryarr("n_cells",temp_int,0,3)) { + for (int i=0; i<3; ++i) { + n_cells[i] = temp_int[i]; + } + } + int npts = n_cells[0]*n_cells[1]*n_cells[2]; + if (pp.queryarr("prob_lo",temp_real,0,3)) { + for (int i=0; i<3; ++i) { + prob_lo[i] = temp_real[i]; + } + } + if (pp.queryarr("prob_hi",temp_real,0,3)) { + for (int i=0; i<3; ++i) { + prob_hi[i] = temp_real[i]; + } + } + pp.queryarr("max_grid_size",max_grid_size,0,3); + + int restart; + pp.query("restart",restart); + + int nprimvars; + pp.query("nprimvars",nprimvars); + + int plot_filter = 0; + pp.query("plot_filter",plot_filter); + + amrex::IntVect ngc; + for (int i=0; i<3; ++i) { + ngc[i] = 1; // number of ghost cells + } + if (pp.queryarr("ngc",temp_int,0,3)) { + for (int i=0; i<3; ++i) { + ngc[i] = temp_int[i]; + } + } + + amrex::Real kmin; + pp.query("kmin",kmin); + + amrex::Real kmax; + pp.query("kmax",kmax); + + std::array< MultiFab, 3 > vel; + MultiFab prim; + + // make BoxArray and Geometry + BoxArray ba; + Geometry geom; + DistributionMapping dmap; + + IntVect dom_lo(AMREX_D_DECL( 0, 0, 0)); + IntVect dom_hi(AMREX_D_DECL(n_cells[0]-1, n_cells[1]-1, 
n_cells[2]-1)); + Box domain(dom_lo, dom_hi); + + // This defines the physical box, [-1,1] in each direction. + RealBox real_box({AMREX_D_DECL(prob_lo[0],prob_lo[1],prob_lo[2])}, + {AMREX_D_DECL(prob_hi[0],prob_hi[1],prob_hi[2])}); + + // This defines a Geometry object + Vector is_periodic(3,1); // force to be periodic -- can change later + geom.define(domain,&real_box,CoordSys::cartesian,is_periodic.data()); + + const GpuArray dx = geom.CellSizeArray(); + const RealBox& realDomain = geom.ProbDomain(); + + SpectralReadCheckPoint(geom, domain, prim, vel, ba, dmap, n_cells, nprimvars, max_grid_size, ngc, restart); + + MultiFab MFTurbScalar; + MultiFab MFTurbVel; + MultiFab vel_decomp_filter_heffte; + MultiFab scalar_filter_heffte; + MFTurbVel.define(ba, dmap, 3, 0); + MFTurbScalar.define(ba, dmap, 1, 0); + vel_decomp_filter_heffte.define(ba, dmap, 9, 0); + scalar_filter_heffte.define(ba, dmap, 1, 0); + vel_decomp_filter_heffte.setVal(0.0); + scalar_filter_heffte.setVal(0.0); + + // Set BC: 1) fill boundary 2) physical + for (int d=0; d<3; d++) { + vel[d].FillBoundary(geom.periodicity()); + } + prim.FillBoundary(geom.periodicity()); + + for(int d=0; d<3; d++) { + ShiftFaceToCC(vel[d], 0, MFTurbVel, d, 1); + } + MultiFab::Copy(MFTurbScalar, prim, 0, 0, 1, 0); + + SpectralVelDecomp(MFTurbVel, vel_decomp_filter_heffte, kmin, kmax, geom, n_cells); + SpectralScalarDecomp(MFTurbScalar, scalar_filter_heffte, kmin, kmax, geom, n_cells); + + MultiFab vel_decomp_filter; + MultiFab scalar_filter; + vel_decomp_filter.define(ba, dmap, 9, 2); + scalar_filter.define(ba, dmap, 1, 2); + MultiFab::Copy(vel_decomp_filter,vel_decomp_filter_heffte,0,0,9,0); + MultiFab::Copy(scalar_filter,scalar_filter_heffte,0,0,1,0); + vel_decomp_filter.FillBoundary(geom.periodicity()); + scalar_filter.FillBoundary(geom.periodicity()); + + if (plot_filter) SpectralWritePlotFile(restart, kmin, kmax, geom, vel_decomp_filter, scalar_filter, MFTurbVel, MFTurbScalar); + + // Turbulence Diagnostics + Real u_rms, u_rms_s, u_rms_d, delta_u_rms; + Real taylor_len, taylor_Re_eta; + Real skew, skew_s, skew_d, kurt, kurt_s, kurt_d; + Vector var(9, 0.0); + Real skew_vort, kurt_vort, skew_div, kurt_div; + { + Vector dProb(3); + dProb[0] = 1.0/((n_cells[0]+1)*n_cells[1]*n_cells[2]); + dProb[1] = 1.0/((n_cells[1]+1)*n_cells[2]*n_cells[0]); + dProb[2] = 1.0/((n_cells[2]+1)*n_cells[0]*n_cells[1]); + + // Setup temp MultiFabs + std::array< MultiFab, 3 > gradU; + std::array< MultiFab, 3 > faceTemp; + MultiFab sound_speed; + MultiFab ccTemp; + MultiFab ccTempA; + AMREX_D_TERM(gradU[0].define(convert(prim.boxArray(),nodal_flag_x), prim.DistributionMap(), 9, 0);, + gradU[1].define(convert(prim.boxArray(),nodal_flag_y), prim.DistributionMap(), 9, 0);, + gradU[2].define(convert(prim.boxArray(),nodal_flag_z), prim.DistributionMap(), 9, 0);); + AMREX_D_TERM(faceTemp[0].define(convert(prim.boxArray(),nodal_flag_x), prim.DistributionMap(), 1, 0);, + faceTemp[1].define(convert(prim.boxArray(),nodal_flag_y), prim.DistributionMap(), 1, 0);, + faceTemp[2].define(convert(prim.boxArray(),nodal_flag_z), prim.DistributionMap(), 1, 0);); + sound_speed.define(prim.boxArray(),prim.DistributionMap(),1,0); + ccTemp.define(prim.boxArray(),prim.DistributionMap(),1,0); + ccTempA.define(prim.boxArray(),prim.DistributionMap(),1,0); + + // Setup temp variables + Vector gradU2_temp(3); + Vector gradU2(3); + Vector gradU3(3); + Vector gradU4(3); + Vector gradU2_s(3); + Vector gradU3_s(3); + Vector gradU4_s(3); + Vector gradU2_d(3); + Vector gradU3_d(3); + Vector gradU4_d(3); + 
+        Vector<int> comps  {0,1,2};
+        Vector<int> comps_s{3,4,5};
+        Vector<int> comps_d{6,7,8};
+
+        // turbulent kinetic energy (total)
+        ccTemp.setVal(0.0);
+        MultiFab::AddProduct(ccTemp,vel_decomp_filter,0,vel_decomp_filter,0,0,1,0); //uu
+        MultiFab::AddProduct(ccTemp,vel_decomp_filter,1,vel_decomp_filter,1,0,1,0); //vv
+        MultiFab::AddProduct(ccTemp,vel_decomp_filter,2,vel_decomp_filter,2,0,1,0); //ww
+        u_rms = ccTemp.sum(0)/npts;
+        u_rms = sqrt(u_rms/3.0);
+        MultiFab::Multiply(ccTemp,prim,0,0,1,0); // rho*(uu+vv+ww)
+
+        // turbulent kinetic energy (solenoidal)
+        ccTemp.setVal(0.0);
+        MultiFab::AddProduct(ccTemp,vel_decomp_filter,3,vel_decomp_filter,3,0,1,0); //uu
+        MultiFab::AddProduct(ccTemp,vel_decomp_filter,4,vel_decomp_filter,4,0,1,0); //vv
+        MultiFab::AddProduct(ccTemp,vel_decomp_filter,5,vel_decomp_filter,5,0,1,0); //ww
+        u_rms_s = ccTemp.sum(0)/npts;
+        u_rms_s = sqrt(u_rms_s/3.0);
+        MultiFab::Multiply(ccTemp,prim,0,0,1,0); // rho*(uu+vv+ww)
+
+        // turbulent kinetic energy (dilatational)
+        ccTemp.setVal(0.0);
+        MultiFab::AddProduct(ccTemp,vel_decomp_filter,6,vel_decomp_filter,6,0,1,0); //uu
+        MultiFab::AddProduct(ccTemp,vel_decomp_filter,7,vel_decomp_filter,7,0,1,0); //vv
+        MultiFab::AddProduct(ccTemp,vel_decomp_filter,8,vel_decomp_filter,8,0,1,0); //ww
+        u_rms_d = ccTemp.sum(0)/npts;
+        u_rms_d = sqrt(u_rms_d/3.0);
+        MultiFab::Multiply(ccTemp,prim,0,0,1,0); // rho*(uu+vv+ww)
+
+        // ratio of turbulent kinetic energies
+        delta_u_rms = u_rms_d/u_rms_s;
+
+        // compute gradU = [du/dx dv/dy dw/dz] at cell-centers
+        ComputeGrad(vel_decomp_filter,gradU,0,0,9,-1,geom,0);
+
+        // Compute velocity gradient moment sum
+        // 2nd moment (total)
+        FCMoments(gradU,comps,faceTemp,2,gradU2_temp);
+        gradU2[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity()));
+        gradU2[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity()));
+        gradU2[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity()));
+        ccTemp.setVal(0.0);
+        ccTempA.setVal(0.0);
+        ShiftFaceToCC(faceTemp[0],0,ccTempA,0,1);
+        MultiFab::Add(ccTemp,ccTempA,0,0,1,0);
+        ShiftFaceToCC(faceTemp[1],0,ccTempA,0,1);
+        MultiFab::Add(ccTemp,ccTempA,0,0,1,0);
+        ShiftFaceToCC(faceTemp[2],0,ccTempA,0,1);
+        MultiFab::Add(ccTemp,ccTempA,0,0,1,0);
+        Real avg_mom2 = ccTemp.sum(0)/npts;
+        // 2nd moment (solenoidal)
+        FCMoments(gradU,comps_s,faceTemp,2,gradU2_temp);
+        gradU2_s[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity()));
+        gradU2_s[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity()));
+        gradU2_s[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity()));
+        // 2nd moment (dilatational)
+        FCMoments(gradU,comps_d,faceTemp,2,gradU2_temp);
+        gradU2_d[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity()));
+        gradU2_d[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity()));
+        gradU2_d[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity()));
+
+        // Taylor Mach
+        //ComputeSoundSpeed(sound_speed,prim,2);
+        //Real c_speed = sound_speed.sum(0)/npts;
+        Real rho_avg = prim.sum(0)/npts;
+        // Taylor Ma
+        //taylor_Ma = sqrt(3.0)*u_rms/c_speed;
+        // Taylor Microscale
+        taylor_len = sqrt(3.0)*u_rms/sqrt(avg_mom2); // from Wang et al., JFM, 2012
+        taylor_Re_eta = rho_avg*taylor_len*u_rms; // from John, Donzis, Sreenivasan, PRL 2019
+
+        // Compute velocity gradient moment sum
+        // 3rd moment (total)
+        FCMoments(gradU,comps,faceTemp,3,gradU2_temp);
+        gradU3[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity()));
+        gradU3[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity()));
+        gradU3[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity()));
+        // 3rd moment (solenoidal)
+        FCMoments(gradU,comps_s,faceTemp,3,gradU2_temp);
+        gradU3_s[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity()));
+        gradU3_s[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity()));
+        gradU3_s[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity()));
+        // 3rd moment (dilatational)
+        FCMoments(gradU,comps_d,faceTemp,3,gradU2_temp);
+        gradU3_d[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity()));
+        gradU3_d[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity()));
+        gradU3_d[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity()));
+
+        // Compute velocity gradient moment sum
+        // 4th moment (total)
+        FCMoments(gradU,comps,faceTemp,4,gradU2_temp);
+        gradU4[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity()));
+        gradU4[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity()));
+        gradU4[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity()));
+        // 4th moment (solenoidal)
+        FCMoments(gradU,comps_s,faceTemp,4,gradU2_temp);
+        gradU4_s[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity()));
+        gradU4_s[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity()));
+        gradU4_s[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity()));
+        // 4th moment (dilatational)
+        FCMoments(gradU,comps_d,faceTemp,4,gradU2_temp);
+        gradU4_d[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity()));
+        gradU4_d[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity()));
+        gradU4_d[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity()));
+
+        // Skewness
+        // <\sum_i (du_i/dx_i)^3> / (\sum_i <(du_i/dx_i)^2>^1.5)
+        skew   = (gradU3[0] + gradU3[1] + gradU3[2])/
+                 (pow(gradU2[0],1.5) + pow(gradU2[1],1.5) + pow(gradU2[2],1.5));
+        skew_s = (gradU3_s[0] + gradU3_s[1] + gradU3_s[2])/
+                 (pow(gradU2_s[0],1.5) + pow(gradU2_s[1],1.5) + pow(gradU2_s[2],1.5));
+        skew_d = (gradU3_d[0] + gradU3_d[1] + gradU3_d[2])/
+                 (pow(gradU2_d[0],1.5) + pow(gradU2_d[1],1.5) + pow(gradU2_d[2],1.5));
+
+        // Kurtosis
+        // <\sum_i (du_i/dx_i)^4> / (\sum_i <(du_i/dx_i)^2>^2)
+        kurt   = (gradU4[0] + gradU4[1] + gradU4[2])/
+                 (pow(gradU2[0],2.0) + pow(gradU2[1],2.0) + pow(gradU2[2],2.0));
+        kurt_s = (gradU4_s[0] + gradU4_s[1] + gradU4_s[2])/
+                 (pow(gradU2_s[0],2.0) + pow(gradU2_s[1],2.0) + pow(gradU2_s[2],2.0));
+        kurt_d = (gradU4_d[0] + gradU4_d[1] + gradU4_d[2])/
+                 (pow(gradU2_d[0],2.0) + pow(gradU2_d[1],2.0) + pow(gradU2_d[2],2.0));
+
+        // velocity variances
+        for (int i=0;i<9;++i) {
+            ccTemp.setVal(0.0);
+            MultiFab::AddProduct(ccTemp,vel_decomp_filter,i,vel_decomp_filter,i,0,1,0);
+            Real mean  = vel_decomp_filter.sum(i)/npts;
+            Real mean2 = ccTemp.sum(0)/npts;
+            var[i] = mean2 - mean*mean;
+        }
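The skewness and kurtosis assembled above are ratios of sums of 2nd/3rd/4th powers of the velocity-gradient components. A minimal single-variable sketch of the same standardized moments, for reference only (not part of the patch):

    #include <cmath>
    #include <vector>

    // skew = <x^3> / <x^2>^1.5 and kurt = <x^4> / <x^2>^2: the per-direction
    // building blocks that the driver combines across the three directions.
    void standardized_moments (const std::vector<double>& x, double& skew, double& kurt)
    {
        double m2 = 0., m3 = 0., m4 = 0.;
        for (double v : x) { m2 += v*v; m3 += v*v*v; m4 += v*v*v*v; }
        const double n = double(x.size());
        m2 /= n; m3 /= n; m4 /= n;
        skew = m3 / std::pow(m2, 1.5);
        kurt = m4 / (m2*m2);
    }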
+
+        // skewness and kurtosis of velocity vorticity and divergence
+        MultiFab vel_stats;
+        vel_stats.define(prim.boxArray(),prim.DistributionMap(),4,0); // div, w1, w2, w3
+        for ( MFIter mfi(vel_stats,TilingIfNotGPU()); mfi.isValid(); ++mfi ) {
+            const Box& bx = mfi.tilebox();
+            const Array4<const Real>& v_decomp = vel_decomp_filter.array(mfi);
+            const Array4<      Real>& v_stats  = vel_stats.array(mfi);
+            amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+            {
+                // divergence
+                v_stats(i,j,k,0) = 0.5*( (v_decomp(i+1,j,k,0) - v_decomp(i-1,j,k,0))/dx[0] +
+                                         (v_decomp(i,j+1,k,1) - v_decomp(i,j-1,k,1))/dx[1] +
+                                         (v_decomp(i,j,k+1,2) - v_decomp(i,j,k-1,2))/dx[2] );
+
+                // curl w1 = u_2,1 - u_1,2
+                v_stats(i,j,k,1) = 0.5*( (v_decomp(i+1,j,k,1) - v_decomp(i-1,j,k,1))/dx[0] -
+                                         (v_decomp(i,j+1,k,0) - v_decomp(i,j-1,k,0))/dx[1] );
+
+                // curl w2 = u_1,3 - u_3,1
+                v_stats(i,j,k,2) = 0.5*( (v_decomp(i,j,k+1,0) - v_decomp(i,j,k-1,0))/dx[2] -
+                                         (v_decomp(i+1,j,k,2) - v_decomp(i-1,j,k,2))/dx[0] );
+
+                // curl w3 = u_3,2 - u_2,3
+                v_stats(i,j,k,3) = 0.5*( (v_decomp(i,j+1,k,2) - v_decomp(i,j-1,k,2))/dx[1] -
+                                         (v_decomp(i,j,k+1,1) - v_decomp(i,j,k-1,1))/dx[2] );
+
+            });
+        }
+        // compute spatial mean
+        Real mean_div = vel_stats.sum(0) / (npts);
+        Real mean_w1  = vel_stats.sum(1) / (npts);
+        Real mean_w2  = vel_stats.sum(2) / (npts);
+        Real mean_w3  = vel_stats.sum(3) / (npts);
+        vel_stats.plus(-1.0*mean_div, 0, 1);
+        vel_stats.plus(-1.0*mean_w1,  1, 1);
+        vel_stats.plus(-1.0*mean_w2,  2, 1);
+        vel_stats.plus(-1.0*mean_w3,  3, 1);
+
+        Vector<Real> U2(4);
+        Vector<Real> U3(4);
+        Vector<Real> U4(4);
+        for (int i=0;i<4;++i) {
+            CCMoments(vel_stats,i,ccTempA,2,U2[i]);
+            CCMoments(vel_stats,i,ccTempA,3,U3[i]);
+            CCMoments(vel_stats,i,ccTempA,4,U4[i]);
+        }
+        skew_div = U3[0]/pow(U2[0],1.5);
+        kurt_div = U4[0]/pow(U2[0],2.0);
+        skew_vort = (U3[1] + U3[2] + U3[3])/
+                    (pow(U2[1],1.5) + pow(U2[2],1.5) + pow(U2[3],1.5));
+        kurt_vort = (U4[1] + U4[2] + U4[3])/
+                    (pow(U2[1],2.0) + pow(U2[2],2.0) + pow(U2[3],2.0));
+
+    }
+    std::string turbfilename = amrex::Concatenate("turbstats_filtered_",restart,9);
+    std::ostringstream os;
+    os << std::setprecision(3) << kmin;
+    turbfilename += os.str();
+    turbfilename += "_";
+    std::ostringstream oss;
+    oss << std::setprecision(3) << kmax;
+    turbfilename += oss.str();
+
+    std::ofstream turboutfile;
+    if (ParallelDescriptor::IOProcessor()) {
+        turboutfile.open(turbfilename, std::ios::app);
+    }
+    if (ParallelDescriptor::IOProcessor()) {
+        turboutfile << "u_rms " << "u_rms_s " << "u_rms_d " << "delta_u_rms "
+                    << "TaylorLen " << "TaylorRe*Eta "
+                    << "skew " << "skew_s " << "skew_d "
+                    << "kurt " << "kurt_s " << "kurt_d "
+                    << "var_ux " << "var_uy " << "var_uz "
+                    << "var_uxs " << "var_uys " << "var_uzs "
+                    << "var_uxd " << "var_uyd " << "var_uzd "
+                    << "skew_div " << "kurt_div "
+                    << "skew_vort " << "kurt_vort "
+                    << std::endl;
+
+        turboutfile << u_rms << " ";
+        turboutfile << u_rms_s << " ";
+        turboutfile << u_rms_d << " ";
+        turboutfile << delta_u_rms << " ";
+        turboutfile << taylor_len << " ";
+        turboutfile << taylor_Re_eta << " ";
+        turboutfile << skew << " ";
+        turboutfile << skew_s << " ";
+        turboutfile << skew_d << " ";
+        turboutfile << kurt << " ";
+        turboutfile << kurt_s << " ";
+        turboutfile << kurt_d << " ";
+        for (int i=0;i<9;++i) {
+            turboutfile << var[i] << " ";
+        }
+        turboutfile << skew_div << " ";
+        turboutfile << kurt_div << " ";
+        turboutfile << skew_vort << " ";
+        turboutfile << kurt_vort << " ";
+        turboutfile << std::endl;
+    }
+    // timer
+    Real ts2 = ParallelDescriptor::second() - ts1;
+    ParallelDescriptor::ReduceRealMax(ts2, ParallelDescriptor::IOProcessorNumber());
+    amrex::Print() << "Time (spectral filtering) " << ts2 << " seconds\n";
+
+    // MultiFab memory usage
+    const int IOProc = ParallelDescriptor::IOProcessorNumber();
+
+    amrex::Long min_fab_megabytes = amrex::TotalBytesAllocatedInFabsHWM()/1048576;
+    amrex::Long max_fab_megabytes = min_fab_megabytes;
+
+    ParallelDescriptor::ReduceLongMin(min_fab_megabytes, IOProc);
+    ParallelDescriptor::ReduceLongMax(max_fab_megabytes, IOProc);
+
+    amrex::Print() << "High-water FAB megabyte spread across MPI nodes: ["
+                   << min_fab_megabytes << " ... " << max_fab_megabytes << "]\n";
+
+    min_fab_megabytes = amrex::TotalBytesAllocatedInFabs()/1048576;
+    max_fab_megabytes = min_fab_megabytes;
+
+    ParallelDescriptor::ReduceLongMin(min_fab_megabytes, IOProc);
+    ParallelDescriptor::ReduceLongMax(max_fab_megabytes, IOProc);
+
+    amrex::Print() << "Current FAB megabyte spread across MPI nodes: ["
+                   << min_fab_megabytes << " ... " << max_fab_megabytes << "]\n";
+
+    if (ParallelDescriptor::IOProcessor()) turboutfile.close();
+}
diff --git a/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.H b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.H
new file mode 100644
index 00000000..4f99f51a
--- /dev/null
+++ b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.H
@@ -0,0 +1,113 @@
+#ifndef _spectral_functions_stag_H_
+#define _spectral_functions_stag_H_
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+
+#ifdef AMREX_USE_CUDA
+#include <cufft.h>
+#elif AMREX_USE_HIP
+#  if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
+#    include <rocfft/rocfft.h>
+#  else
+#    include <rocfft.h>
+#  endif
+#else
+#include <fftw3.h>
+#include <fftw3-mpi.h>
+#endif
+
+#include <heffte.h>
+
+#include
+
+#define ALIGN 16
+
+using namespace amrex;
+
+#if !defined(HEFFTE_FFTW) && !defined(HEFFTE_CUFFT) && !defined(HEFFTE_ROCFFT)
+#ifdef AMREX_USE_CUDA
+std::string cufftError (const cufftResult& err);
+#endif
+#ifdef AMREX_USE_HIP
+std::string rocfftError (const rocfft_status err);
+void Assert_rocfft_status (std::string const& name, rocfft_status status);
+#endif
+#endif
+
+void SpectralReadCheckPoint(amrex::Geometry& geom,
+                            const amrex::Box& domain,
+                            amrex::MultiFab& prim,
+                            std::array<MultiFab, AMREX_SPACEDIM>& vel,
+                            BoxArray& ba, DistributionMapping& dmap,
+                            const amrex::Vector<int> n_cells,
+                            const int nprimvars,
+                            const amrex::Vector<int> max_grid_size,
+                            const amrex::IntVect ngc,
+                            const int restart);
+
+void SpectralVelDecomp(const MultiFab& vel,
+                       MultiFab& vel_decomp_filter,
+                       const amrex::Real kmin,
+                       const amrex::Real kmax,
+                       const amrex::Geometry& geom,
+                       const amrex::Vector<int> n_cells);
+
+void SpectralScalarDecomp(const MultiFab& scalar,
+                          MultiFab& scalar_filter,
+                          const amrex::Real kmin,
+                          const amrex::Real kmax,
+                          const amrex::Geometry& geom,
+                          const amrex::Vector<int> n_cells);
+
+void SpectralWritePlotFile(const int step,
+                           const amrex::Real& kmin,
+                           const amrex::Real& kmax,
+                           const amrex::Geometry& geom,
+                           const amrex::MultiFab& vel_decomp_in,
+                           const amrex::MultiFab& scalar_in,
+                           const amrex::MultiFab& vel_total,
+                           const amrex::MultiFab& scalar_total);
+
+void Read_Copy_MF_Checkpoint(amrex::MultiFab& mf, std::string mf_name,
+                             const std::string& checkpointname,
+                             BoxArray& ba_old, DistributionMapping& dmap_old,
+                             int NVARS, int NGC, const amrex::IntVect ngc,
+                             int nodal_flag=-1);
+
+void ShiftFaceToCC(const MultiFab& face_in, int face_in_comp,
+                   MultiFab& cc_in, int cc_in_comp,
+                   int ncomp);
+
+void ComputeGrad(const MultiFab & phi_in, std::array<MultiFab, AMREX_SPACEDIM> & gphi,
+                 int start_incomp, int start_outcomp, int ncomp, int bccomp, const Geometry & geom,
+                 int increment);
+
+void SumStag(const std::array<MultiFab, AMREX_SPACEDIM>& m1,
+             amrex::Vector<amrex::Real>& sum);
+
+void FCMoments(const std::array<MultiFab, AMREX_SPACEDIM>& m1,
+               const amrex::Vector<int>& comps,
+               std::array<MultiFab, AMREX_SPACEDIM>& mscr,
+               const int& power,
+               amrex::Vector<amrex::Real>& prod_val);
+
+void SumCC(const amrex::MultiFab& m1,
+           const int& comp,
+           amrex::Real& sum,
+           const bool& divide_by_ncells);
+
+void CCMoments(const amrex::MultiFab& m1,
+               const int& comp1,
+               amrex::MultiFab& mscr,
+               const int& power,
+               amrex::Real& prod_val);
+
+#endif
+
diff --git a/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.cpp b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.cpp
new file mode 100644
index 00000000..b36e18b8
--- /dev/null
+++ b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.cpp
@@ -0,0 +1,1077 @@
+#include "spectral_functions.H"
+#include "AMReX_PlotFileUtil.H"
+#include "AMReX_PlotFileDataImpl.H"
+
+#include
+
+#include "chrono"
+#include
+#include "AMReX_PlotFileUtil.H"
+#include "AMReX_BoxArray.H"
+
+using namespace std::chrono;
+
+namespace {
+    void GotoNextLine (std::istream& is)
+    {
+        constexpr std::streamsize bl_ignore_max { 100000 };
+        is.ignore(bl_ignore_max, '\n');
+    }
+}
+
+void SpectralReadCheckPoint(amrex::Geometry& geom,
+                            const amrex::Box& domain,
+                            amrex::MultiFab& prim,
+                            std::array<MultiFab, AMREX_SPACEDIM>& vel,
+                            BoxArray& ba, DistributionMapping& dmap,
+                            const amrex::Vector<int> n_cells,
+                            const int nprimvars,
+                            const amrex::Vector<int> max_grid_size,
+                            const amrex::IntVect ngc,
+                            const int restart)
+{
+    // timer for profiling
+    BL_PROFILE_VAR("SpectralReadCheckPoint()",SpectralReadCheckPoint);
+
+    // checkpoint file name, e.g., chk0000010
+    const std::string& checkpointname = amrex::Concatenate("chk",restart,9);
+
+    amrex::Print() << "Restart from checkpoint " << checkpointname << "\n";
+
+    VisMF::IO_Buffer io_buffer(VisMF::GetIOBufferSize());
+
+    std::string line, word;
+
+    // read in old boxarray, and create old distribution map (this is to read in MFabs)
+    BoxArray ba_old;
+    DistributionMapping dmap_old;
+
+    // initialize new boxarray
+    ba.define(domain);
+    ba.maxSize(IntVect(max_grid_size));
+    dmap.define(ba, ParallelDescriptor::NProcs());
+
+    amrex::Vector<amrex::IntVect> nodal_flag_dir;
+    amrex::IntVect nodal_flag_x;
+    amrex::IntVect nodal_flag_y;
+    amrex::IntVect nodal_flag_z;
+    nodal_flag_dir.resize(3);
+
+    for (int i=0; i<3; ++i) {
+        nodal_flag_x[i] = int(i==0);
+        nodal_flag_y[i] = int(i==1);
+        nodal_flag_z[i] = int(i==2);
+        AMREX_D_TERM(nodal_flag_dir[0][i] = nodal_flag_x[i];,
+                     nodal_flag_dir[1][i] = nodal_flag_y[i];,
+                     nodal_flag_dir[2][i] = nodal_flag_z[i];);
+    }
+
+    // Header
+    {
+        std::string File(checkpointname + "/Header");
+        Vector<char> fileCharPtr;
+        ParallelDescriptor::ReadAndBcastFile(File, fileCharPtr);
+        std::string fileCharPtrString(fileCharPtr.dataPtr());
+        std::istringstream is(fileCharPtrString, std::istringstream::in);
+
+        // read in title line
+        std::getline(is, line);
+
+        // read in time step number
+        int step;
+        is >> step;
+        GotoNextLine(is);
+
+        // read in time
+        Real time;
+        is >> time;
+        GotoNextLine(is);
+
+        // read in statsCount
+        int statsCount;
+        is >> statsCount;
+        GotoNextLine(is);
+
+        // read in BoxArray (fluid) from Header
+        ba_old.readFrom(is);
+        GotoNextLine(is);
+
+        // create old distribution mapping
+        dmap_old.define(ba_old, ParallelDescriptor::NProcs());
+
+        prim.define(ba,dmap,nprimvars,ngc);
+        // velocity and momentum (instantaneous, means, variances)
+        for (int d=0; d<3; d++) {
+            vel[d].define(convert(ba,nodal_flag_dir[d]), dmap, 1, ngc);
+        }
+    }
+
+    // C++ random number engine
+    // each MPI process reads in its own file
+    int comm_rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank);
+
+    int n_ranks;
+    MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);
+
+    // read in the MultiFab data
+    Read_Copy_MF_Checkpoint(prim,"prim",checkpointname,ba_old,dmap_old,nprimvars,1,ngc);
+
+    Read_Copy_MF_Checkpoint(vel[0],"velx",checkpointname,ba_old,dmap_old,1,1,ngc,0);
+    Read_Copy_MF_Checkpoint(vel[1],"vely",checkpointname,ba_old,dmap_old,1,1,ngc,1);
+    Read_Copy_MF_Checkpoint(vel[2],"velz",checkpointname,ba_old,dmap_old,1,1,ngc,2);
+
+    // FillBoundaries
+    prim.FillBoundary(geom.periodicity());
+    vel[0].FillBoundary(geom.periodicity());
+    vel[1].FillBoundary(geom.periodicity());
+    vel[2].FillBoundary(geom.periodicity());
+}
+
+void SpectralVelDecomp(const MultiFab& vel,
+                       MultiFab& vel_decomp_filter,
+                       const amrex::Real kmin,
+                       const amrex::Real kmax,
+                       const amrex::Geometry& geom,
+                       const amrex::Vector<int> n_cells)
+{
+    BL_PROFILE_VAR("SpectralVelDecomp()",SpectralVelDecomp);
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(vel.nComp() == 3,
+        "SpectralVelDecomp: must have 3 components of input vel MultiFab");
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(vel.local_size() == 1,
+        "SpectralVelDecomp: Must have one Box per MPI process when using heFFTe");
+
+    const GpuArray<Real, AMREX_SPACEDIM> dx = geom.CellSizeArray();
+
+    long npts;
+    Box domain = geom.Domain();
+    npts = (domain.length(0)*domain.length(1)*domain.length(2));
+    Real sqrtnpts = std::sqrt(npts);
+
+    // get box array and distribution map of vel
+    DistributionMapping dm = vel.DistributionMap();
+    BoxArray ba = vel.boxArray();
+
+    // since there is 1 MPI rank per box, each MPI rank obtains its local box and the associated boxid
+    Box local_box;
+    int local_boxid;
+    {
+        for (int i = 0; i < ba.size(); ++i) {
+            Box b = ba[i];
+            // each MPI rank has its own local_box Box and local_boxid ID
+            if (ParallelDescriptor::MyProc() == dm[i]) {
+                local_box = b;
+                local_boxid = i;
+            }
+        }
+    }
+
+    // now each MPI rank works on its own box
+    // for real->complex fft's, the fft is stored in an (nx/2+1) x ny x nz dataset
+
+    // start by coarsening each box by 2 in the x-direction
+    Box c_local_box = amrex::coarsen(local_box, IntVect(AMREX_D_DECL(2,1,1)));
+
+    // if the coarsened box's high-x index is even, we shrink the size in 1 in x
+    // this avoids overlap between coarsened boxes
+    if (c_local_box.bigEnd(0) * 2 == local_box.bigEnd(0)) {
+        c_local_box.setBig(0,c_local_box.bigEnd(0)-1);
+    }
+    // for any boxes that touch the hi-x domain we
+    // increase the size of boxes by 1 in x
+    // this makes the overall fft dataset have size (Nx/2+1 x Ny x Nz)
+    if (local_box.bigEnd(0) == geom.Domain().bigEnd(0)) {
+        c_local_box.growHi(0,1);
+    }
+
+    // each MPI rank gets storage for its piece of the fft
+    BaseFab<GpuComplex<Real> > spectral_field_Tx(c_local_box, 1, The_Device_Arena()); // totalx
+    BaseFab<GpuComplex<Real> > spectral_field_Ty(c_local_box, 1, The_Device_Arena()); // totaly
+    BaseFab<GpuComplex<Real> > spectral_field_Tz(c_local_box, 1, The_Device_Arena()); // totalz
+    BaseFab<GpuComplex<Real> > spectral_field_Sx(c_local_box, 1, The_Device_Arena()); // solenoidalx
+    BaseFab<GpuComplex<Real> > spectral_field_Sy(c_local_box, 1, The_Device_Arena()); // solenoidaly
+    BaseFab<GpuComplex<Real> > spectral_field_Sz(c_local_box, 1, The_Device_Arena()); // solenoidalz
+    BaseFab<GpuComplex<Real> > spectral_field_Dx(c_local_box, 1, The_Device_Arena()); // dilatationalx
+    BaseFab<GpuComplex<Real> > spectral_field_Dy(c_local_box, 1, The_Device_Arena()); // dilatationaly
+    BaseFab<GpuComplex<Real> > spectral_field_Dz(c_local_box, 1, The_Device_Arena()); // dilatationalz
+    MultiFab vel_single(ba, dm, 1, 0);
+
+    int r2c_direction = 0;
+
+    // ForwardTransform
+    // X
+    using heffte_complex = typename heffte::fft_output<Real>::type;
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        vel_single.ParallelCopy(vel, 0, 0, 1);
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tx.dataPtr();
+        fft.forward(vel_single[local_boxid].dataPtr(),spectral_data);
+    }
+    // Y
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        vel_single.ParallelCopy(vel, 1, 0, 1);
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Ty.dataPtr();
+        fft.forward(vel_single[local_boxid].dataPtr(),spectral_data);
+    }
+    // Z
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        vel_single.ParallelCopy(vel, 2, 0, 1);
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tz.dataPtr();
+        fft.forward(vel_single[local_boxid].dataPtr(),spectral_data);
+    }
+
+    Gpu::streamSynchronize();
+
+    int nx = n_cells[0];
+    int ny = n_cells[1];
+    int nz = n_cells[2];
+
+    // Decompose velocity field into solenoidal and dilatational
+    Array4< GpuComplex<Real> > spectral_tx = spectral_field_Tx.array();
+    Array4< GpuComplex<Real> > spectral_ty = spectral_field_Ty.array();
+    Array4< GpuComplex<Real> > spectral_tz = spectral_field_Tz.array();
+    Array4< GpuComplex<Real> > spectral_sx = spectral_field_Sx.array();
+    Array4< GpuComplex<Real> > spectral_sy = spectral_field_Sy.array();
+    Array4< GpuComplex<Real> > spectral_sz = spectral_field_Sz.array();
+    Array4< GpuComplex<Real> > spectral_dx = spectral_field_Dx.array();
+    Array4< GpuComplex<Real> > spectral_dy = spectral_field_Dy.array();
+    Array4< GpuComplex<Real> > spectral_dz = spectral_field_Dz.array();
+    ParallelFor(c_local_box, [=] AMREX_GPU_DEVICE(int i, int j, int k)
+    {
+
+        Real GxR = 0.0, GxC = 0.0, GyR = 0.0, GyC = 0.0, GzR = 0.0, GzC = 0.0;
+
+        if (i <= nx/2) {
+
+            // Get the wavevector
+            int ki = i;
+            int kj = j;
+            if (j >= ny/2) kj = ny - j;
+            int kk = k;
+            if (k >= nz/2) kk = nz - k;
+
+            // Gradient Operators
+            GxR = (cos(2.0*M_PI*ki/nx)-1.0)/dx[0];
+            GxC = (sin(2.0*M_PI*ki/nx)-0.0)/dx[0];
+            GyR = (cos(2.0*M_PI*kj/ny)-1.0)/dx[1];
+            GyC = (sin(2.0*M_PI*kj/ny)-0.0)/dx[1];
+            GzR = (cos(2.0*M_PI*kk/nz)-1.0)/dx[2];
+            GzC = (sin(2.0*M_PI*kk/nz)-0.0)/dx[2];
+        }
+        else { // conjugate
+            amrex::Abort("check the code; i should not go beyond bx.length(0)/2");
+        }
+
+        // Get the wavenumber
+        int ki = i;
+        int kj = j;
+        if (j >= ny/2) kj = ny - j;
+        int kk = k;
+        if (k >= nz/2) kk = nz - k;
+        Real knum = (ki*ki + kj*kj + kk*kk);
+        knum = std::sqrt(knum);
+
+        // Scale Total velocity FFT components with Filtering
+        if ((knum >= kmin) and (knum <= kmax)) {
+
+            spectral_tx(i,j,k) *= (1.0/sqrtnpts);
+            spectral_ty(i,j,k) *= (1.0/sqrtnpts);
+            spectral_tz(i,j,k) *= (1.0/sqrtnpts);
+
+            // Inverse Laplacian
+            Real Lap = GxR*GxR + GxC*GxC + GyR*GyR + GyC*GyC + GzR*GzR + GzC*GzC;
+
+            // Divergence of vel
+            Real divR = spectral_tx(i,j,k).real()*GxR - spectral_tx(i,j,k).imag()*GxC +
+                        spectral_ty(i,j,k).real()*GyR - spectral_ty(i,j,k).imag()*GyC +
+                        spectral_tz(i,j,k).real()*GzR - spectral_tz(i,j,k).imag()*GzC ;
+            Real divC = spectral_tx(i,j,k).real()*GxC + spectral_tx(i,j,k).imag()*GxR +
+                        spectral_ty(i,j,k).real()*GyC + spectral_ty(i,j,k).imag()*GyR +
+                        spectral_tz(i,j,k).real()*GzC + spectral_tz(i,j,k).imag()*GzR ;
+
+            if (Lap < 1.0e-12) { // zero mode for no bulk motion
+                spectral_dx(i,j,k) *= 0.0;
+                spectral_dy(i,j,k) *= 0.0;
+                spectral_dz(i,j,k) *= 0.0;
+            }
+            else {
+
+                // Dilatational velocity
+                GpuComplex<Real> copy_dx((divR*GxR + divC*GxC) / Lap,
+                                         (divC*GxR - divR*GxC) / Lap);
+                spectral_dx(i,j,k) = copy_dx;
+
+                GpuComplex<Real> copy_dy((divR*GyR + divC*GyC) / Lap,
+                                         (divC*GyR - divR*GyC) / Lap);
+                spectral_dy(i,j,k) = copy_dy;
+
+                GpuComplex<Real> copy_dz((divR*GzR + divC*GzC) / Lap,
+                                         (divC*GzR - divR*GzC) / Lap);
+                spectral_dz(i,j,k) = copy_dz;
+            }
+
+            // Solenoidal velocity
+            spectral_sx(i,j,k) = spectral_tx(i,j,k) - spectral_dx(i,j,k);
+            spectral_sy(i,j,k) = spectral_ty(i,j,k) - spectral_dy(i,j,k);
+            spectral_sz(i,j,k) = spectral_tz(i,j,k) - spectral_dz(i,j,k);
+        }
+        else {
+            spectral_tx(i,j,k) = 0.0;
+            spectral_ty(i,j,k) = 0.0;
+            spectral_tz(i,j,k) = 0.0;
+            spectral_sx(i,j,k) = 0.0;
+            spectral_sy(i,j,k) = 0.0;
+            spectral_sz(i,j,k) = 0.0;
+            spectral_dx(i,j,k) = 0.0;
+            spectral_dy(i,j,k) = 0.0;
+            spectral_dz(i,j,k) = 0.0;
+        }
+
+    });
+
+    Gpu::streamSynchronize();
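Per Fourier mode, the loop above is a discrete Helmholtz projection: with G_d = (e^{2*pi*i*k_d/N_d} - 1)/dx_d the staggered-grid gradient symbol, the dilatational part is conj(G)*(G.u)/|G|^2 and the solenoidal part is the remainder. A standalone sketch of one mode with std::complex, for reference only (G and u stand in for the quantities computed above):

    #include <complex>

    using cplx = std::complex<double>;

    // Split one velocity mode u[3] into dilatational and solenoidal parts
    // with respect to the discrete gradient symbol G[3].
    void decompose_mode (const cplx G[3], const cplx u[3], cplx u_dil[3], cplx u_sol[3])
    {
        cplx   div = 0.;   // div = G . u, the discrete divergence of this mode
        double G2  = 0.;   // |G|^2, the (negative) discrete Laplacian symbol
        for (int d = 0; d < 3; ++d) { div += G[d]*u[d]; G2 += std::norm(G[d]); }
        for (int d = 0; d < 3; ++d) {
            if (G2 > 1.0e-12) u_dil[d] = std::conj(G[d])*div/G2;
            else              u_dil[d] = 0.0;   // zero mode: no bulk motion
            u_sol[d] = u[d] - u_dil[d];
        }
    }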
+
+    MultiFab vel_decomp_filter_single(ba, dm, 1, 0);
+    // inverse Fourier transform filtered total velocity
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tx.dataPtr();
+        fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr());
+
+        Gpu::streamSynchronize();
+        vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 0, 1);
+    }
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Ty.dataPtr();
+        fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr());
+
+        Gpu::streamSynchronize();
+        vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 1, 1);
+    }
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tz.dataPtr();
+        fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr());
+
+        Gpu::streamSynchronize();
+        vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 2, 1);
+    }
+    // inverse Fourier transform filtered solenoidal and dilatational components
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sx.dataPtr();
+        fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr());
+
+        Gpu::streamSynchronize();
+        vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 3, 1);
+    }
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sy.dataPtr();
+        fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr());
+
+        Gpu::streamSynchronize();
+        vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 4, 1);
+    }
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sz.dataPtr();
+        fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr());
+
+        Gpu::streamSynchronize();
+        vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 5, 1);
+    }
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Dx.dataPtr();
+        fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr());
+
+        Gpu::streamSynchronize();
+        vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 6, 1);
+    }
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Dy.dataPtr();
+        fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr());
+
+        Gpu::streamSynchronize();
+        vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 7, 1);
+    }
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field_Dz.dataPtr();
+        fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr());
+
+        Gpu::streamSynchronize();
+        vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 8, 1);
+    }
+
+
+    vel_decomp_filter.mult(1.0/sqrtnpts);
+
+}
+
+
+void SpectralScalarDecomp(const MultiFab& scalar,
+                          MultiFab& scalar_filter,
+                          const amrex::Real kmin,
+                          const amrex::Real kmax,
+                          const amrex::Geometry& geom,
+                          const amrex::Vector<int> n_cells)
+{
+    BL_PROFILE_VAR("SpectralScalarDecomp()",SpectralScalarDecomp);
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(scalar.nComp() == 1,
+        "SpectralScalarDecomp: must have 1 component of input scalar MultiFab");
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(scalar.local_size() == 1,
+        "SpectralScalarDecomp: Must have one Box per MPI process when using heFFTe");
+
+    const GpuArray<Real, AMREX_SPACEDIM> dx = geom.CellSizeArray();
+
+    long npts;
+    Box domain = geom.Domain();
+    npts = (domain.length(0)*domain.length(1)*domain.length(2));
+    Real sqrtnpts = std::sqrt(npts);
+
+    // get box array and distribution map of scalar
+    DistributionMapping dm = scalar.DistributionMap();
+    BoxArray ba = scalar.boxArray();
+
+    // since there is 1 MPI rank per box, each MPI rank obtains its local box and the associated boxid
+    Box local_box;
+    int local_boxid;
+    {
+        for (int i = 0; i < ba.size(); ++i) {
+            Box b = ba[i];
+            // each MPI rank has its own local_box Box and local_boxid ID
+            if (ParallelDescriptor::MyProc() == dm[i]) {
+                local_box = b;
+                local_boxid = i;
+            }
+        }
+    }
+
+    // now each MPI rank works on its own box
+    // for real->complex fft's, the fft is stored in an (nx/2+1) x ny x nz dataset
+
+    // start by coarsening each box by 2 in the x-direction
+    Box c_local_box = amrex::coarsen(local_box, IntVect(AMREX_D_DECL(2,1,1)));
+
+    // if the coarsened box's high-x index is even, we shrink the size in 1 in x
+    // this avoids overlap between coarsened boxes
+    if (c_local_box.bigEnd(0) * 2 == local_box.bigEnd(0)) {
+        c_local_box.setBig(0,c_local_box.bigEnd(0)-1);
+    }
+    // for any boxes that touch the hi-x domain we
+    // increase the size of boxes by 1 in x
+    // this makes the overall fft dataset have size (Nx/2+1 x Ny x Nz)
+    if (local_box.bigEnd(0) == geom.Domain().bigEnd(0)) {
+        c_local_box.growHi(0,1);
+    }
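The coarsen/shrink/grow steps above encode the real-to-complex storage rule: N real cells in x map to N/2+1 complex modes, tiled across the x-slabs without overlap. A small standalone check of that index arithmetic for a domain split into two x-slabs, for reference only:

    #include <cassert>

    int main ()
    {
        const int nx = 64;              // global cells in x, two slabs of 32
        // lower slab: cells [0, nx/2-1] -> coarsened [0, (nx/2-1)/2],
        // shrunk by one if doubling the coarse hi index lands exactly on it
        int c_lo_hi = (nx/2 - 1)/2;
        if (c_lo_hi*2 == nx/2 - 1) c_lo_hi -= 1;
        // upper slab: cells [nx/2, nx-1] -> coarsened [nx/4, (nx-1)/2],
        // grown by one at the hi end because it touches the domain boundary
        const int c_hi_lo = (nx/2)/2;
        const int c_hi_hi = (nx-1)/2 + 1;
        const int total   = (c_lo_hi + 1) + (c_hi_hi - c_hi_lo + 1);
        assert(total == nx/2 + 1);      // 33 complex modes for 64 real cells
        return 0;
    }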
+
+    // each MPI rank gets storage for its piece of the fft
+    BaseFab<GpuComplex<Real> > spectral_field(c_local_box, 1, The_Device_Arena());
+    MultiFab scalar_single(ba, dm, 1, 0);
+
+    int r2c_direction = 0;
+
+    // ForwardTransform
+    using heffte_complex = typename heffte::fft_output<Real>::type;
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        scalar_single.ParallelCopy(scalar, 0, 0, 1);
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field.dataPtr();
+        fft.forward(scalar_single[local_boxid].dataPtr(),spectral_data);
+    }
+
+    Gpu::streamSynchronize();
+
+    // filtering
+    Array4< GpuComplex<Real> > spectral = spectral_field.array();
+    int nx = n_cells[0];
+    int ny = n_cells[1];
+    int nz = n_cells[2];
+    ParallelFor(c_local_box, [=] AMREX_GPU_DEVICE(int i, int j, int k)
+    {
+
+        if (i <= nx/2) {
+        }
+        else { // conjugate
+            amrex::Abort("check the code; i should not go beyond bx.length(0)/2");
+        }
+
+        // Get the wavenumber
+        int ki = i;
+        int kj = j;
+        if (j >= ny/2) kj = ny - j;
+        int kk = k;
+        if (k >= nz/2) kk = nz - k;
+        Real knum = (ki*ki + kj*kj + kk*kk);
+        knum = std::sqrt(knum);
+
+        // Scale Scalar FFT components with Filtering
+        if ((knum >= kmin) and (knum <= kmax)) {
+            spectral(i,j,k) *= (1.0/sqrtnpts);
+        }
+        else {
+            spectral(i,j,k) *= 0.0;
+        }
+    });
+
+    Gpu::streamSynchronize();
+
+    MultiFab scalar_filter_single(ba, dm, 1, 0);
+    // inverse Fourier transform filtered scalar
+    {
+#if defined(HEFFTE_CUFFT)
+        heffte::fft3d_r2c<heffte::backend::cufft> fft
+#elif defined(HEFFTE_ROCFFT)
+        heffte::fft3d_r2c<heffte::backend::rocfft> fft
+#elif defined(HEFFTE_FFTW)
+        heffte::fft3d_r2c<heffte::backend::fftw> fft
+#endif
+        ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)},
+          {local_box.bigEnd(0)  ,local_box.bigEnd(1)  ,local_box.bigEnd(2)}},
+         {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)},
+          {c_local_box.bigEnd(0)  ,c_local_box.bigEnd(1)  ,c_local_box.bigEnd(2)}},
+         r2c_direction, ParallelDescriptor::Communicator());
+
+        heffte_complex* spectral_data = (heffte_complex*) spectral_field.dataPtr();
+        fft.backward(spectral_data, scalar_filter_single[local_boxid].dataPtr());
+
+        Gpu::streamSynchronize();
+        scalar_filter.ParallelCopy(scalar_filter_single, 0, 0, 1);
+    }
+
+    scalar_filter.mult(1.0/sqrtnpts);
+
+}
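heFFTe's transforms are unnormalized, so a forward followed by a backward pass scales the data by npts. The filter therefore applies 1/sqrt(npts) once to the kept modes and once more after the backward transform, for both the velocity and the scalar paths (the flattened source applied the spectral factor three times to the single-component scalar, a copy-paste from the three-component velocity loop, corrected above). A one-line check of the convention, for reference only:

    #include <cassert>
    #include <cmath>

    int main ()
    {
        const double npts  = 64.0*64.0*64.0;   // unnormalized round trip scales by npts
        const double scale = (1.0/std::sqrt(npts)) * npts * (1.0/std::sqrt(npts));
        assert(std::abs(scale - 1.0) < 1.0e-12);
        return 0;
    }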
+
+void SpectralWritePlotFile(const int step,
+                           const amrex::Real& kmin,
+                           const amrex::Real& kmax,
+                           const amrex::Geometry& geom,
+                           const amrex::MultiFab& vel_decomp_in,
+                           const amrex::MultiFab& scalar_in,
+                           const amrex::MultiFab& vel_total,
+                           const amrex::MultiFab& scalar_total)
+{
+
+    MultiFab output;
+
+    // Cell-Centered Velocity Gradient Stats (1,2,3 are directions)
+    // 0: ux
+    // 1: uy
+    // 2: uz
+    // 3: ux_s
+    // 4: uy_s
+    // 5: uz_s
+    // 6: ux_d
+    // 7: uy_d
+    // 8: uz_d
+    // 9: umag
+    // 10: umag_s
+    // 11: umag_d
+    // 12: scalar
+    // 13: divergence = u_1,1 + u_2,2 + u_3,3
+    // 14: vorticity w1
+    // 15: vorticity w2
+    // 16: vorticity w3
+    // 17: vorticity mag: sqrt(w1**2 + w2**2 + w3**2)
+    // 18: ux_org
+    // 19: scalar_org
+    output.define(vel_decomp_in.boxArray(), vel_decomp_in.DistributionMap(), 20, 0);
+    output.setVal(0.0);
+
+    const GpuArray<Real, AMREX_SPACEDIM> dx = geom.CellSizeArray();
+
+    for ( MFIter mfi(output,TilingIfNotGPU()); mfi.isValid(); ++mfi ) {
+
+        const Box& bx = mfi.tilebox();
+
+        const Array4<      Real>& out = output.array(mfi);
+
+        const Array4<const Real>& v_decomp = vel_decomp_in.array(mfi);
+
+        const Array4<const Real>& sca = scalar_in.array(mfi);
+
+        const Array4<const Real>& v_tot = vel_total.array(mfi);
+
+        const Array4<const Real>& sca_tot = scalar_total.array(mfi);
+
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+        {
+
+            out(i,j,k,0) = v_decomp(i,j,k,0);
+            out(i,j,k,1) = v_decomp(i,j,k,1);
+            out(i,j,k,2) = v_decomp(i,j,k,2);
+            out(i,j,k,3) = v_decomp(i,j,k,3);
+            out(i,j,k,4) = v_decomp(i,j,k,4);
+            out(i,j,k,5) = v_decomp(i,j,k,5);
+            out(i,j,k,6) = v_decomp(i,j,k,6);
+            out(i,j,k,7) = v_decomp(i,j,k,7);
+            out(i,j,k,8) = v_decomp(i,j,k,8);
+
+            out(i,j,k,9)  = std::sqrt(out(i,j,k,0)*out(i,j,k,0) + out(i,j,k,1)*out(i,j,k,1) + out(i,j,k,2)*out(i,j,k,2)); // mag
+            out(i,j,k,10) = std::sqrt(out(i,j,k,3)*out(i,j,k,3) + out(i,j,k,4)*out(i,j,k,4) + out(i,j,k,5)*out(i,j,k,5)); // mag solenoidal
+            out(i,j,k,11) = std::sqrt(out(i,j,k,6)*out(i,j,k,6) + out(i,j,k,7)*out(i,j,k,7) + out(i,j,k,8)*out(i,j,k,8)); // mag dilatational
+
+            out(i,j,k,12) = sca(i,j,k,0);
+
+            // divergence
+            out(i,j,k,13) = 0.5*( (v_decomp(i+1,j,k,0) - v_decomp(i-1,j,k,0))/dx[0] +
+                                  (v_decomp(i,j+1,k,1) - v_decomp(i,j-1,k,1))/dx[1] +
+                                  (v_decomp(i,j,k+1,2) - v_decomp(i,j,k-1,2))/dx[2] );
+
+            // curl w1 = u_2,1 - u_1,2
+            out(i,j,k,14) = 0.5*( (v_decomp(i+1,j,k,1) - v_decomp(i-1,j,k,1))/dx[0] -
+                                  (v_decomp(i,j+1,k,0) - v_decomp(i,j-1,k,0))/dx[1] );
+
+            // curl w2 = u_1,3 - u_3,1
+            out(i,j,k,15) = 0.5*( (v_decomp(i,j,k+1,0) - v_decomp(i,j,k-1,0))/dx[2] -
+                                  (v_decomp(i+1,j,k,2) - v_decomp(i-1,j,k,2))/dx[0] );
+
+            // curl w3 = u_3,2 - u_2,3
+            out(i,j,k,16) = 0.5*( (v_decomp(i,j+1,k,2) - v_decomp(i,j-1,k,2))/dx[1] -
+                                  (v_decomp(i,j,k+1,1) - v_decomp(i,j,k-1,1))/dx[2] );
+
+            // vorticity magnitude: sqrt(w1*w1 + w2*w2 + w3*w3)
+            out(i,j,k,17) = std::sqrt( out(i,j,k,14)*out(i,j,k,14) + out(i,j,k,15)*out(i,j,k,15) + out(i,j,k,16)*out(i,j,k,16) );
+
+            // original velx
+            out(i,j,k,18) = v_tot(i,j,k,0);
+
+            // original scalar
+            out(i,j,k,19) = sca_tot(i,j,k,0);
+        });
+    }
+
+    // Write on a plotfile
+    std::string plotfilename = amrex::Concatenate("filtered_",step,9);
+    std::ostringstream os;
+    os << std::setprecision(3) << kmin;
+    plotfilename += os.str();
+    plotfilename += "_";
+    std::ostringstream oss;
+    oss << std::setprecision(3) << kmax;
+    plotfilename += oss.str();
+
+    amrex::Vector<std::string> varNames(20);
+    varNames[0]  = "ux";
+    varNames[1]  = "uy";
+    varNames[2]  = "uz";
+    varNames[3]  = "ux_s";
+    varNames[4]  = "uy_s";
+    varNames[5]  = "uz_s";
+    varNames[6]  = "ux_d";
+    varNames[7]  = "uy_d";
+    varNames[8]  = "uz_d";
+    varNames[9]  = "umag";
+    varNames[10] = "umag_s";
+    varNames[11] = "umag_d";
+    varNames[12] = "rho";
+    varNames[13] = "div";
+    varNames[14] = "w1";
+    varNames[15] = "w2";
+    varNames[16] = "w3";
+    varNames[17] = "vort";
+    varNames[18] = "ux_org";
+    varNames[19] = "rho_org";
+    WriteSingleLevelPlotfile(plotfilename,output,varNames,geom,0.0,step);
+}
+
+void Read_Copy_MF_Checkpoint(amrex::MultiFab& mf, std::string mf_name, const std::string& checkpointname,
+                             BoxArray& ba_old, DistributionMapping& dmap_old,
+                             int NVARS, int ghost, const amrex::IntVect ngc,
+                             int nodal_flag)
+{
+    // Read into temporary MF from file
+    MultiFab mf_temp;
+    VisMF::Read(mf_temp,amrex::MultiFabFileFullPrefix(0, checkpointname, "Level_", mf_name));
+
+    // Copy temporary MF into the new MF
+    if (ghost) {
+        mf.ParallelCopy(mf_temp, 0, 0, NVARS, ngc, ngc);
+    }
+    else {
+        mf.ParallelCopy(mf_temp, 0, 0, NVARS, 0, 0);
+    }
+}
+
+void ShiftFaceToCC(const MultiFab& face_in, int face_comp,
+                   MultiFab& cc_in, int cc_comp, int ncomp)
+{
+
+    BL_PROFILE_VAR("ShiftFaceToCC()",ShiftFaceToCC);
+
+    if (!face_in.is_nodal(0) && !face_in.is_nodal(1) && !face_in.is_nodal(2)) {
+        Abort("ShiftFaceToCC requires a face-centered MultiFab");
+    }
+
+    // Loop over boxes (note that mfi takes a cell-centered multifab as an argument)
+    for (MFIter mfi(cc_in,TilingIfNotGPU()); mfi.isValid(); ++mfi) {
+
+        const Box& bx = mfi.tilebox();
+
+        Array4<const Real> const& face = face_in.array(mfi);
+
+        Array4<Real> const& cc = cc_in.array(mfi);
+
+        amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+        {
+            cc(i,j,k,cc_comp+n) = face(i,j,k,face_comp+n);
+        });
+    }
+}
+
+void ComputeGrad(const MultiFab & phi_in, std::array<MultiFab, AMREX_SPACEDIM> & gphi,
+                 int start_incomp, int start_outcomp, int ncomp, int bccomp, const Geometry & geom,
+                 int increment)
+{
+    BL_PROFILE_VAR("ComputeGrad()",ComputeGrad);
+
+    // Physical Domain
+    Box dom(geom.Domain());
+
+    const GpuArray<Real, AMREX_SPACEDIM> dx = geom.CellSizeArray();
+
+    // if not incrementing, initialize data to zero
+    if (increment == 0) {
+        for (int dir=0; dir<3; ++dir) {
+            gphi[dir].setVal(0.,start_outcomp,ncomp,0);
+        }
+    }
+
+    // Loop over boxes (note that mfi takes a cell-centered multifab as an argument)
+    for ( MFIter mfi(phi_in,TilingIfNotGPU()); mfi.isValid(); ++mfi ) {
+
+        const Array4<const Real> & phi = phi_in.array(mfi);
+
+        AMREX_D_TERM(const Array4<Real> & gphix = gphi[0].array(mfi);,
+                     const Array4<Real> & gphiy = gphi[1].array(mfi);,
+                     const Array4<Real> & gphiz = gphi[2].array(mfi););
+
+        AMREX_D_TERM(const Box & bx_x = mfi.nodaltilebox(0);,
+                     const Box & bx_y = mfi.nodaltilebox(1);,
+                     const Box & bx_z = mfi.nodaltilebox(2););
+
+        amrex::ParallelFor(bx_x, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+        {
+            gphix(i,j,k,start_outcomp+n) += (phi(i,j,k,start_incomp+n)-phi(i-1,j,k,start_incomp+n)) / dx[0];
+        },
+        bx_y, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+        {
+            gphiy(i,j,k,start_outcomp+n) += (phi(i,j,k,start_incomp+n)-phi(i,j-1,k,start_incomp+n)) / dx[1];
+        },
+        bx_z, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+        {
+            gphiz(i,j,k,start_outcomp+n) += (phi(i,j,k,start_incomp+n)-phi(i,j,k-1,start_incomp+n)) / dx[2];
+        });
+
+    } // end MFIter
+}
+
+void FCMoments(const std::array<MultiFab, AMREX_SPACEDIM>& m1,
+               const amrex::Vector<int>& comps,
+               std::array<MultiFab, AMREX_SPACEDIM>& mscr,
+               const int& power,
+               amrex::Vector<amrex::Real>& prod_val)
+{
+
+    BL_PROFILE_VAR("FCMoments()",FCMoments);
+
+    for (int d=0; d<3; ++d) {
+        MultiFab::Copy(mscr[d],m1[d],comps[d],0,1,0);
+        for(int i=1; i<power; ++i) {
+            MultiFab::Multiply(mscr[d],m1[d],comps[d],0,1,0);
+        }
+    }
+
+    SumStag(mscr,prod_val);
+}
+
+void SumStag(const std::array<MultiFab, AMREX_SPACEDIM>& m1,
+             amrex::Vector<amrex::Real>& sum)
+{
+    BL_PROFILE_VAR("SumStag()",SumStag);
+
+    // Initialize to zero
+    std::fill(sum.begin(), sum.end(), 0.);
+
+    ReduceOps<ReduceOpSum> reduce_op;
+
+    //////// x-faces
+
+    ReduceData<Real> reduce_datax(reduce_op);
+    using ReduceTuple = typename decltype(reduce_datax)::Type;
+
+    for (MFIter mfi(m1[0],TilingIfNotGPU()); mfi.isValid(); ++mfi)
+    {
+        const Box& bx = mfi.tilebox();
+        const Box& bx_grid = mfi.validbox();
+
+        auto const& fab = m1[0].array(mfi);
+
+        int xlo = bx_grid.smallEnd(0);
+        int xhi = bx_grid.bigEnd(0);
+
+        reduce_op.eval(bx, reduce_datax,
+        [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
+        {
+            Real weight = (i>xlo && i<xhi) ? 1.0 : 0.5;
+            return {fab(i,j,k)*weight};
+        });
+    }
+
+    sum[0] = amrex::get<0>(reduce_datax.value());
+    ParallelDescriptor::ReduceRealSum(sum[0]);
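The 0.5 weight above halves every face on a grid-box boundary: an interior face shared by two boxes is visited once from each box, and under periodicity the two domain-end faces are images of the same physical face. A toy check of that counting in one dimension, for reference only:

    #include <cassert>

    int main ()
    {
        // two boxes covering cells 0..3 and 4..7; box b owns faces xlo..xhi
        double weighted = 0.0;
        for (int b = 0; b < 2; ++b) {
            const int xlo = (b == 0) ? 0 : 4;
            const int xhi = (b == 0) ? 4 : 8;
            for (int i = xlo; i <= xhi; ++i) {
                weighted += (i > xlo && i < xhi) ? 1.0 : 0.5;  // pretend fab(i) == 1
            }
        }
        // shared face 4 counted 0.5 twice; periodic faces 0 and 8 likewise:
        // 8 distinct physical faces in total
        assert(weighted == 8.0);
        return 0;
    }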
i(reduce_datax.value()); + ParallelDescriptor::ReduceRealSum(sum[0]); + + //////// y-faces + + ReduceData reduce_datay(reduce_op); + + for (MFIter mfi(m1[1],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + const Box& bx_grid = mfi.validbox(); + + auto const& fab = m1[1].array(mfi); + + int ylo = bx_grid.smallEnd(1); + int yhi = bx_grid.bigEnd(1); + + reduce_op.eval(bx, reduce_datay, + [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple + { + Real weight = (j>ylo && j(reduce_datay.value()); + ParallelDescriptor::ReduceRealSum(sum[1]); + + //////// z-faces + + ReduceData reduce_dataz(reduce_op); + + for (MFIter mfi(m1[2],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + const Box& bx_grid = mfi.validbox(); + + auto const& fab = m1[2].array(mfi); + + int zlo = bx_grid.smallEnd(2); + int zhi = bx_grid.bigEnd(2); + + reduce_op.eval(bx, reduce_dataz, + [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple + { + Real weight = (k>zlo && k(reduce_dataz.value()); + ParallelDescriptor::ReduceRealSum(sum[2]); +} + +void CCMoments(const amrex::MultiFab& m1, + const int& comp1, + amrex::MultiFab& mscr, + const int& power, + amrex::Real& prod_val) +{ + + BL_PROFILE_VAR("CCMoments()",CCMoments); + + MultiFab::Copy(mscr,m1,comp1,0,1,0); + for(int i=1; i +#include + +#include +#include + +using namespace amrex; +using namespace std; + +static +void +PrintUsage (const char* progName) +{ + Print() << std::endl + << "This utility computes PDF of scalars, and various powers of Laplacian of velocity field," << std::endl; + + Print() << "Usage:" << '\n'; + Print() << progName << " " << std::endl + << "OR" << std::endl + << progName << std::endl + << " step=" << std::endl + << " nbins= " << std::endl + << " range= " << std::endl + << std::endl; + + exit(1); +} + + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc,argv); + + { + + if (argc == 1) { + PrintUsage(argv[0]); + } + + ParmParse pp; + + int step; + pp.query("step",step); + + std::string iFile = amrex::Concatenate("plt",step,9); + + Vector scalar_out(3); + scalar_out[0] = amrex::Concatenate("rho_pdf",step,9); + scalar_out[1] = amrex::Concatenate("press_pdf",step,9); + scalar_out[2] = amrex::Concatenate("temp_pdf",step,9); + Vector Lap_out(5); + Lap_out[0] = amrex::Concatenate("L0_pdf",step,9); + Lap_out[1] = amrex::Concatenate("L1_pdf",step,9); + Lap_out[2] = amrex::Concatenate("L2_pdf",step,9); + Lap_out[3] = amrex::Concatenate("L3_pdf",step,9); + Lap_out[4] = amrex::Concatenate("L4_pdf",step,9); + + int nbins; + pp.get("nbins", nbins); + + Real range; + pp.get("range",range); + + amrex::Print() << "Reading from plotfile " << iFile << "\n"; + + // for the Header + std::string iFile2 = iFile; + iFile2 += "/Header"; + + // open header + ifstream x; + x.open(iFile2.c_str(), ios::in); + + // read in first line of header (typically "HyperCLaw-V1.1" or similar) + std::string str; + x >> str; + + // read in number of components from header + int ncomp; + x >> ncomp; + + // read in variable names from header + int flag = 0; + int rho_ind, press_ind, temp_ind, velx_ind; + for (int n=0; n> str; + if (str == "rhoInstant") rho_ind = flag; + if (str == "pInstant") press_ind = flag; + if (str == "tInstant") temp_ind = flag; + if (str == "uxInstantFACE") velx_ind = flag; + flag ++; + } + + // read in dimensionality from header + int dim; + x >> dim; + + // read in time + Real time; + x >> time; + + // read in finest level + int finest_level; + x >> finest_level; + + // read in prob_lo and 
prob_hi + amrex::GpuArray prob_lo, prob_hi; + for (int i=0; i<3; ++i) { + x >> prob_lo[i]; + } + for (int i=0; i<3; ++i) { + x >> prob_hi[i]; + } + + // now read in the plotfile data + // check to see whether the user pointed to the plotfile base directory + // or the data itself + if (amrex::FileExists(iFile+"/Level_0/Cell_H")) { + iFile += "/Level_0/Cell"; + } + if (amrex::FileExists(iFile+"/Level_00/Cell_H")) { + iFile += "/Level_00/Cell"; + } + + // storage for the input coarse and fine MultiFabs + MultiFab mf; + + // read in plotfile mf to MultiFab + VisMF::Read(mf, iFile); + + // get BoxArray and DistributionMapping + BoxArray ba = mf.boxArray(); + DistributionMapping dmap = mf.DistributionMap(); + + // physical dimensions of problem + RealBox real_box({AMREX_D_DECL(prob_lo[0],prob_lo[1],prob_lo[2])}, + {AMREX_D_DECL(prob_hi[0],prob_hi[1],prob_hi[2])}); + + // single box with the enire domain + Box domain = ba.minimalBox().enclosedCells(); + + Real ncells = (double) domain.numPts(); + + // set to 1 (periodic) + Vector is_periodic(3,1); + + Geometry geom(domain,&real_box,CoordSys::cartesian,is_periodic.data()); + + const Real* dx = geom.CellSize(); + + //////////////////////////////////////////////////////////////////////// + ////////////// velocity Laplacian PDFs ///////////////////////////////// + //////////////////////////////////////////////////////////////////////// + MultiFab vel_grown(ba,dmap,3,1); + MultiFab laplacian(ba,dmap,3,1); + + // copy shifted velocity components from mf into vel_grown + Copy(vel_grown,mf,velx_ind,0,3,0); + Copy(laplacian,mf,velx_ind,0,3,0); + + // fill ghost cells of vel_grown + vel_grown.FillBoundary(geom.periodicity()); + laplacian.FillBoundary(geom.periodicity()); + + for (int m=0; m<5; ++m) { + + Vector L2(3,0.); + for (int i=0; i<3; i++) + L2[i]=0.; + + for ( MFIter mfi(laplacian,false); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.validbox(); + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + + const Array4& lap = laplacian.array(mfi); + + for (auto n=0; n<3; ++n) { + for (auto k = lo.z; k <= hi.z; ++k) { + for (auto j = lo.y; j <= hi.y; ++j) { + for (auto i = lo.x; i <= hi.x; ++i) { + + L2[n] += lap(i,j,k,n)*lap(i,j,k,n); + + } + } + } + } + + } // end MFIter + + ParallelDescriptor::ReduceRealSum(L2.dataPtr(),3); + amrex::Long totpts = domain.numPts(); + L2[0] = sqrt(L2[0]/totpts); + L2[1] = sqrt(L2[1]/totpts); + L2[2] = sqrt(L2[2]/totpts); + Print() << "L2 norm of Laplacian to power " << m << " is " << L2[0] + << " " << L2[1] << " " << L2[2] << " " << std::endl; + + for ( MFIter mfi(laplacian,false); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.validbox(); + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + + const Array4& lap = laplacian.array(mfi); + + for (auto n=0; n<3; ++n) { + for (auto k = lo.z; k <= hi.z; ++k) { + for (auto j = lo.y; j <= hi.y; ++j) { + for (auto i = lo.x; i <= hi.x; ++i) { + + lap(i,j,k,n) = lap(i,j,k,n)/L2[n]; + + } + } + } + } + + } // end MFIter + + Vector bins(nbins+1,0.); + + int halfbin = nbins/2; + Real hbinwidth = range/nbins; + Real binwidth = 2.*range/nbins; + amrex::Long count=0; + amrex::Long totbin=0; + for (int ind=0 ; ind < nbins+1; ind++) bins[ind]=0; + + for ( MFIter mfi(laplacian,false); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.validbox(); + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + + const Array4& lap = laplacian.array(mfi); + + for (auto n=0; n<3; ++n) { + for (auto k = lo.z; k <= hi.z; ++k) { + for 
(auto j = lo.y; j <= hi.y; ++j) {
+        for (auto i = lo.x; i <= hi.x; ++i) {
+
+            int index = floor((lap(i,j,k,n) + hbinwidth)/binwidth);
+            index += halfbin;
+
+            if( index >=0 && index <= nbins) {
+                bins[index] += 1;
+                totbin++;
+            }
+
+            count++;
+
+        }
+        }
+        }
+        }
+
+    } // end MFIter
+
+    ParallelDescriptor::ReduceRealSum(bins.dataPtr(),nbins+1);
+    ParallelDescriptor::ReduceLongSum(count);
+    ParallelDescriptor::ReduceLongSum(totbin);
+    Print() << "Points outside of range "<< count - totbin << " " <<
+        (double)(count-totbin)/count << std::endl;
+
+    // print out contents of bins to the screen
+    for (int i=0; i& vel = vel_grown.array(mfi);
+        const Array4<Real>& lap = laplacian.array(mfi);
+
+        for (auto n=0; n<3; ++n) {
+        for (auto k = lo.z; k <= hi.z; ++k) {
+        for (auto j = lo.y; j <= hi.y; ++j) {
+        for (auto i = lo.x; i <= hi.x; ++i) {
+
+            // centered second-difference (negative) Laplacian in each direction
+            lap(i,j,k,n) = -(vel(i+1,j,k,n) - 2.*vel(i,j,k,n) + vel(i-1,j,k,n)) / (dx[0]*dx[0])
+                           -(vel(i,j+1,k,n) - 2.*vel(i,j,k,n) + vel(i,j-1,k,n)) / (dx[1]*dx[1])
+                           -(vel(i,j,k+1,n) - 2.*vel(i,j,k,n) + vel(i,j,k-1,n)) / (dx[2]*dx[2]);
+        }
+        }
+        }
+        }
+
+    } // end MFIter
+
+    // copy lap into vel_grown
+    Copy(vel_grown,laplacian,0,0,3,0);
+
+    // fill ghost cells of vel_grown
+    vel_grown.FillBoundary(geom.periodicity());
+
+    } // end loop
+    ////////////////////////////////////////////////////////////////////////
+    ////////////// velocity Laplacian PDFs /////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////
+
+    ////////////////////////////////////////////////////////////////////////
+    ///////////////////////// scalar PDFs //////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////
+    MultiFab scalar(ba,dmap,3,0);
+    Copy(scalar,mf,rho_ind,0,1,0);
+    Copy(scalar,mf,press_ind,1,1,0);
+    Copy(scalar,mf,temp_ind,2,1,0);
+
+    // compute spatial mean
+    Real mean_rho   = scalar.sum(0) / (ncells);
+    Real mean_press = scalar.sum(1) / (ncells);
+    Real mean_temp  = scalar.sum(2) / (ncells);
+
+    // get fluctuations
+    scalar.plus(-1.0*mean_rho,   0, 1);
+    scalar.plus(-1.0*mean_press, 1, 1);
+    scalar.plus(-1.0*mean_temp,  2, 1);
+
+    // get rms
+    Real rms_rho   = scalar.norm2(0) / sqrt(ncells);
+    Real rms_press = scalar.norm2(1) / sqrt(ncells);
+    Real rms_temp  = scalar.norm2(2) / sqrt(ncells);
+
+    // scale by rms
+    scalar.mult(1.0/rms_rho,   0, 1);
+    scalar.mult(1.0/rms_press, 1, 1);
+    scalar.mult(1.0/rms_temp,  2, 1);
+
+    // now compute pdfs
+    for (int m = 0; m < 3; ++m) {
+
+        Vector<Real> bins(nbins+1,0.);
+
+        int halfbin = nbins/2;
+        Real hbinwidth = range/nbins;
+        Real binwidth = 2.*range/nbins;
+        amrex::Long count=0;
+        amrex::Long totbin=0;
+        for (int ind=0 ; ind < nbins+1; ind++) bins[ind]=0;
+
+        for ( MFIter mfi(scalar,false); mfi.isValid(); ++mfi ) {
+
+            const Box& bx = mfi.validbox();
+            const auto lo = amrex::lbound(bx);
+            const auto hi = amrex::ubound(bx);
+
+            const Array4<Real>& sca = scalar.array(mfi);
+
+            for (auto k = lo.z; k <= hi.z; ++k) {
+            for (auto j = lo.y; j <= hi.y; ++j) {
+            for (auto i = lo.x; i <= hi.x; ++i) {
+
+                int index = floor((sca(i,j,k,m) + hbinwidth)/binwidth);
+                index += halfbin;
+
+                if( index >=0 && index <= nbins) {
+                    bins[index] += 1;
+                    totbin++;
+                }
+
+                count++;
+
+            }
+            }
+            }
+
+        } // end MFIter
+
+        ParallelDescriptor::ReduceRealSum(bins.dataPtr(),nbins+1);
+        ParallelDescriptor::ReduceLongSum(count);
+        ParallelDescriptor::ReduceLongSum(totbin);
+        Print() << "Points outside of range "<< count - totbin << " " <<
+            (double)(count-totbin)/count << std::endl;
+
+        // print out contents of
bins to the screen + for (int i=0; i +#include + +#include +#include +#include + +using namespace amrex; +using namespace std; + +static +void +PrintUsage (const char* progName) +{ + Print() << std::endl + << "This utility computes PDF of vorticity and divergence, and various powers of Laplacian of solenoidal and dilatational velocity field," << std::endl; + + Print() << "Usage:" << '\n'; + Print() << progName << " " << std::endl + << "OR" << std::endl + << progName << std::endl + << " step=" << std::endl + << " nbins= " << std::endl + << " range= " << std::endl + << std::endl; + + exit(1); +} + + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc,argv); + + { + + if (argc == 1) { + PrintUsage(argv[0]); + } + + ParmParse pp; + + int step; + pp.query("step",step); + + std::string iFile = amrex::Concatenate("vel_grad_decomp",step,9); + + Vector scalar_out(5); + scalar_out[0] = amrex::Concatenate("div_pdf",step,9); + scalar_out[1] = amrex::Concatenate("vortx_pdf",step,9); + scalar_out[2] = amrex::Concatenate("vorty_pdf",step,9); + scalar_out[3] = amrex::Concatenate("vortz_pdf",step,9); + scalar_out[4] = amrex::Concatenate("vort_pdf",step,9); + Vector Lap_out_sol(5); + Lap_out_sol[0] = amrex::Concatenate("L0_pdf_sol",step,9); + Lap_out_sol[1] = amrex::Concatenate("L1_pdf_sol",step,9); + Lap_out_sol[2] = amrex::Concatenate("L2_pdf_sol",step,9); + Lap_out_sol[3] = amrex::Concatenate("L3_pdf_sol",step,9); + Lap_out_sol[4] = amrex::Concatenate("L4_pdf_sol",step,9); + Vector Lap_out_dil(5); + Lap_out_dil[0] = amrex::Concatenate("L0_pdf_dil",step,9); + Lap_out_dil[1] = amrex::Concatenate("L1_pdf_dil",step,9); + Lap_out_dil[2] = amrex::Concatenate("L2_pdf_dil",step,9); + Lap_out_dil[3] = amrex::Concatenate("L3_pdf_dil",step,9); + Lap_out_dil[4] = amrex::Concatenate("L4_pdf_dil",step,9); + + int nbins; + pp.get("nbins", nbins); + + Real range; + pp.get("range",range); + + amrex::Print() << "Reading from vel_grad_decomp plotfile " << iFile << "\n"; + + // for the Header + std::string iFile2 = iFile; + iFile2 += "/Header"; + + // open header + ifstream x; + x.open(iFile2.c_str(), ios::in); + + // read in first line of header (typically "HyperCLaw-V1.1" or similar) + std::string str; + x >> str; + + // read in number of components from header + int ncomp; + x >> ncomp; + + // read in variable names from header + int flag = 0; + int vort_ind, div_ind, velx_sol_ind, vely_sol_ind, velz_sol_ind, velx_dil_ind, vely_dil_ind, velz_dil_ind; + for (int n=0; n> str; + if (str == "vort") vort_ind = flag; + if (str == "div") div_ind = flag; + if (str == "ux_s") velx_sol_ind = flag; + if (str == "uy_s") vely_sol_ind = flag; + if (str == "uz_s") velz_sol_ind = flag; + if (str == "ux_d") velx_dil_ind = flag; + if (str == "uy_d") vely_dil_ind = flag; + if (str == "uz_d") velz_dil_ind = flag; + flag ++; + } + + // read in dimensionality from header + int dim; + x >> dim; + + // read in time + Real time; + x >> time; + + // read in finest level + int finest_level; + x >> finest_level; + + // read in prob_lo and prob_hi + amrex::GpuArray prob_lo, prob_hi; + for (int i=0; i<3; ++i) { + x >> prob_lo[i]; + } + for (int i=0; i<3; ++i) { + x >> prob_hi[i]; + } + + // now read in the plotfile data + // check to see whether the user pointed to the plotfile base directory + // or the data itself + if (amrex::FileExists(iFile+"/Level_0/Cell_H")) { + iFile += "/Level_0/Cell"; + } + if (amrex::FileExists(iFile+"/Level_00/Cell_H")) { + iFile += "/Level_00/Cell"; + } + + // storage for the input coarse and fine MultiFabs + 
MultiFab mf; + + // read in plotfile mf to MultiFab + VisMF::Read(mf, iFile); + + // get BoxArray and DistributionMapping + BoxArray ba = mf.boxArray(); + DistributionMapping dmap = mf.DistributionMap(); + + // physical dimensions of problem + RealBox real_box({AMREX_D_DECL(prob_lo[0],prob_lo[1],prob_lo[2])}, + {AMREX_D_DECL(prob_hi[0],prob_hi[1],prob_hi[2])}); + + // single box with the enire domain + Box domain = ba.minimalBox().enclosedCells(); + + Real ncells = (double) domain.numPts(); + + // set to 1 (periodic) + Vector is_periodic(3,1); + + Geometry geom(domain,&real_box,CoordSys::cartesian,is_periodic.data()); + + const Real* dx = geom.CellSize(); + + //////////////////////////////////////////////////////////////////////// + ////////////// velocity Laplacian PDFs///////////// //////////////////// + //////////////////////////////////////////////////////////////////////// + MultiFab vel_grown(ba,dmap,6,1); + MultiFab vel_sol (ba,dmap,3,1); + MultiFab laplacian(ba,dmap,6,1); + + // copy shifted velocity components from mf into vel_grown + Copy(vel_grown,mf,velx_sol_ind,0,1,0); // sol + Copy(vel_grown,mf,vely_sol_ind,1,1,0); // sol + Copy(vel_grown,mf,velz_sol_ind,2,1,0); // sol + + Copy(laplacian,mf,velx_sol_ind,0,1,0); // sol + Copy(laplacian,mf,vely_sol_ind,1,1,0); // sol + Copy(laplacian,mf,velz_sol_ind,2,1,0); // sol + + Copy(vel_grown,mf,velx_dil_ind,3,1,0); // dil + Copy(vel_grown,mf,vely_dil_ind,4,1,0); // dil + Copy(vel_grown,mf,velz_dil_ind,5,1,0); // dil + + Copy(laplacian,mf,velx_dil_ind,3,1,0); // dil + Copy(laplacian,mf,vely_dil_ind,4,1,0); // dil + Copy(laplacian,mf,velz_dil_ind,5,1,0); // dil + + Copy(vel_sol,mf,velx_sol_ind,0,1,0); // sol + Copy(vel_sol,mf,vely_sol_ind,1,1,0); // sol + Copy(vel_sol,mf,velz_sol_ind,2,1,0); // sol + + // fill ghost cells of vel_grown + vel_grown.FillBoundary(geom.periodicity()); + laplacian.FillBoundary(geom.periodicity()); + vel_sol .FillBoundary(geom.periodicity()); + + for (int m=0; m<5; ++m) { + + Vector L2(6,0.); + for (int i=0; i<6; i++) + L2[i]=0.; + + for ( MFIter mfi(laplacian,false); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.validbox(); + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + + const Array4& lap = laplacian.array(mfi); + + for (auto n=0; n<6; ++n) { + for (auto k = lo.z; k <= hi.z; ++k) { + for (auto j = lo.y; j <= hi.y; ++j) { + for (auto i = lo.x; i <= hi.x; ++i) { + + L2[n] += lap(i,j,k,n)*lap(i,j,k,n); + + } + } + } + } + + } // end MFIter + + ParallelDescriptor::ReduceRealSum(L2.dataPtr(),6); + amrex::Long totpts = domain.numPts(); + L2[0] = sqrt(L2[0]/totpts); + L2[1] = sqrt(L2[1]/totpts); + L2[2] = sqrt(L2[2]/totpts); + L2[3] = sqrt(L2[3]/totpts); + L2[4] = sqrt(L2[4]/totpts); + L2[5] = sqrt(L2[5]/totpts); + Print() << "L2 norm of Laplacian (solenoidal) to power " << m << " is " << L2[0] + << " " << L2[1] << " " << L2[2] << " " << std::endl; + Print() << "L2 norm of Laplacian (dilational) to power " << m << " is " << L2[3] + << " " << L2[4] << " " << L2[5] << " " << std::endl; + + for ( MFIter mfi(laplacian,false); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.validbox(); + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + + const Array4& lap = laplacian.array(mfi); + + for (auto n=0; n<6; ++n) { + for (auto k = lo.z; k <= hi.z; ++k) { + for (auto j = lo.y; j <= hi.y; ++j) { + for (auto i = lo.x; i <= hi.x; ++i) { + + lap(i,j,k,n) = lap(i,j,k,n)/L2[n]; + + } + } + } + } + + } // end MFIter + + Vector bins_sol(nbins+1,0.); + Vector bins_dil(nbins+1,0.); 
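+
+    // Binning convention used below (a sketch of the index math, not part of
+    // the original comments): after normalizing by the rms, a value x in
+    // [-range, range] lands in
+    //     index = floor((x + hbinwidth)/binwidth) + halfbin,
+    // where binwidth = 2*range/nbins and hbinwidth = binwidth/2, so bin
+    // `halfbin` is centered on x = 0. Example: nbins=100, range=5 gives
+    // binwidth=0.1, and x=0.26 maps to floor(3.1)+50 = 53. Values beyond
+    // roughly +/-(range + hbinwidth) fall outside 0..nbins and are counted
+    // only in `count`, not in any bin.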
+
+    int halfbin = nbins/2;
+    Real hbinwidth = range/nbins;
+    Real binwidth = 2.*range/nbins;
+    amrex::Long count_sol=0;
+    amrex::Long totbin_sol=0;
+    amrex::Long count_dil=0;
+    amrex::Long totbin_dil=0;
+    for (int ind=0 ; ind < nbins+1; ind++) bins_sol[ind]=0;
+    for (int ind=0 ; ind < nbins+1; ind++) bins_dil[ind]=0;
+
+    for ( MFIter mfi(laplacian,false); mfi.isValid(); ++mfi ) {
+
+        const Box& bx = mfi.validbox();
+        const auto lo = amrex::lbound(bx);
+        const auto hi = amrex::ubound(bx);
+
+        const Array4<Real>& lap = laplacian.array(mfi);
+
+        for (auto n=0; n<3; ++n) {
+        for (auto k = lo.z; k <= hi.z; ++k) {
+        for (auto j = lo.y; j <= hi.y; ++j) {
+        for (auto i = lo.x; i <= hi.x; ++i) {
+
+            int index = floor((lap(i,j,k,n) + hbinwidth)/binwidth);
+            index += halfbin;
+
+            if( index >=0 && index <= nbins) {
+                bins_sol[index] += 1;
+                totbin_sol++;
+            }
+
+            count_sol++;
+
+        }
+        }
+        }
+        }
+
+        for (auto n=3; n<6; ++n) {
+        for (auto k = lo.z; k <= hi.z; ++k) {
+        for (auto j = lo.y; j <= hi.y; ++j) {
+        for (auto i = lo.x; i <= hi.x; ++i) {
+
+            int index = floor((lap(i,j,k,n) + hbinwidth)/binwidth);
+            index += halfbin;
+
+            if( index >=0 && index <= nbins) {
+                bins_dil[index] += 1;
+                totbin_dil++;
+            }
+
+            count_dil++;
+
+        }
+        }
+        }
+        }
+
+    } // end MFIter
+
+    ParallelDescriptor::ReduceRealSum(bins_sol.dataPtr(),nbins+1);
+    ParallelDescriptor::ReduceLongSum(count_sol);
+    ParallelDescriptor::ReduceLongSum(totbin_sol);
+    ParallelDescriptor::ReduceRealSum(bins_dil.dataPtr(),nbins+1);
+    ParallelDescriptor::ReduceLongSum(count_dil);
+    ParallelDescriptor::ReduceLongSum(totbin_dil);
+    Print() << "Points outside of range (solenoidal) "<< count_sol - totbin_sol << " " <<
+        (double)(count_sol-totbin_sol)/count_sol << std::endl;
+    Print() << "Points outside of range (dilational) "<< count_dil - totbin_dil << " " <<
+        (double)(count_dil-totbin_dil)/count_dil << std::endl;
+
+    // print out contents of bins to the screen
+    for (int i=0; i& vel = vel_grown.array(mfi);
+        const Array4<Real>& lap = laplacian.array(mfi);
+
+        for (auto n=0; n<6; ++n) {
+        for (auto k = lo.z; k <= hi.z; ++k) {
+        for (auto j = lo.y; j <= hi.y; ++j) {
+        for (auto i = lo.x; i <= hi.x; ++i) {
+
+            // centered second-difference (negative) Laplacian in each direction
+            lap(i,j,k,n) = -(vel(i+1,j,k,n) - 2.*vel(i,j,k,n) + vel(i-1,j,k,n)) / (dx[0]*dx[0])
+                           -(vel(i,j+1,k,n) - 2.*vel(i,j,k,n) + vel(i,j-1,k,n)) / (dx[1]*dx[1])
+                           -(vel(i,j,k+1,n) - 2.*vel(i,j,k,n) + vel(i,j,k-1,n)) / (dx[2]*dx[2]);
+        }
+        }
+        }
+        }
+
+    } // end MFIter
+
+    // copy lap into vel_grown
+    Copy(vel_grown,laplacian,0,0,6,0);
+
+    // fill ghost cells of vel_grown
+    vel_grown.FillBoundary(geom.periodicity());
+
+    } // end loop
+    ////////////////////////////////////////////////////////////////////////
+    ////////////// velocity Laplacian PDFs //////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////
+
+    ////////////////////////////////////////////////////////////////////////
+    ///////////////////////// scalar PDFs ///////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////
+    MultiFab scalar(ba,dmap,4,0); // div, vort_x, vort_y, vort_z
+    scalar.setVal(0.0);
+    Copy(scalar,mf,div_ind,0,1,0);
+
+    // Compute vorticity components and store in scalar
+    for ( MFIter mfi(vel_sol,false); mfi.isValid(); ++mfi ) {
+
+        const Box& bx = mfi.validbox();
+        const auto lo = amrex::lbound(bx);
+        const auto hi = amrex::ubound(bx);
+
+        Array4<const Real> const& sol = vel_sol.array(mfi);
+        Array4<Real>       const& sca = scalar .array(mfi);
+
+        for (auto k = lo.z; k <= hi.z; ++k) {
+        for (auto j = lo.y; j <= hi.y; ++j) {
+        for (auto i = lo.x; i <= hi.x; ++i) {
+            // dw/dy - dv/dz  (vel_sol components: 0=u, 1=v, 2=w)
+            sca(i,j,k,1) =
+                (sol(i,j+1,k,2) - sol(i,j-1,k,2)) / (2.*dx[1]) -
+                (sol(i,j,k+1,1) - sol(i,j,k-1,1)) / (2.*dx[2]);
+
+            // dv/dx - du/dy
+            sca(i,j,k,2) =
+                (sol(i+1,j,k,1) - sol(i-1,j,k,1)) / (2.*dx[0]) -
+                (sol(i,j+1,k,0) - sol(i,j-1,k,0)) / (2.*dx[1]);
+
+            // du/dz - dw/dx
+            sca(i,j,k,3) =
+                (sol(i,j,k+1,0) - sol(i,j,k-1,0)) / (2.*dx[2]) -
+                (sol(i+1,j,k,2) - sol(i-1,j,k,2)) / (2.*dx[0]);
+
+        }
+        }
+        }
+    }
+
+    // compute spatial mean
+    Real mean_div   = scalar.sum(0) / (ncells);
+    Real mean_vortx = scalar.sum(1) / (ncells);
+    Real mean_vorty = scalar.sum(2) / (ncells);
+    Real mean_vortz = scalar.sum(3) / (ncells);
+
+    // get fluctuations
+    scalar.plus(-1.0*mean_div,   0, 1);
+    scalar.plus(-1.0*mean_vortx, 1, 1);
+    scalar.plus(-1.0*mean_vorty, 2, 1);
+    scalar.plus(-1.0*mean_vortz, 3, 1);
+
+    // get rms
+    Real rms_div   = scalar.norm2(0) / sqrt(ncells);
+    Real rms_vortx = scalar.norm2(1) / sqrt(ncells);
+    Real rms_vorty = scalar.norm2(2) / sqrt(ncells);
+    Real rms_vortz = scalar.norm2(3) / sqrt(ncells);
+
+    // scale by rms
+    scalar.mult(1.0/rms_div,   0, 1);
+    scalar.mult(1.0/rms_vortx, 1, 1);
+    scalar.mult(1.0/rms_vorty, 2, 1);
+    scalar.mult(1.0/rms_vortz, 3, 1);
+
+    // now compute pdfs
+    for (int m = 0; m < 4; ++m) {
+
+        Vector<Real> bins(nbins+1,0.);
+
+        int halfbin = nbins/2;
+        Real hbinwidth = range/nbins;
+        Real binwidth = 2.*range/nbins;
+        amrex::Long count=0;
+        amrex::Long totbin=0;
+        for (int ind=0 ; ind < nbins+1; ind++) bins[ind]=0;
+
+        for ( MFIter mfi(scalar,false); mfi.isValid(); ++mfi ) {
+
+            const Box& bx = mfi.validbox();
+            const auto lo = amrex::lbound(bx);
+            const auto hi = amrex::ubound(bx);
+
+            const Array4<Real>& sca = scalar.array(mfi);
+
+            for (auto k = lo.z; k <= hi.z; ++k) {
+            for (auto j = lo.y; j <= hi.y; ++j) {
+            for (auto i = lo.x; i <= hi.x; ++i) {
+
+                int index = floor((sca(i,j,k,m) + hbinwidth)/binwidth);
+                index += halfbin;
+
+                if( index >=0 && index <= nbins) {
+                    bins[index] += 1;
+                    totbin++;
+                }
+
+                count++;
+
+            }
+            }
+            }
+
+        } // end MFIter
+
+        ParallelDescriptor::ReduceRealSum(bins.dataPtr(),nbins+1);
+        ParallelDescriptor::ReduceLongSum(count);
+        ParallelDescriptor::ReduceLongSum(totbin);
+        Print() << "Points outside of range "<< count - totbin << " " <<
(double)(count-totbin)/count << std::endl; + + // print out contents of bins to the screen + for (int i=0; i bins(nbins+1,0.); + + int halfbin = nbins/2; + Real hbinwidth = range/nbins; + Real binwidth = 2.*range/nbins; + amrex::Long count=0; + amrex::Long totbin=0; + for (int ind=0 ; ind < nbins+1; ind++) bins[ind]=0; + + for ( MFIter mfi(vel_decomp,false); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.validbox(); + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + + const Array4& vel = vel_decomp.array(mfi); + + for (auto n = 0; n < 3; ++n) { + for (auto k = lo.z; k <= hi.z; ++k) { + for (auto j = lo.y; j <= hi.y; ++j) { + for (auto i = lo.x; i <= hi.x; ++i) { + + int index = floor((vel(i,j,k,n) + hbinwidth)/binwidth); + index += halfbin; + + if( index >=0 && index <= nbins) { + bins[index] += 1; + totbin++; + } + + count++; + + } + } + } + } + + } // end MFIter + + ParallelDescriptor::ReduceRealSum(bins.dataPtr(),nbins+1); + ParallelDescriptor::ReduceLongSum(count); + ParallelDescriptor::ReduceLongSum(totbin); + Print() << "Points outside of range "<< count - totbin << " " << + (double)(count-totbin)/count << std::endl; + + // print out contents of bins to the screen + for (int i=0; i bins(nbins+1,0.); + + int halfbin = nbins/2; + Real hbinwidth = range/nbins; + Real binwidth = 2.*range/nbins; + amrex::Long count=0; + amrex::Long totbin=0; + for (int ind=0 ; ind < nbins+1; ind++) bins[ind]=0; + + for ( MFIter mfi(vel_decomp,false); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.validbox(); + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + + const Array4& vel = vel_decomp.array(mfi); + + for (auto n = 3; n < 6; ++n) { + for (auto k = lo.z; k <= hi.z; ++k) { + for (auto j = lo.y; j <= hi.y; ++j) { + for (auto i = lo.x; i <= hi.x; ++i) { + + int index = floor((vel(i,j,k,n) + hbinwidth)/binwidth); + index += halfbin; + + if( index >=0 && index <= nbins) { + bins[index] += 1; + totbin++; + } + + count++; + + } + } + } + } + + } // end MFIter + + ParallelDescriptor::ReduceRealSum(bins.dataPtr(),nbins+1); + ParallelDescriptor::ReduceLongSum(count); + ParallelDescriptor::ReduceLongSum(totbin); + Print() << "Points outside of range "<< count - totbin << " " << + (double)(count-totbin)/count << std::endl; + + // print out contents of bins to the screen + for (int i=0; i +#include + +#include +#include +#include + +using namespace amrex; +using namespace std; + +static +void +PrintUsage (const char* progName) +{ + Print() << std::endl + << "This utility computes PDF of vorticity and divergence, and various thermodynamic scalars," << std::endl; + + Print() << "Usage:" << '\n'; + Print() << progName << " " << std::endl + << "OR" << std::endl + << progName << std::endl + << " steps=" << std::endl + << " nbins= " << std::endl + << " range= " << std::endl + << std::endl; + + exit(1); +} + + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc,argv); + + { + + if (argc == 1) { + PrintUsage(argv[0]); + } + + ParmParse pp; + + std::vector steps; + pp.queryarr("steps",steps); + int nsteps = steps.size(); + Print() << "number of steps to process: " << nsteps << std::endl; + + Vector scalar_out(5); + scalar_out[0] = amrex::Concatenate("div_pdf_",steps[0],9); + scalar_out[0] = scalar_out[0] + "_"; + scalar_out[0] = amrex::Concatenate(scalar_out[0],steps[nsteps-1],9); + scalar_out[1] = amrex::Concatenate("vortx_pdf_",steps[0],9); + scalar_out[1] = scalar_out[1] + "_"; + scalar_out[1] = amrex::Concatenate(scalar_out[1],steps[nsteps-1],9); + 
scalar_out[2] = amrex::Concatenate("vorty_pdf_",steps[0],9); + scalar_out[2] = scalar_out[2] + "_"; + scalar_out[2] = amrex::Concatenate(scalar_out[2],steps[nsteps-1],9); + scalar_out[3] = amrex::Concatenate("vortz_pdf_",steps[0],9); + scalar_out[3] = scalar_out[3] + "_"; + scalar_out[3] = amrex::Concatenate(scalar_out[3],steps[nsteps-1],9); + scalar_out[4] = amrex::Concatenate("vort_pdf_",steps[0],9); + scalar_out[4] = scalar_out[4] + "_"; + scalar_out[4] = amrex::Concatenate(scalar_out[4],steps[nsteps-1],9); + + int nbins; + pp.get("nbins", nbins); + + Real range; + pp.get("range",range); + + Vector > bins; + Vector count(5,0); + Vector totbin(5,0); + for (int i=0; i<5; ++i) { + bins.push_back(Vector (nbins+1,0.)); + } + + int halfbin = nbins/2; + Real hbinwidth = range/nbins; + Real binwidth = 2.*range/nbins; + + for (int step=0; step> str; + + // read in number of components from header + int ncomp; + x >> ncomp; + + // read in variable names from header + int flag = 0; + int vort_ind, div_ind, velx_sol_ind, vely_sol_ind, velz_sol_ind, velx_dil_ind, vely_dil_ind, velz_dil_ind; + for (int n=0; n> str; + if (str == "vort") vort_ind = flag; + if (str == "div") div_ind = flag; + if (str == "ux_s") velx_sol_ind = flag; + if (str == "uy_s") vely_sol_ind = flag; + if (str == "uz_s") velz_sol_ind = flag; + flag ++; + } + + // read in dimensionality from header + int dim; + x >> dim; + + // read in time + Real time; + x >> time; + + // read in finest level + int finest_level; + x >> finest_level; + + // read in prob_lo and prob_hi + amrex::GpuArray prob_lo, prob_hi; + for (int i=0; i<3; ++i) { + x >> prob_lo[i]; + } + for (int i=0; i<3; ++i) { + x >> prob_hi[i]; + } + + // now read in the plotfile data + // check to see whether the user pointed to the plotfile base directory + // or the data itself + if (amrex::FileExists(iFile+"/Level_0/Cell_H")) { + iFile += "/Level_0/Cell"; + } + if (amrex::FileExists(iFile+"/Level_00/Cell_H")) { + iFile += "/Level_00/Cell"; + } + + // storage for the input coarse and fine MultiFabs + MultiFab mf; + + // read in plotfile mf to MultiFab + VisMF::Read(mf, iFile); + + // get BoxArray and DistributionMapping + BoxArray ba = mf.boxArray(); + DistributionMapping dmap = mf.DistributionMap(); + + // physical dimensions of problem + RealBox real_box({AMREX_D_DECL(prob_lo[0],prob_lo[1],prob_lo[2])}, + {AMREX_D_DECL(prob_hi[0],prob_hi[1],prob_hi[2])}); + + // single box with the enire domain + Box domain = ba.minimalBox().enclosedCells(); + + Real ncells = (double) domain.numPts(); + + // set to 1 (periodic) + Vector is_periodic(3,1); + + Geometry geom(domain,&real_box,CoordSys::cartesian,is_periodic.data()); + + const Real* dx = geom.CellSize(); + + //////////////////////////////////////////////////////////////////////// + ////////////// velocity Laplacian PDFs///////////// //////////////////// + //////////////////////////////////////////////////////////////////////// + MultiFab vel_grown(ba,dmap,6,1); + MultiFab vel_sol (ba,dmap,3,1); + + // copy shifted velocity components from mf into vel_grown + Copy(vel_grown,mf,velx_sol_ind,0,1,0); // sol + Copy(vel_grown,mf,vely_sol_ind,1,1,0); // sol + Copy(vel_grown,mf,velz_sol_ind,2,1,0); // sol + + Copy(vel_grown,mf,velx_dil_ind,3,1,0); // dil + Copy(vel_grown,mf,vely_dil_ind,4,1,0); // dil + Copy(vel_grown,mf,velz_dil_ind,5,1,0); // dil + + Copy(vel_sol,mf,velx_sol_ind,0,1,0); // sol + Copy(vel_sol,mf,vely_sol_ind,1,1,0); // sol + Copy(vel_sol,mf,velz_sol_ind,2,1,0); // sol + + // fill ghost cells of vel_grown + 
vel_grown.FillBoundary(geom.periodicity());
+            vel_sol  .FillBoundary(geom.periodicity());
+
+            ////////////////////////////////////////////////////////////////////////
+            ///////////////////////// scalar PDFs ///////////////////////////////////
+            ////////////////////////////////////////////////////////////////////////
+            MultiFab scalar(ba,dmap,4,0); // div, vort_x, vort_y, vort_z
+            scalar.setVal(0.0);
+            Copy(scalar,mf,div_ind,0,1,0);
+
+            // Compute vorticity components and store in scalar
+            for ( MFIter mfi(vel_sol,false); mfi.isValid(); ++mfi ) {
+
+                const Box& bx = mfi.validbox();
+                const auto lo = amrex::lbound(bx);
+                const auto hi = amrex::ubound(bx);
+
+                Array4<const Real> const& sol = vel_sol.array(mfi);
+                Array4<Real>       const& sca = scalar .array(mfi);
+
+                for (auto k = lo.z; k <= hi.z; ++k) {
+                for (auto j = lo.y; j <= hi.y; ++j) {
+                for (auto i = lo.x; i <= hi.x; ++i) {
+                    // dw/dy - dv/dz  (vel_sol components: 0=u, 1=v, 2=w)
+                    sca(i,j,k,1) =
+                        (sol(i,j+1,k,2) - sol(i,j-1,k,2)) / (2.*dx[1]) -
+                        (sol(i,j,k+1,1) - sol(i,j,k-1,1)) / (2.*dx[2]);
+
+                    // dv/dx - du/dy
+                    sca(i,j,k,2) =
+                        (sol(i+1,j,k,1) - sol(i-1,j,k,1)) / (2.*dx[0]) -
+                        (sol(i,j+1,k,0) - sol(i,j-1,k,0)) / (2.*dx[1]);
+
+                    // du/dz - dw/dx
+                    sca(i,j,k,3) =
+                        (sol(i,j,k+1,0) - sol(i,j,k-1,0)) / (2.*dx[2]) -
+                        (sol(i+1,j,k,2) - sol(i-1,j,k,2)) / (2.*dx[0]);
+
+                }
+                }
+                }
+            }
+
+            // compute spatial mean
+            Real mean_div   = scalar.sum(0) / (ncells);
+            Real mean_vortx = scalar.sum(1) / (ncells);
+            Real mean_vorty = scalar.sum(2) / (ncells);
+            Real mean_vortz = scalar.sum(3) / (ncells);
+
+            // get fluctuations
+            scalar.plus(-1.0*mean_div,   0, 1);
+            scalar.plus(-1.0*mean_vortx, 1, 1);
+            scalar.plus(-1.0*mean_vorty, 2, 1);
+            scalar.plus(-1.0*mean_vortz, 3, 1);
+
+            // get rms
+            Real rms_div   = scalar.norm2(0) / sqrt(ncells);
+            Real rms_vortx = scalar.norm2(1) / sqrt(ncells);
+            Real rms_vorty = scalar.norm2(2) / sqrt(ncells);
+            Real rms_vortz = scalar.norm2(3) / sqrt(ncells);
+
+            // scale by rms
+            scalar.mult(1.0/rms_div,   0, 1);
+            scalar.mult(1.0/rms_vortx, 1, 1);
+            scalar.mult(1.0/rms_vorty, 2, 1);
+            scalar.mult(1.0/rms_vortz, 3, 1);
+
+            // compute pdfs
+            for (int m = 0; m < 4; ++m) {
+
+                for ( MFIter mfi(scalar,false); mfi.isValid(); ++mfi ) {
+
+                    const Box& bx = mfi.validbox();
+                    const auto lo = amrex::lbound(bx);
+                    const auto hi = amrex::ubound(bx);
+
+                    const Array4<Real>& sca = scalar.array(mfi);
+
+                    for (auto k = lo.z; k <= hi.z; ++k) {
+                    for (auto j = lo.y; j <= hi.y; ++j) {
+                    for (auto i = lo.x; i <= hi.x; ++i) {
+
+                        int index = floor((sca(i,j,k,m) + hbinwidth)/binwidth);
+                        index += halfbin;
+
+                        if( index >=0 && index <= nbins) {
+                            bins[m][index] += 1;
+                            totbin[m]++;
+                        }
+
+                        count[m]++;
+
+                    }
+                    }
+                    }
+
+                } // end MFIter
+                ParallelDescriptor::ReduceRealSum(bins[m].dataPtr(),nbins+1);
+                ParallelDescriptor::ReduceLongSum(count[m]);
+                ParallelDescriptor::ReduceLongSum(totbin[m]);
+
+                Print() << "Points outside of range "<< count[m] - totbin[m] << " " <<
+                    (double)(count[m]-totbin[m])/count[m] << std::endl;
+            }
+
+            // compute pdfs: vorticity components pooled into one histogram
+            for ( MFIter mfi(scalar,false); mfi.isValid(); ++mfi ) {
+
+                const Box& bx = mfi.validbox();
+                const auto lo = amrex::lbound(bx);
+                const auto hi = amrex::ubound(bx);
+
+                const Array4<Real>& sca = scalar.array(mfi);
+
+                for (auto n = 1; n < 4; ++n) {
+                for (auto k = lo.z; k <= hi.z; ++k) {
+                for (auto j = lo.y; j <= hi.y; ++j) {
+                for (auto i = lo.x; i <= hi.x; ++i) {
+
+                    int index = floor((sca(i,j,k,n) + hbinwidth)/binwidth);
+ 
index += halfbin; + + if( index >=0 && index <= nbins) { + bins[4][index] += 1; + totbin[4]++; + } + + count[4]++; + + } + } + } + } + + } // end MFIter + ParallelDescriptor::ReduceRealSum(bins[4].dataPtr(),nbins+1); + ParallelDescriptor::ReduceLongSum(count[4]); + ParallelDescriptor::ReduceLongSum(totbin[4]); + + Print() << "Points outside of range "<< count[4] - totbin[4] << " " << + (double)(count[4]-totbin[4])/count[4] << std::endl; + + } // end nsteps + + // print out contents of bins to the screen + for (int m=0; m<5; ++m) { + for (int i=0; i bins(nbins+1,0.); +// +// int halfbin = nbins/2; +// Real hbinwidth = range/nbins; +// Real binwidth = 2.*range/nbins; +// amrex::Long count=0; +// amrex::Long totbin=0; +// for (int ind=0 ; ind < nbins+1; ind++) bins[ind]=0; +// +// for ( MFIter mfi(vel_decomp,false); mfi.isValid(); ++mfi ) { +// +// const Box& bx = mfi.validbox(); +// const auto lo = amrex::lbound(bx); +// const auto hi = amrex::ubound(bx); +// +// const Array4& vel = vel_decomp.array(mfi); +// +// for (auto n = 0; n < 3; ++n) { +// for (auto k = lo.z; k <= hi.z; ++k) { +// for (auto j = lo.y; j <= hi.y; ++j) { +// for (auto i = lo.x; i <= hi.x; ++i) { +// +// int index = floor((vel(i,j,k,n) + hbinwidth)/binwidth); +// index += halfbin; +// +// if( index >=0 && index <= nbins) { +// bins[index] += 1; +// totbin++; +// } +// +// count++; +// +// } +// } +// } +// } +// +// } // end MFIter +// +// ParallelDescriptor::ReduceRealSum(bins.dataPtr(),nbins+1); +// ParallelDescriptor::ReduceLongSum(count); +// ParallelDescriptor::ReduceLongSum(totbin); +// Print() << "Points outside of range "<< count - totbin << " " << +// (double)(count-totbin)/count << std::endl; +// +// // print out contents of bins to the screen +// for (int i=0; i bins(nbins+1,0.); +// +// int halfbin = nbins/2; +// Real hbinwidth = range/nbins; +// Real binwidth = 2.*range/nbins; +// amrex::Long count=0; +// amrex::Long totbin=0; +// for (int ind=0 ; ind < nbins+1; ind++) bins[ind]=0; +// +// for ( MFIter mfi(vel_decomp,false); mfi.isValid(); ++mfi ) { +// +// const Box& bx = mfi.validbox(); +// const auto lo = amrex::lbound(bx); +// const auto hi = amrex::ubound(bx); +// +// const Array4& vel = vel_decomp.array(mfi); +// +// for (auto n = 3; n < 6; ++n) { +// for (auto k = lo.z; k <= hi.z; ++k) { +// for (auto j = lo.y; j <= hi.y; ++j) { +// for (auto i = lo.x; i <= hi.x; ++i) { +// +// int index = floor((vel(i,j,k,n) + hbinwidth)/binwidth); +// index += halfbin; +// +// if( index >=0 && index <= nbins) { +// bins[index] += 1; +// totbin++; +// } +// +// count++; +// +// } +// } +// } +// } +// +// } // end MFIter +// +// ParallelDescriptor::ReduceRealSum(bins.dataPtr(),nbins+1); +// ParallelDescriptor::ReduceLongSum(count); +// ParallelDescriptor::ReduceLongSum(totbin); +// Print() << "Points outside of range "<< count - totbin << " " << +// (double)(count-totbin)/count << std::endl; +// +// // print out contents of bins to the screen +// for (int i=0; i +//#if defined(HEFFTE_FFTW) || defined(HEFFTE_CUFFT) || defined(HEFFTE_ROCFFT) // use heFFTe +//#include +//#elif defined(USE_DISTRIBUTED_FFT) // use single grid FFT +//#include +//#else // use single grid FFT +//#include +//#endif +#endif diff --git a/src_analysis/TurbSpectra.cpp b/src_analysis/TurbSpectra.cpp new file mode 100644 index 00000000..e69de29b diff --git a/src_analysis/TurbSpectra_distributed.H b/src_analysis/TurbSpectra_distributed.H new file mode 100644 index 00000000..41975655 --- /dev/null +++ b/src_analysis/TurbSpectra_distributed.H @@ -0,0 
+1,52 @@ +#ifndef _TurbSpectraDistributed_H_ +#define _TurbSpectraDistributed_H_ + +#include +#include +#include +#include + +#include + +#include + +#include "common_functions.H" + +#define ALIGN 16 + +using namespace amrex; + + +void IntegrateKScalar(const MultiFab& cov_mag, + const std::string& name, + const int& step, + const int& comp); +//void IntegrateKScalarHeffte(const BaseFab >& spectral_field, +// const std::string& name, const Real& scaling, +// const Box& c_local_box, +// const Real& sqrtnpts, +// const int& step); +void IntegrateKVelocity(const MultiFab& cov_mag, + const std::string& name, + const int& step, + const int& comp); +//void IntegrateKVelocityHeffte(const BaseFab >& spectral_fieldx, +// const BaseFab >& spectral_fieldy, +// const BaseFab >& spectral_fieldz, +// const std::string& name, const Real& scaling, +// const Box& c_local_box, +// const int& step); +void TurbSpectrumScalar(const MultiFab& variables, + const amrex::Geometry& geom, + const int& step, + const amrex::Vector& var_scaling, + const amrex::Vector< std::string >& var_names); +void TurbSpectrumVelDecomp(const MultiFab& vel, + MultiFab& vel_decomp, + const amrex::Geometry& geom, + const int& step, + const amrex::Real& var_scaling, + const amrex::Vector< std::string >& var_names); + + +#endif diff --git a/src_analysis/TurbSpectra_distributed.cpp b/src_analysis/TurbSpectra_distributed.cpp new file mode 100644 index 00000000..e628c5ab --- /dev/null +++ b/src_analysis/TurbSpectra_distributed.cpp @@ -0,0 +1,452 @@ +#include + +#include "TurbSpectra.H" +#include "common_functions.H" + +#include +#include "AMReX_PlotFileUtil.H" +#include "AMReX_BoxArray.H" + +void TurbSpectrumScalar(const MultiFab& variables, + const amrex::Geometry& geom, + const int& step, + const amrex::Vector& scaling, + const amrex::Vector< std::string >& var_names) +{ + BL_PROFILE_VAR("TurbSpectrumScalar()",TurbSpectrumScalar); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(variables.nComp() == var_names.size(), + "TurbSpectrumScalar: must have same number variable names as components of input MultiFab"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(variables.nComp() == scaling.size(), + "TurbSpectrumScalar: must have same number variable scaling as components of input MultiFab"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(variables.local_size() == 1, + "TurbSpectrumScalar: Must have one Box per MPI process"); + + int ncomp = variables.nComp(); + + Box domain = geom.Domain(); + auto npts = domain.numPts(); + Real sqrtnpts = std::sqrt(npts); + + amrex::FFT::R2C r2c(geom.Domain()); + + auto const& [cba, cdm] = r2c.getSpectralDataLayout(); + + MultiFab cov(cba, cdm, ncomp, 0); + + for (int comp=0; comp const& data = cov.array(mfi); + Array4 > spectral = cmf.const_array(mfi); + const Box& bx = mfi.validbox(); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + Real re = spectral(i,j,k).real(); + Real im = spectral(i,j,k).imag(); + data(i,j,k,comp_gpu) = (re*re + im*im)/(sqrtnpts_gpu*sqrtnpts_gpu*scaling_i_gpu); + }); + } + + // Integrate spectra over k-shells + IntegrateKScalar(cov,name_gpu,step,comp_gpu); + } +} + + +void TurbSpectrumVelDecomp(const MultiFab& vel, + MultiFab& vel_decomp, + const amrex::Geometry& geom, + const int& step, + const amrex::Real& scaling, + const amrex::Vector< std::string >& var_names) +{ + BL_PROFILE_VAR("TurbSpectrumVelDecomp()",TurbSpectrumVelDecomp); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(vel.nComp() == 3, + "TurbSpectrumVelDecomp: must have 3 components of input vel MultiFab"); + 
AMREX_ALWAYS_ASSERT_WITH_MESSAGE(var_names.size() == 3, + "TurbSpectrumVelDecomp: must have 3 names for output vel spectra (total, solenoidal, dilatational"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(vel.local_size() == 1, + "TurbSpectrumVelDecomp: Must have one Box per MPI process"); + + const GpuArray dx = geom.CellSizeArray(); + + Box domain = geom.Domain(); + auto npts = domain.numPts(); + Real sqrtnpts = std::sqrt(npts); + + // get box array and distribution map of vel + DistributionMapping dm = vel.DistributionMap(); + BoxArray ba = vel.boxArray(); + + amrex::FFT::R2C r2c(geom.Domain()); + + // box array and dmap for FFT + auto const& [cba, cdm] = r2c.getSpectralDataLayout(); + + // each MPI rank gets storage for its piece of the fft + cMultiFab spectral_field_Tx(cba,cdm,1,0); // totalx + cMultiFab spectral_field_Ty(cba,cdm,1,0); // totaly + cMultiFab spectral_field_Tz(cba,cdm,1,0); // totalz + cMultiFab spectral_field_Sx(cba,cdm,1,0); // solenoidalx + cMultiFab spectral_field_Sy(cba,cdm,1,0); // solenoidaly + cMultiFab spectral_field_Sz(cba,cdm,1,0); // solenoidalz + cMultiFab spectral_field_Dx(cba,cdm,1,0); // dilatationalx + cMultiFab spectral_field_Dy(cba,cdm,1,0); // dilatationaly + cMultiFab spectral_field_Dz(cba,cdm,1,0); // dilatationalz + + // ForwardTransform + // X + { + MultiFab vel_single(vel, amrex::make_alias, 0, 1); + r2c.forward(vel_single,spectral_field_Tx); + } + // Y + { + MultiFab vel_single(vel, amrex::make_alias, 1, 1); + r2c.forward(vel_single,spectral_field_Ty); + } + // Z + { + MultiFab vel_single(vel, amrex::make_alias, 2, 1); + r2c.forward(vel_single,spectral_field_Tz); + } + + // Decompose velocity field into solenoidal and dilatational + for (MFIter mfi(spectral_field_Tx); mfi.isValid(); ++mfi) { + Array4< GpuComplex > spectral_tx = spectral_field_Tx.array(mfi); + Array4< GpuComplex > spectral_ty = spectral_field_Ty.array(mfi); + Array4< GpuComplex > spectral_tz = spectral_field_Tz.array(mfi); + Array4< GpuComplex > spectral_sx = spectral_field_Sx.array(mfi); + Array4< GpuComplex > spectral_sy = spectral_field_Sy.array(mfi); + Array4< GpuComplex > spectral_sz = spectral_field_Sz.array(mfi); + Array4< GpuComplex > spectral_dx = spectral_field_Dx.array(mfi); + Array4< GpuComplex > spectral_dy = spectral_field_Dy.array(mfi); + Array4< GpuComplex > spectral_dz = spectral_field_Dz.array(mfi); + const Box& bx = mfi.validbox(); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept + { + + int nx = n_cells[0]; + int ny = n_cells[1]; + int nz = n_cells[2]; + + Real GxR = 0.0, GxC = 0.0, GyR = 0.0, GyC = 0.0, GzR = 0.0, GzC = 0.0; + + if (i <= nx/2) { + + // Get the wavevector + int ki = i; + int kj = j; + if (j >= ny/2) kj = ny - j; + int kk = k; + if (k >= nz/2) kk = nz - k; + + // Gradient Operators + GxR = (cos(2.0*M_PI*ki/nx)-1.0)/dx[0]; + GxC = (sin(2.0*M_PI*ki/nx)-0.0)/dx[0]; + GyR = (cos(2.0*M_PI*kj/ny)-1.0)/dx[1]; + GyC = (sin(2.0*M_PI*kj/ny)-0.0)/dx[1]; + GzR = (cos(2.0*M_PI*kk/nz)-1.0)/dx[2]; + GzC = (sin(2.0*M_PI*kk/nz)-0.0)/dx[2]; + + // Scale Total velocity FFT components + spectral_tx(i,j,k) *= (1.0/sqrtnpts); + spectral_ty(i,j,k) *= (1.0/sqrtnpts); + spectral_tz(i,j,k) *= (1.0/sqrtnpts); + + // Inverse Laplacian + Real Lap = GxR*GxR + GxC*GxC + GyR*GyR + GyC*GyC + GzR*GzR + GzC*GzC; + + // Divergence of vel + Real divR = spectral_tx(i,j,k).real()*GxR - spectral_tx(i,j,k).imag()*GxC + + spectral_ty(i,j,k).real()*GyR - spectral_ty(i,j,k).imag()*GyC + + spectral_tz(i,j,k).real()*GzR - spectral_tz(i,j,k).imag()*GzC ; + Real divC = 
spectral_tx(i,j,k).real()*GxC + spectral_tx(i,j,k).imag()*GxR + + spectral_ty(i,j,k).real()*GyC + spectral_ty(i,j,k).imag()*GyR + + spectral_tz(i,j,k).real()*GzC + spectral_tz(i,j,k).imag()*GzR ; + + if (Lap < 1.0e-12) { // zero mode for no bulk motion + spectral_dx(i,j,k) *= 0.0; + spectral_dy(i,j,k) *= 0.0; + spectral_dz(i,j,k) *= 0.0; + } + else { + + // Dilatational velocity + GpuComplex copy_dx((divR*GxR + divC*GxC) / Lap, + (divC*GxR - divR*GxC) / Lap); + spectral_dx(i,j,k) = copy_dx; + + GpuComplex copy_dy((divR*GyR + divC*GyC) / Lap, + (divC*GyR - divR*GyC) / Lap); + spectral_dy(i,j,k) = copy_dy; + + GpuComplex copy_dz((divR*GzR + divC*GzC) / Lap, + (divC*GzR - divR*GzC) / Lap); + spectral_dz(i,j,k) = copy_dz; + } + + // Solenoidal velocity + spectral_sx(i,j,k) = spectral_tx(i,j,k) - spectral_dx(i,j,k); + spectral_sy(i,j,k) = spectral_ty(i,j,k) - spectral_dy(i,j,k); + spectral_sz(i,j,k) = spectral_tz(i,j,k) - spectral_dz(i,j,k); + } + else { // conjugate + amrex::Abort("check the code; i should not go beyond bx.length(0)/2"); + } + + }); + } + + MultiFab cov(cba, cdm, 3, 0); // total, solenoidal, dilatational + + // Fill in the covariance multifab + Real scaling_gpu = scaling; + for (MFIter mfi(cov); mfi.isValid(); ++mfi) { + Array4 const& data = cov.array(mfi); + Array4 > spec_tx = spectral_field_Tx.const_array(mfi); + Array4 > spec_ty = spectral_field_Ty.const_array(mfi); + Array4 > spec_tz = spectral_field_Tz.const_array(mfi); + Array4 > spec_sx = spectral_field_Sx.const_array(mfi); + Array4 > spec_sy = spectral_field_Sy.const_array(mfi); + Array4 > spec_sz = spectral_field_Sz.const_array(mfi); + Array4 > spec_dx = spectral_field_Dx.const_array(mfi); + Array4 > spec_dy = spectral_field_Dy.const_array(mfi); + Array4 > spec_dz = spectral_field_Dz.const_array(mfi); + const Box& bx = mfi.validbox(); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + if (i <= n_cells[0]/2) { + Real re_x, re_y, re_z, im_x, im_y, im_z; + + re_x = spec_tx(i,j,k).real(); + im_x = spec_tx(i,j,k).imag(); + re_y = spec_ty(i,j,k).real(); + im_y = spec_ty(i,j,k).imag(); + re_z = spec_tz(i,j,k).real(); + im_z = spec_tz(i,j,k).imag(); + data(i,j,k,0) = (re_x*re_x + im_x*im_x + + re_y*re_y + im_y*im_y + + re_z*re_z + im_z*im_z)/(scaling_gpu); + re_x = spec_sx(i,j,k).real(); + im_x = spec_sx(i,j,k).imag(); + re_y = spec_sy(i,j,k).real(); + im_y = spec_sy(i,j,k).imag(); + re_z = spec_sz(i,j,k).real(); + im_z = spec_sz(i,j,k).imag(); + data(i,j,k,1) = (re_x*re_x + im_x*im_x + + re_y*re_y + im_y*im_y + + re_z*re_z + im_z*im_z)/(scaling_gpu); + re_x = spec_dx(i,j,k).real(); + im_x = spec_dx(i,j,k).imag(); + re_y = spec_dy(i,j,k).real(); + im_y = spec_dy(i,j,k).imag(); + re_z = spec_dz(i,j,k).real(); + im_z = spec_dz(i,j,k).imag(); + data(i,j,k,2) = (re_x*re_x + im_x*im_x + + re_y*re_y + im_y*im_y + + re_z*re_z + im_z*im_z)/(scaling_gpu); + } + else { + amrex::Abort("check the code; i should not go beyond n_cells[0]/2"); + } + }); + } + + // Integrate K spectrum for velocities + IntegrateKVelocity(cov,"vel_total" ,step,0); + IntegrateKVelocity(cov,"vel_solenoidal",step,1); + IntegrateKVelocity(cov,"vel_dilational",step,2); + + // inverse Fourier transform solenoidal and dilatational components + { + MultiFab vel_decomp_single(vel_decomp, amrex::make_alias, 0, 1); + r2c.backward(spectral_field_Sx,vel_decomp_single); + } + { + MultiFab vel_decomp_single(vel_decomp, amrex::make_alias, 1, 1); + r2c.backward(spectral_field_Sy,vel_decomp_single); + } + { + MultiFab 
vel_decomp_single(vel_decomp, amrex::make_alias, 2, 1); + r2c.backward(spectral_field_Sz,vel_decomp_single); + } + { + MultiFab vel_decomp_single(vel_decomp, amrex::make_alias, 3, 1); + r2c.backward(spectral_field_Dx,vel_decomp_single); + } + { + MultiFab vel_decomp_single(vel_decomp, amrex::make_alias, 4, 1); + r2c.backward(spectral_field_Dy,vel_decomp_single); + } + { + MultiFab vel_decomp_single(vel_decomp, amrex::make_alias, 5, 1); + r2c.backward(spectral_field_Dz,vel_decomp_single); + } + + vel_decomp.mult(1.0/sqrtnpts); + +} + +void IntegrateKScalar(const MultiFab& cov_mag, + const std::string& name, + const int& step, + const int& comp) + +{ + int npts = n_cells[0]/2; + + Gpu::DeviceVector phisum_device(npts, 0); + Gpu::DeviceVector phicnt_device(npts, 0); + Real* phisum_ptr = phisum_device.dataPtr(); // pointer to data + int* phicnt_ptr = phicnt_device.dataPtr(); // pointer to data + + int comp_gpu = comp; + int nx = n_cells[0]; + int ny = n_cells[1]; + int nz = n_cells[2]; + for ( MFIter mfi(cov_mag,TilingIfNotGPU()); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.tilebox(); + + const Array4 & cov = cov_mag.const_array(mfi); + + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + if (i <= n_cells[0]/2) { + int ki = i; + int kj = j; + if (j >= ny/2) kj = ny - j; + int kk = k; + if (k >= nz/2) kk = nz - k; + + Real dist = (ki*ki + kj*kj + kk*kk); + dist = std::sqrt(dist); + + if ( dist <= n_cells[0]/2-0.5) { + dist = dist+0.5; + int cell = int(dist); + amrex::Gpu::Atomic::Add(&(phisum_ptr[cell]), cov(i,j,k,comp_gpu)); + amrex::Gpu::Atomic::Add(&(phicnt_ptr[cell]),1); + } + } + else { + amrex::Abort("check the code; i should not go beyond n_cells[0]/2"); + } + }); + } + + Gpu::HostVector phisum_host(npts); + Gpu::HostVector phicnt_host(npts); + Gpu::copyAsync(Gpu::deviceToHost, phisum_device.begin(), phisum_device.end(), phisum_host.begin()); + Gpu::copyAsync(Gpu::deviceToHost, phicnt_device.begin(), phicnt_device.end(), phicnt_host.begin()); + Gpu::streamSynchronize(); + + ParallelDescriptor::ReduceRealSum(phisum_host.dataPtr(),npts); + ParallelDescriptor::ReduceIntSum(phicnt_host.dataPtr(),npts); + + Real dk = 1.; + for (int d = 1; d < npts; ++d) { + phisum_host[d] *= 4.*M_PI*(d*d*dk+dk*dk*dk/12.)/phicnt_host[d]; + } + + if (ParallelDescriptor::IOProcessor()) { + std::ofstream turb; + std::string turbBaseName = "turb_"+name; + std::string turbName = Concatenate(turbBaseName,step,7); + turbName += ".txt"; + + turb.open(turbName); + for (int d=1; d phisum_device(npts, 0); + Gpu::DeviceVector phicnt_device(npts, 0); + Real* phisum_ptr = phisum_device.dataPtr(); // pointer to data + int* phicnt_ptr = phicnt_device.dataPtr(); // pointer to data + + int comp_gpu = comp; + int nx = n_cells[0]; + int ny = n_cells[1]; + int nz = n_cells[2]; + for ( MFIter mfi(cov_mag,TilingIfNotGPU()); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.tilebox(); + + const Array4 & cov = cov_mag.const_array(mfi); + + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + if (i <= n_cells[0]/2) { + int ki = i; + int kj = j; + if (j >= ny/2) kj = ny - j; + int kk = k; + if (k >= nz/2) kk = nz - k; + + Real dist = (ki*ki + kj*kj + kk*kk); + dist = std::sqrt(dist); + + if ( dist <= n_cells[0]/2-0.5) { + dist = dist+0.5; + int cell = int(dist); + amrex::Gpu::Atomic::Add(&(phisum_ptr[cell]), cov(i,j,k,comp_gpu)); + amrex::Gpu::Atomic::Add(&(phicnt_ptr[cell]),1); + } + } + else { + amrex::Abort("check the code; i should not go beyond n_cells[0]/2"); + } + }); 
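+
+        // at this point every cell of the tile has been accumulated, via the
+        // device atomics above, into its radial shell: a mode is assigned to
+        // shell int(|k| + 0.5), i.e. the nearest integer wavenumber, and
+        // phicnt records how many modes landed in each shell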
+ } + + Gpu::HostVector phisum_host(npts); + Gpu::HostVector phicnt_host(npts); + Gpu::copyAsync(Gpu::deviceToHost, phisum_device.begin(), phisum_device.end(), phisum_host.begin()); + Gpu::copyAsync(Gpu::deviceToHost, phicnt_device.begin(), phicnt_device.end(), phicnt_host.begin()); + Gpu::streamSynchronize(); + + ParallelDescriptor::ReduceRealSum(phisum_host.dataPtr(),npts); + ParallelDescriptor::ReduceIntSum(phicnt_host.dataPtr(),npts); + + Real dk = 1.; + for (int d = 1; d < npts; ++d) { + phisum_host[d] *= 4.*M_PI*(d*d*dk+dk*dk*dk/12.)/phicnt_host[d]; + } + + if (ParallelDescriptor::IOProcessor()) { + std::ofstream turb; + std::string turbBaseName = "turb_"+name; + std::string turbName = Concatenate(turbBaseName,step,7); + turbName += ".txt"; + + turb.open(turbName); + for (int d=1; d + +#include +#include +#include +#include + +#include + +#include + +#include "common_functions.H" + +#define ALIGN 16 + +using namespace amrex; + +void IntegrateKScalar(const MultiFab& cov_mag, + const std::string& name, + const int& step, + const int& comp); +//void IntegrateKScalarHeffte(const BaseFab >& spectral_field, +// const std::string& name, const Real& scaling, +// const Box& c_local_box, +// const Real& sqrtnpts, +// const int& step); +void IntegrateKVelocity(const MultiFab& cov_mag, + const std::string& name, + const int& step, + const int& comp); +//void IntegrateKVelocityHeffte(const BaseFab >& spectral_fieldx, +// const BaseFab >& spectral_fieldy, +// const BaseFab >& spectral_fieldz, +// const std::string& name, const Real& scaling, +// const Box& c_local_box, +// const int& step); +void TurbSpectrumScalar(const MultiFab& variables, + const amrex::Geometry& geom, + const int& step, + const amrex::Vector& var_scaling, + const amrex::Vector< std::string >& var_names); +void TurbSpectrumVelDecomp(const MultiFab& vel, + MultiFab& vel_decomp, + const amrex::Geometry& geom, + const int& step, + const amrex::Real& var_scaling, + const amrex::Vector< std::string >& var_names); + + +#endif diff --git a/src_analysis/TurbSpectra_heffte.cpp b/src_analysis/TurbSpectra_heffte.cpp new file mode 100644 index 00000000..90cc2615 --- /dev/null +++ b/src_analysis/TurbSpectra_heffte.cpp @@ -0,0 +1,749 @@ +#include "TurbSpectra.H" +#include "common_functions.H" + +#include +#include "AMReX_PlotFileUtil.H" +#include "AMReX_BoxArray.H" + +void TurbSpectrumScalar(const MultiFab& variables, + const amrex::Geometry& geom, + const int& step, + const amrex::Vector& scaling, + const amrex::Vector< std::string >& var_names) +{ + BL_PROFILE_VAR("TurbSpectrumScalar()",TurbSpectrumScalar); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(variables.nComp() == var_names.size(), + "TurbSpectrumScalar: must have same number variable names as components of input MultiFab"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(variables.nComp() == scaling.size(), + "TurbSpectrumScalar: must have same number variable scaling as components of input MultiFab"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(variables.local_size() == 1, + "TurbSpectrumScalar: Must have one Box per MPI process when using heFFTe"); + + int ncomp = variables.nComp(); + + long npts; + Box domain = geom.Domain(); + npts = (domain.length(0)*domain.length(1)*domain.length(2)); + Real sqrtnpts = std::sqrt(npts); + + // get box array and distribution map of variables + DistributionMapping dm = variables.DistributionMap(); + BoxArray ba = variables.boxArray(); + + // since there is 1 MPI rank per box, each MPI rank obtains its local box and the associated boxid + Box local_box; + int local_boxid; + { 
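+        // with exactly one grid per MPI rank (asserted above), scanning the
+        // BoxArray for the entry this rank owns recovers both the local Box
+        // and its index in the global BoxArray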
+ for (int i = 0; i < ba.size(); ++i) { + Box b = ba[i]; + // each MPI rank has its own local_box Box and local_boxid ID + if (ParallelDescriptor::MyProc() == dm[i]) { + local_box = b; + local_boxid = i; + } + } + } + + // now each MPI rank works on its own box + // for real->complex fft's, the fft is stored in an (nx/2+1) x ny x nz dataset + + // start by coarsening each box by 2 in the x-direction + Box c_local_box = amrex::coarsen(local_box, IntVect(AMREX_D_DECL(2,1,1))); + + // if the coarsened box's high-x index is even, we shrink the size in 1 in x + // this avoids overlap between coarsened boxes + if (c_local_box.bigEnd(0) * 2 == local_box.bigEnd(0)) { + c_local_box.setBig(0,c_local_box.bigEnd(0)-1); + } + // for any boxes that touch the hi-x domain we + // increase the size of boxes by 1 in x + // this makes the overall fft dataset have size (Nx/2+1 x Ny x Nz) + if (local_box.bigEnd(0) == geom.Domain().bigEnd(0)) { + c_local_box.growHi(0,1); + } + + // BOX ARRAY TO STORE COVARIANCE MATRIX IN A MFAB + // create a BoxArray containing the fft boxes + // by construction, these boxes correlate to the associated spectral_data + // this we can copy the spectral data into this multifab since we know they are owned by the same MPI rank + BoxArray fft_ba; + { + BoxList bl; + bl.reserve(ba.size()); + + for (int i = 0; i < ba.size(); ++i) { + Box b = ba[i]; + + Box r_box = b; + Box c_box = amrex::coarsen(r_box, IntVect(AMREX_D_DECL(2,1,1))); + + // this avoids overlap for the cases when one or more r_box's + // have an even cell index in the hi-x cell + if (c_box.bigEnd(0) * 2 == r_box.bigEnd(0)) { + c_box.setBig(0,c_box.bigEnd(0)-1); + } + + // increase the size of boxes touching the hi-x domain by 1 in x + // this is an (Nx x Ny x Nz) -> (Nx/2+1 x Ny x Nz) real-to-complex sizing + if (b.bigEnd(0) == geom.Domain().bigEnd(0)) { + c_box.growHi(0,1); + } + bl.push_back(c_box); + + } + fft_ba.define(std::move(bl)); + } + MultiFab cov(fft_ba, dm, ncomp, 0); + + // each MPI rank gets storage for its piece of the fft + BaseFab > spectral_field(c_local_box, 1, The_Device_Arena()); + MultiFab variables_single(ba, dm, 1, 0); + using heffte_complex = typename heffte::fft_output::type; + + int r2c_direction = 0; + for (int comp=0; comp fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field.dataPtr(); + variables_single.ParallelCopy(variables,comp,0,1); + fft.forward(variables_single[local_boxid].dataPtr(),spectral_data); + Gpu::streamSynchronize(); + + // Fill in the covariance multifab + int comp_gpu = comp; + Real sqrtnpts_gpu = sqrtnpts; + Real scaling_i_gpu = scaling[comp]; + std::string name_gpu = var_names[comp]; + for (MFIter mfi(cov); mfi.isValid(); ++mfi) { + Array4 const& data = cov.array(mfi); + Array4 > spectral = spectral_field.const_array(); + const Box& bx = mfi.fabbox(); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + Real re = spectral(i,j,k).real(); + Real im = spectral(i,j,k).imag(); + data(i,j,k,comp_gpu) = (re*re + im*im)/(sqrtnpts_gpu*sqrtnpts_gpu*scaling_i_gpu); + }); + } + + 
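+        // cov now holds |u_hat(k)|^2 / (npts * scaling) for this component:
+        // sqrtnpts^2 = npts removes the unnormalized-FFT factor, and
+        // `scaling` is the caller-supplied variance normalization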
// Integrate spectra over k-shells + IntegrateKScalar(cov,name_gpu,step,comp_gpu); + } +} + +void TurbSpectrumVelDecomp(const MultiFab& vel, + MultiFab& vel_decomp, + const amrex::Geometry& geom, + const int& step, + const amrex::Real& scaling, + const amrex::Vector< std::string >& var_names) +{ + BL_PROFILE_VAR("TurbSpectrumVelDecomp()",TurbSpectrumVelDecomp); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(vel.nComp() == 3, + "TurbSpectrumVelDecomp: must have 3 components of input vel MultiFab"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(var_names.size() == 3, + "TurbSpectrumVelDecomp: must have 3 names for output vel spectra (total, solenoidal, dilatational"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(vel.local_size() == 1, + "TurbSpectrumVelDecomp: Must have one Box per MPI process when using heFFTe"); + + const GpuArray dx = geom.CellSizeArray(); + + long npts; + Box domain = geom.Domain(); + npts = (domain.length(0)*domain.length(1)*domain.length(2)); + Real sqrtnpts = std::sqrt(npts); + + // get box array and distribution map of vel + DistributionMapping dm = vel.DistributionMap(); + BoxArray ba = vel.boxArray(); + + // since there is 1 MPI rank per box, each MPI rank obtains its local box and the associated boxid + Box local_box; + int local_boxid; + { + for (int i = 0; i < ba.size(); ++i) { + Box b = ba[i]; + // each MPI rank has its own local_box Box and local_boxid ID + if (ParallelDescriptor::MyProc() == dm[i]) { + local_box = b; + local_boxid = i; + } + } + } + + // now each MPI rank works on its own box + // for real->complex fft's, the fft is stored in an (nx/2+1) x ny x nz dataset + + // start by coarsening each box by 2 in the x-direction + Box c_local_box = amrex::coarsen(local_box, IntVect(AMREX_D_DECL(2,1,1))); + + // if the coarsened box's high-x index is even, we shrink the size in 1 in x + // this avoids overlap between coarsened boxes + if (c_local_box.bigEnd(0) * 2 == local_box.bigEnd(0)) { + c_local_box.setBig(0,c_local_box.bigEnd(0)-1); + } + // for any boxes that touch the hi-x domain we + // increase the size of boxes by 1 in x + // this makes the overall fft dataset have size (Nx/2+1 x Ny x Nz) + if (local_box.bigEnd(0) == geom.Domain().bigEnd(0)) { + c_local_box.growHi(0,1); + } + + // each MPI rank gets storage for its piece of the fft + BaseFab > spectral_field_Tx(c_local_box, 1, The_Device_Arena()); // totalx + BaseFab > spectral_field_Ty(c_local_box, 1, The_Device_Arena()); // totaly + BaseFab > spectral_field_Tz(c_local_box, 1, The_Device_Arena()); // totalz + BaseFab > spectral_field_Sx(c_local_box, 1, The_Device_Arena()); // solenoidalx + BaseFab > spectral_field_Sy(c_local_box, 1, The_Device_Arena()); // solenoidaly + BaseFab > spectral_field_Sz(c_local_box, 1, The_Device_Arena()); // solenoidalz + BaseFab > spectral_field_Dx(c_local_box, 1, The_Device_Arena()); // dilatationalx + BaseFab > spectral_field_Dy(c_local_box, 1, The_Device_Arena()); // dilatationaly + BaseFab > spectral_field_Dz(c_local_box, 1, The_Device_Arena()); // dilatationalz + MultiFab vel_single(ba, dm, 1, 0); + + int r2c_direction = 0; + + // ForwardTransform + // X + using heffte_complex = typename heffte::fft_output::type; + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + 
{c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + vel_single.ParallelCopy(vel, 0, 0, 1); + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tx.dataPtr(); + fft.forward(vel_single[local_boxid].dataPtr(),spectral_data); + } + // Y + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + vel_single.ParallelCopy(vel, 1, 0, 1); + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Ty.dataPtr(); + fft.forward(vel_single[local_boxid].dataPtr(),spectral_data); + } + // Z + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + vel_single.ParallelCopy(vel, 2, 0, 1); + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tz.dataPtr(); + fft.forward(vel_single[local_boxid].dataPtr(),spectral_data); + } + + Gpu::streamSynchronize(); + + // Decompose velocity field into solenoidal and dilatational + Array4< GpuComplex > spectral_tx = spectral_field_Tx.array(); + Array4< GpuComplex > spectral_ty = spectral_field_Ty.array(); + Array4< GpuComplex > spectral_tz = spectral_field_Tz.array(); + Array4< GpuComplex > spectral_sx = spectral_field_Sx.array(); + Array4< GpuComplex > spectral_sy = spectral_field_Sy.array(); + Array4< GpuComplex > spectral_sz = spectral_field_Sz.array(); + Array4< GpuComplex > spectral_dx = spectral_field_Dx.array(); + Array4< GpuComplex > spectral_dy = spectral_field_Dy.array(); + Array4< GpuComplex > spectral_dz = spectral_field_Dz.array(); + ParallelFor(c_local_box, [=] AMREX_GPU_DEVICE(int i, int j, int k) + { + + int nx = n_cells[0]; + int ny = n_cells[1]; + int nz = n_cells[2]; + + Real GxR = 0.0, GxC = 0.0, GyR = 0.0, GyC = 0.0, GzR = 0.0, GzC = 0.0; + + if (i <= nx/2) { + + // Get the wavevector + int ki = i; + int kj = j; + if (j >= ny/2) kj = ny - j; + int kk = k; + if (k >= nz/2) kk = nz - k; + + // Gradient Operators + GxR = (cos(2.0*M_PI*ki/nx)-1.0)/dx[0]; + GxC = (sin(2.0*M_PI*ki/nx)-0.0)/dx[0]; + GyR = (cos(2.0*M_PI*kj/ny)-1.0)/dx[1]; + GyC = (sin(2.0*M_PI*kj/ny)-0.0)/dx[1]; + GzR = (cos(2.0*M_PI*kk/nz)-1.0)/dx[2]; + GzC = (sin(2.0*M_PI*kk/nz)-0.0)/dx[2]; + } + else { // conjugate + amrex::Abort("check the code; i should not go beyond bx.length(0)/2"); + } + + // Scale Total velocity FFT components + spectral_tx(i,j,k) *= (1.0/sqrtnpts); + spectral_ty(i,j,k) *= (1.0/sqrtnpts); + spectral_tz(i,j,k) *= (1.0/sqrtnpts); + + // Inverse Laplacian + Real Lap = GxR*GxR + GxC*GxC + GyR*GyR + GyC*GyC + GzR*GzR + GzC*GzC; + + // Divergence of vel + Real divR = spectral_tx(i,j,k).real()*GxR - spectral_tx(i,j,k).imag()*GxC + + 
spectral_ty(i,j,k).real()*GyR - spectral_ty(i,j,k).imag()*GyC + + spectral_tz(i,j,k).real()*GzR - spectral_tz(i,j,k).imag()*GzC ; + Real divC = spectral_tx(i,j,k).real()*GxC + spectral_tx(i,j,k).imag()*GxR + + spectral_ty(i,j,k).real()*GyC + spectral_ty(i,j,k).imag()*GyR + + spectral_tz(i,j,k).real()*GzC + spectral_tz(i,j,k).imag()*GzR ; + + if (Lap < 1.0e-12) { // zero mode for no bulk motion + spectral_dx(i,j,k) *= 0.0; + spectral_dy(i,j,k) *= 0.0; + spectral_dz(i,j,k) *= 0.0; + } + else { + + // Dilatational velocity + GpuComplex copy_dx((divR*GxR + divC*GxC) / Lap, + (divC*GxR - divR*GxC) / Lap); + spectral_dx(i,j,k) = copy_dx; + + GpuComplex copy_dy((divR*GyR + divC*GyC) / Lap, + (divC*GyR - divR*GyC) / Lap); + spectral_dy(i,j,k) = copy_dy; + + GpuComplex copy_dz((divR*GzR + divC*GzC) / Lap, + (divC*GzR - divR*GzC) / Lap); + spectral_dz(i,j,k) = copy_dz; + } + + // Solenoidal velocity + spectral_sx(i,j,k) = spectral_tx(i,j,k) - spectral_dx(i,j,k); + spectral_sy(i,j,k) = spectral_ty(i,j,k) - spectral_dy(i,j,k); + spectral_sz(i,j,k) = spectral_tz(i,j,k) - spectral_dz(i,j,k); + + }); + + Gpu::streamSynchronize(); + + // BOX ARRAY TO STORE COVARIANCE MATRIX IN A MFAB + // create a BoxArray containing the fft boxes + // by construction, these boxes correlate to the associated spectral_data + // this we can copy the spectral data into this multifab since we know they are owned by the same MPI rank + BoxArray fft_ba; + { + BoxList bl; + bl.reserve(ba.size()); + + for (int i = 0; i < ba.size(); ++i) { + Box b = ba[i]; + + Box r_box = b; + Box c_box = amrex::coarsen(r_box, IntVect(AMREX_D_DECL(2,1,1))); + + // this avoids overlap for the cases when one or more r_box's + // have an even cell index in the hi-x cell + if (c_box.bigEnd(0) * 2 == r_box.bigEnd(0)) { + c_box.setBig(0,c_box.bigEnd(0)-1); + } + + // increase the size of boxes touching the hi-x domain by 1 in x + // this is an (Nx x Ny x Nz) -> (Nx/2+1 x Ny x Nz) real-to-complex sizing + if (b.bigEnd(0) == geom.Domain().bigEnd(0)) { + c_box.growHi(0,1); + } + bl.push_back(c_box); + + } + fft_ba.define(std::move(bl)); + } + MultiFab cov(fft_ba, dm, 3, 0); // total, solenoidal, dilatational + + // Fill in the covariance multifab + Real sqrtnpts_gpu = sqrtnpts; + Real scaling_gpu = scaling; + for (MFIter mfi(cov); mfi.isValid(); ++mfi) { + Array4 const& data = cov.array(mfi); + Array4 > spec_tx = spectral_field_Tx.const_array(); + Array4 > spec_ty = spectral_field_Ty.const_array(); + Array4 > spec_tz = spectral_field_Tz.const_array(); + Array4 > spec_sx = spectral_field_Sx.const_array(); + Array4 > spec_sy = spectral_field_Sy.const_array(); + Array4 > spec_sz = spectral_field_Sz.const_array(); + Array4 > spec_dx = spectral_field_Dx.const_array(); + Array4 > spec_dy = spectral_field_Dy.const_array(); + Array4 > spec_dz = spectral_field_Dz.const_array(); + const Box& bx = mfi.fabbox(); + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + Real re_x, re_y, re_z, im_x, im_y, im_z; + + re_x = spec_tx(i,j,k).real(); + im_x = spec_tx(i,j,k).imag(); + re_y = spec_ty(i,j,k).real(); + im_y = spec_ty(i,j,k).imag(); + re_z = spec_tz(i,j,k).real(); + im_z = spec_tz(i,j,k).imag(); + data(i,j,k,0) = (re_x*re_x + im_x*im_x + + re_y*re_y + im_y*im_y + + re_z*re_z + im_z*im_z)/(scaling_gpu); + re_x = spec_sx(i,j,k).real(); + im_x = spec_sx(i,j,k).imag(); + re_y = spec_sy(i,j,k).real(); + im_y = spec_sy(i,j,k).imag(); + re_z = spec_sz(i,j,k).real(); + im_z = spec_sz(i,j,k).imag(); + data(i,j,k,1) = (re_x*re_x + im_x*im_x + + 
re_y*re_y + im_y*im_y + + re_z*re_z + im_z*im_z)/(scaling_gpu); + re_x = spec_dx(i,j,k).real(); + im_x = spec_dx(i,j,k).imag(); + re_y = spec_dy(i,j,k).real(); + im_y = spec_dy(i,j,k).imag(); + re_z = spec_dz(i,j,k).real(); + im_z = spec_dz(i,j,k).imag(); + data(i,j,k,2) = (re_x*re_x + im_x*im_x + + re_y*re_y + im_y*im_y + + re_z*re_z + im_z*im_z)/(scaling_gpu); + }); + } + + // Integrate K spectrum for velocities + IntegrateKVelocity(cov,"vel_total" ,step,0); + IntegrateKVelocity(cov,"vel_solenoidal",step,1); + IntegrateKVelocity(cov,"vel_dilational",step,2); + + MultiFab vel_decomp_single(ba, dm, 1, 0); + // inverse Fourier transform solenoidal and dilatational components + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sx.dataPtr(); + fft.backward(spectral_data, vel_decomp_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp.ParallelCopy(vel_decomp_single, 0, 0, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sy.dataPtr(); + fft.backward(spectral_data, vel_decomp_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp.ParallelCopy(vel_decomp_single, 0, 1, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sz.dataPtr(); + fft.backward(spectral_data, vel_decomp_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp.ParallelCopy(vel_decomp_single, 0, 2, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) 
spectral_field_Dx.dataPtr(); + fft.backward(spectral_data, vel_decomp_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp.ParallelCopy(vel_decomp_single, 0, 3, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Dy.dataPtr(); + fft.backward(spectral_data, vel_decomp_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp.ParallelCopy(vel_decomp_single, 0, 4, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Dz.dataPtr(); + fft.backward(spectral_data, vel_decomp_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp.ParallelCopy(vel_decomp_single, 0, 5, 1); + } + + + vel_decomp.mult(1.0/sqrtnpts); + +} + +void IntegrateKScalar(const MultiFab& cov_mag, + const std::string& name, + const int& step, + const int& comp) + +{ + int npts = n_cells[0]/2; + + Gpu::DeviceVector phisum_device(npts); + Gpu::DeviceVector phicnt_device(npts); +// Gpu::HostVector phisum_host(npts); +// Gpu::HostVector phicnt_host(npts); + + Gpu::HostVector phisum_host(npts); + + Real* phisum_ptr = phisum_device.dataPtr(); // pointer to data + int* phicnt_ptr = phicnt_device.dataPtr(); // pointer to data + + amrex::ParallelFor(npts, [=] AMREX_GPU_DEVICE (int d) noexcept + { + phisum_ptr[d] = 0.; + phicnt_ptr[d] = 0; + }); +// for (int d=0; d & cov = cov_mag.const_array(mfi); + + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + int ki = i; + int kj = j; + if (j >= ny/2) kj = ny - j; + int kk = k; + if (k >= nz/2) kk = nz - k; + + Real dist = (ki*ki + kj*kj + kk*kk); + dist = std::sqrt(dist); + + if ( dist <= n_cells[0]/2-0.5) { + dist = dist+0.5; + int cell = int(dist); + amrex::Gpu::Atomic::Add(&(phisum_ptr[cell]), cov(i,j,k,comp_gpu)); + amrex::Gpu::Atomic::Add(&(phicnt_ptr[cell]),1); + } + }); + } + + Gpu::streamSynchronize(); + + ParallelDescriptor::ReduceRealSum(phisum_device.dataPtr(),npts); + ParallelDescriptor::ReduceIntSum(phicnt_device.dataPtr(),npts); + + Real dk = 1.; + amrex::ParallelFor(npts, [=] AMREX_GPU_DEVICE (int d) noexcept + { + if (d != 0) { + phisum_ptr[d] *= 4.*M_PI*(d*d*dk+dk*dk*dk/12.)/phicnt_ptr[d]; + } + }); + + Gpu::copyAsync(Gpu::deviceToHost, phisum_device.begin(), phisum_device.end(), phisum_host.begin()); + Gpu::streamSynchronize(); + + if (ParallelDescriptor::IOProcessor()) { + std::ofstream turb; + std::string turbBaseName = "turb_"+name; + std::string turbName = Concatenate(turbBaseName,step,7); + turbName += ".txt"; + + 
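+        // Normalization note (illustrative): the factor
+        //   4*M_PI*(d*d*dk + dk*dk*dk/12.)
+        // applied above is the exact volume of the spherical k-shell of
+        // radius d and thickness dk, since
+        //   (4*pi/3) * ((d+dk/2)^3 - (d-dk/2)^3) = 4*pi*(d^2*dk + dk^3/12),
+        // so dividing by phicnt and multiplying by this volume turns the raw
+        // sum over cells in the shell into a shell-averaged spectral density.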
+        turb.open(turbName);
+        for (int d=1; d<npts; ++d) {
+            turb << d << " " << phisum_host[d] << std::endl;
+        }
+        turb.close();
+    }
+}
+
+void IntegrateKVelocity(const MultiFab& cov_mag,
+                        const std::string& name,
+                        const int& step,
+                        const int& comp)
+{
+    int npts = n_cells[0]/2;
+
+    Gpu::DeviceVector<Real> phisum_device(npts);
+    Gpu::DeviceVector<int> phicnt_device(npts);
+
+    Gpu::HostVector<Real> phisum_host(npts);
+
+    Real* phisum_ptr = phisum_device.dataPtr(); // pointer to data
+    int*  phicnt_ptr = phicnt_device.dataPtr(); // pointer to data
+
+    amrex::ParallelFor(npts, [=] AMREX_GPU_DEVICE (int d) noexcept
+    {
+        phisum_ptr[d] = 0.;
+        phicnt_ptr[d] = 0;
+    });
+
+    int comp_gpu = comp;
+    int nx = n_cells[0];
+    int ny = n_cells[1];
+    int nz = n_cells[2];
+    for ( MFIter mfi(cov_mag,TilingIfNotGPU()); mfi.isValid(); ++mfi ) {
+
+        const Box& bx = mfi.tilebox();
+
+        const Array4<const Real> & cov = cov_mag.const_array(mfi);
+
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+        {
+            int ki = i;
+            int kj = j;
+            if (j >= ny/2) kj = ny - j;
+            int kk = k;
+            if (k >= nz/2) kk = nz - k;
+
+            Real dist = (ki*ki + kj*kj + kk*kk);
+            dist = std::sqrt(dist);
+
+            if ( dist <= n_cells[0]/2-0.5) {
+                dist = dist+0.5;
+                int cell = int(dist);
+                amrex::Gpu::Atomic::Add(&(phisum_ptr[cell]), cov(i,j,k,comp_gpu));
+                amrex::Gpu::Atomic::Add(&(phicnt_ptr[cell]),1);
+            }
+        });
+    }
+
+    Gpu::streamSynchronize();
+
+    ParallelDescriptor::ReduceRealSum(phisum_device.dataPtr(),npts);
+    ParallelDescriptor::ReduceIntSum(phicnt_device.dataPtr(),npts);
+
+    Real dk = 1.;
+    amrex::ParallelFor(npts, [=] AMREX_GPU_DEVICE (int d) noexcept
+    {
+        if (d != 0) {
+            phisum_ptr[d] *= 4.*M_PI*(d*d*dk+dk*dk*dk/12.)/phicnt_ptr[d];
+        }
+    });
+
+    Gpu::copyAsync(Gpu::deviceToHost, phisum_device.begin(), phisum_device.end(), phisum_host.begin());
+    Gpu::streamSynchronize();
+
+    if (ParallelDescriptor::IOProcessor()) {
+        std::ofstream turb;
+        std::string turbBaseName = "turb_"+name;
+        std::string turbName = Concatenate(turbBaseName,step,7);
+        turbName += ".txt";
+
+        turb.open(turbName);
+        for (int d=1; d<npts; ++d) {
+            turb << d << " " << phisum_host[d] << std::endl;
+        }
+        turb.close();
+    }
+}
+
+#ifdef AMREX_USE_CUDA
+#include <cufft.h>
+#elif AMREX_USE_HIP
+#  if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
+#    include <rocfft/rocfft.h>
+#  else
+#    include <rocfft.h>
+#  endif
+#else
+#include <fftw3.h>
+#include <fftw3-mpi.h>
+#endif
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include "common_functions.H"
+
+#define ALIGN 16
+
+using namespace amrex;
+
+#ifdef AMREX_USE_CUDA
+std::string cufftError (const cufftResult& err);
+#endif
+#ifdef AMREX_USE_HIP
+std::string rocfftError (const rocfft_status err);
+void Assert_rocfft_status (std::string const& name, rocfft_status status);
+#endif
+
+void IntegrateKScalar(const Vector<std::unique_ptr<BaseFab<GpuComplex<Real> > > >& spectral_field,
+                      const MultiFab& variables_onegrid,
+                      const std::string& name,
+                      const Real& scaling,
+                      const Real& sqrtnpts,
+                      const int& step);
+void IntegrateKVelocity(const Vector<std::unique_ptr<BaseFab<GpuComplex<Real> > > >& spectral_fieldx,
+                        const Vector<std::unique_ptr<BaseFab<GpuComplex<Real> > > >& spectral_fieldy,
+                        const Vector<std::unique_ptr<BaseFab<GpuComplex<Real> > > >& spectral_fieldz,
+                        const MultiFab& vel_onegrid,
+                        const std::string& name,
+                        const Real& scaling,
+                        const int& step);
+void TurbSpectrumScalar(const MultiFab& variables,
+                        const amrex::Geometry& geom,
+                        const int& step,
+                        const amrex::Vector<Real>& var_scaling,
+                        const amrex::Vector< std::string >& var_names);
+void TurbSpectrumVelDecomp(const MultiFab& vel,
+                           MultiFab& vel_decomp,
+                           const amrex::Geometry& geom,
+                           const int& step,
+                           const amrex::Real& var_scaling,
+                           const amrex::Vector< std::string >& var_names);
+void InverseFFTVel(Vector<std::unique_ptr<BaseFab<GpuComplex<Real> > > >& spectral_field,
+                   MultiFab& vel_decomp_onegrid, const IntVect& fft_size);
+
+
+#endif
diff --git a/src_analysis/TurbSpectra_single.cpp b/src_analysis/TurbSpectra_single.cpp
new file mode 100644
index 00000000..b6bccc02
--- /dev/null
+++ b/src_analysis/TurbSpectra_single.cpp
@@ -0,0 +1,1043 @@
+#include "TurbSpectra.H"
+#include "common_functions.H"
+
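+// Illustrative summary (inferred from this file and TurbSpectra_heffte.cpp,
+// not a comment carried over from the sources): this "single" backend
+// gathers each field onto one Box covering the whole domain (ba_onegrid)
+// and runs a local 3-D FFT per component through cufft, rocfft, or FFTW,
+// while the heFFTe backend keeps the existing box decomposition and performs
+// the transform in distributed fashion. Both paths bin |F(k)|^2 into the
+// same radial shells, so the turb_*.txt files they write should agree up to
+// FFT roundoff.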
+#include +#include "AMReX_PlotFileUtil.H" +#include "AMReX_BoxArray.H" + +#ifdef AMREX_USE_CUDA +std::string cufftError (const cufftResult& err) +{ + switch (err) { + case CUFFT_SUCCESS: return "CUFFT_SUCCESS"; + case CUFFT_INVALID_PLAN: return "CUFFT_INVALID_PLAN"; + case CUFFT_ALLOC_FAILED: return "CUFFT_ALLOC_FAILED"; + case CUFFT_INVALID_TYPE: return "CUFFT_INVALID_TYPE"; + case CUFFT_INVALID_VALUE: return "CUFFT_INVALID_VALUE"; + case CUFFT_INTERNAL_ERROR: return "CUFFT_INTERNAL_ERROR"; + case CUFFT_EXEC_FAILED: return "CUFFT_EXEC_FAILED"; + case CUFFT_SETUP_FAILED: return "CUFFT_SETUP_FAILED"; + case CUFFT_INVALID_SIZE: return "CUFFT_INVALID_SIZE"; + case CUFFT_UNALIGNED_DATA: return "CUFFT_UNALIGNED_DATA"; + default: return std::to_string(err) + " (unknown error code)"; + } +} +#endif + +#ifdef AMREX_USE_HIP +std::string rocfftError (const rocfft_status err) +{ + if (err == rocfft_status_success) { + return std::string("rocfft_status_success"); + } else if (err == rocfft_status_failure) { + return std::string("rocfft_status_failure"); + } else if (err == rocfft_status_invalid_arg_value) { + return std::string("rocfft_status_invalid_arg_value"); + } else if (err == rocfft_status_invalid_dimensions) { + return std::string("rocfft_status_invalid_dimensions"); + } else if (err == rocfft_status_invalid_array_type) { + return std::string("rocfft_status_invalid_array_type"); + } else if (err == rocfft_status_invalid_strides) { + return std::string("rocfft_status_invalid_strides"); + } else if (err == rocfft_status_invalid_distance) { + return std::string("rocfft_status_invalid_distance"); + } else if (err == rocfft_status_invalid_offset) { + return std::string("rocfft_status_invalid_offset"); + } else { + return std::to_string(err) + " (unknown error code)"; + } +} + +void Assert_rocfft_status (std::string const& name, rocfft_status status) +{ + if (status != rocfft_status_success) { + amrex::AllPrint() << name + " failed! 
Error: " + rocfftError(status) << "\n";; + } +} +#endif + +void TurbSpectrumScalar(const MultiFab& variables, + const amrex::Geometry& geom, + const int& step, + const amrex::Vector& scaling, + const amrex::Vector< std::string >& var_names) +{ + BL_PROFILE_VAR("TurbSpectrumScalar()",TurbSpectrumScalar); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(variables.nComp() == var_names.size(), "TurbSpectrumScalar: must have same number variable names as components of input MultiFab"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(variables.nComp() == scaling.size(), "TurbSpectrumScalar: must have same number variable scaling as components of input MultiFab"); + int ncomp = variables.nComp(); + + long npts; + + // Initialize the boxarray "ba_onegrid" from the single box "domain" + BoxArray ba_onegrid; + { + Box domain = geom.Domain(); + ba_onegrid.define(domain); + npts = (domain.length(0)*domain.length(1)*domain.length(2)); + } + Real sqrtnpts = std::sqrt(npts); + DistributionMapping dmap_onegrid(ba_onegrid); + MultiFab variables_onegrid; + variables_onegrid.define(ba_onegrid, dmap_onegrid, 1, 0); + +#ifdef AMREX_USE_CUDA + using FFTplan = cufftHandle; + using FFTcomplex = cuDoubleComplex; +#elif AMREX_USE_HIP + using FFTplan = rocfft_plan; + using FFTcomplex = double2; +#else + using FFTplan = fftw_plan; + using FFTcomplex = fftw_complex; +#endif + + // size of box including ghost cell range + IntVect fft_size; + + // contain to store FFT - note it is shrunk by "half" in x + Vector > > > spectral_field; + Vector forward_plan; + bool built_plan = false; + + // for CUDA builds we only need to build the plan once; track whether we did + for (int comp=0; comp >(spectral_bx,1, + The_Device_Arena())); + spectral_field.back()->setVal(0.0); // touch the memory + FFTplan fplan; + +#ifdef AMREX_USE_CUDA // CUDA + cufftResult result = cufftPlan3d(&fplan, fft_size[2], fft_size[1], fft_size[0], CUFFT_D2Z); + if (result != CUFFT_SUCCESS) { + amrex::AllPrint() << " cufftplan3d forward failed! Error: " + << cufftError(result) << "\n"; + } +#elif AMREX_USE_HIP // HIP + const std::size_t lengths[] = {std::size_t(fft_size[0]),std::size_t(fft_size[1]),std::size_t(fft_size[2])}; + rocfft_status result = rocfft_plan_create(&fplan, rocfft_placement_notinplace, + rocfft_transform_type_real_forward, rocfft_precision_double, + 3, lengths, 1, nullptr); + Assert_rocfft_status("rocfft_plan_create", result); +#else // host + fplan = fftw_plan_dft_r2c_3d(fft_size[2], fft_size[1], fft_size[0], + variables_onegrid[mfi].dataPtr(), + reinterpret_cast + (spectral_field.back()->dataPtr()), + FFTW_ESTIMATE); +#endif + forward_plan.push_back(fplan); + } + + built_plan = true; + } + + ParallelDescriptor::Barrier(); + + // ForwardTransform + for (MFIter mfi(variables_onegrid); mfi.isValid(); ++mfi) { + int i = mfi.LocalIndex(); +#ifdef AMREX_USE_CUDA + cufftSetStream(forward_plan[i], amrex::Gpu::gpuStream()); + cufftResult result = cufftExecD2Z(forward_plan[i], + variables_onegrid[mfi].dataPtr(), + reinterpret_cast + (spectral_field[i]->dataPtr())); + if (result != CUFFT_SUCCESS) { + amrex::AllPrint() << " forward transform using cufftExec failed! 
Error: " + << cufftError(result) << "\n"; + } +#elif AMREX_USE_HIP + rocfft_execution_info execinfo = nullptr; + rocfft_status result = rocfft_execution_info_create(&execinfo); + Assert_rocfft_status("rocfft_execution_info_create", result); + + std::size_t buffersize = 0; + result = rocfft_plan_get_work_buffer_size(forward_plan[i], &buffersize); + Assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); + + void* buffer = amrex::The_Arena()->alloc(buffersize); + result = rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize); + Assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); + + result = rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream()); + Assert_rocfft_status("rocfft_execution_info_set_stream", result); + + amrex::Real* variables_onegrid_ptr = variables_onegrid[mfi].dataPtr(); + FFTcomplex* spectral_field_ptr = reinterpret_cast(spectral_field[i]->dataPtr()); + result = rocfft_execute(forward_plan[i], + (void**) &variables_onegrid_ptr, // in + (void**) &spectral_field_ptr, // out + execinfo); + Assert_rocfft_status("rocfft_execute", result); + amrex::Gpu::streamSynchronize(); + amrex::The_Arena()->free(buffer); + result = rocfft_execution_info_destroy(execinfo); + Assert_rocfft_status("rocfft_execution_info_destroy", result); +#else + fftw_execute(forward_plan[i]); +#endif + } + + // Integrate spectra over k-shells + IntegrateKScalar(spectral_field,variables_onegrid,var_names[comp],scaling[comp],sqrtnpts,step); + } + + // destroy fft plan + for (int i = 0; i < forward_plan.size(); ++i) { +#ifdef AMREX_USE_CUDA + cufftDestroy(forward_plan[i]); +#elif AMREX_USE_HIP + rocfft_plan_destroy(forward_plan[i]); +#else + fftw_destroy_plan(forward_plan[i]); +#endif + } +} + +void TurbSpectrumVelDecomp(const MultiFab& vel, + MultiFab& vel_decomp, + const amrex::Geometry& geom, + const int& step, + const amrex::Real& scaling, + const amrex::Vector< std::string >& var_names) +{ + BL_PROFILE_VAR("TurbSpectrumVelDecomp()",TurbSpectrumVelDecomp); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(vel.nComp() == 3, + "TurbSpectrumVelDecomp: must have 3 components of input vel MultiFab"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(var_names.size() == 3, + "TurbSpectrumVelDecomp: must have 3 names for output vel spectra (total, solenoidal, dilatational"); + const GpuArray dx = geom.CellSizeArray(); + + long npts; + + // Initialize the boxarray "ba_onegrid" from the single box "domain" + BoxArray ba_onegrid; + { + Box domain = geom.Domain(); + ba_onegrid.define(domain); + npts = (domain.length(0)*domain.length(1)*domain.length(2)); + } + Real sqrtnpts = std::sqrt(npts); + DistributionMapping dmap_onegrid(ba_onegrid); + MultiFab vel_onegrid; + vel_onegrid.define(ba_onegrid, dmap_onegrid, 1, 0); + +#ifdef AMREX_USE_CUDA + using FFTplan = cufftHandle; + using FFTcomplex = cuDoubleComplex; +#elif AMREX_USE_HIP + using FFTplan = rocfft_plan; + using FFTcomplex = double2; +#else + using FFTplan = fftw_plan; + using FFTcomplex = fftw_complex; +#endif + + // size of box including ghost cell range + IntVect fft_size; + + // contain to store FFT - note it is shrunk by "half" in x + Vector > > > spectral_fieldx; + Vector > > > spectral_fieldy; + Vector > > > spectral_fieldz; + Vector > > > spectral_field_Sx; + Vector > > > spectral_field_Sy; + Vector > > > spectral_field_Sz; + Vector > > > spectral_field_Dx; + Vector > > > spectral_field_Dy; + Vector > > > spectral_field_Dz; + + // x-velocity + { + Vector forward_plan; + vel_onegrid.ParallelCopy(vel,0,0,1); + for (MFIter 
mfi(vel_onegrid); mfi.isValid(); ++mfi) { + + // grab a single box including ghost cell range + Box realspace_bx = mfi.fabbox(); + + // size of box including ghost cell range + fft_size = realspace_bx.length(); // This will be different for hybrid FFT + + // this is the size of the box, except the 0th component is 'halved plus 1' + IntVect spectral_bx_size = fft_size; + spectral_bx_size[0] = fft_size[0]/2 + 1; + + // spectral box + Box spectral_bx = Box(IntVect(0), spectral_bx_size - IntVect(1)); + + spectral_fieldx.emplace_back(new BaseFab >(spectral_bx,1, + The_Device_Arena())); + spectral_fieldx.back()->setVal(0.0); // touch the memory + + spectral_field_Sx.emplace_back(new BaseFab >(spectral_bx,1, + The_Device_Arena())); + spectral_field_Sx.back()->setVal(0.0); // touch the memory + + spectral_field_Dx.emplace_back(new BaseFab >(spectral_bx,1, + The_Device_Arena())); + spectral_field_Dx.back()->setVal(0.0); // touch the memory + + FFTplan fplan; + +#ifdef AMREX_USE_CUDA // CUDA + cufftResult result = cufftPlan3d(&fplan, fft_size[2], fft_size[1], fft_size[0], CUFFT_D2Z); + if (result != CUFFT_SUCCESS) { + amrex::AllPrint() << " cufftplan3d forward failed! Error: " + << cufftError(result) << "\n"; + } +#elif AMREX_USE_HIP // HIP + const std::size_t lengths[] = {std::size_t(fft_size[0]),std::size_t(fft_size[1]),std::size_t(fft_size[2])}; + rocfft_status result = rocfft_plan_create(&fplan, rocfft_placement_notinplace, + rocfft_transform_type_real_forward, rocfft_precision_double, + 3, lengths, 1, nullptr); + Assert_rocfft_status("rocfft_plan_create", result); +#else // host + fplan = fftw_plan_dft_r2c_3d(fft_size[2], fft_size[1], fft_size[0], + vel_onegrid[mfi].dataPtr(), + reinterpret_cast + (spectral_fieldx.back()->dataPtr()), + FFTW_ESTIMATE); +#endif + forward_plan.push_back(fplan); + } + + ParallelDescriptor::Barrier(); + + // ForwardTransform + for (MFIter mfi(vel_onegrid); mfi.isValid(); ++mfi) { + int i = mfi.LocalIndex(); +#ifdef AMREX_USE_CUDA + cufftSetStream(forward_plan[i], amrex::Gpu::gpuStream()); + cufftResult result = cufftExecD2Z(forward_plan[i], + vel_onegrid[mfi].dataPtr(), + reinterpret_cast + (spectral_fieldx[i]->dataPtr())); + if (result != CUFFT_SUCCESS) { + amrex::AllPrint() << " forward transform using cufftExec failed! 
Error: " + << cufftError(result) << "\n"; + } +#elif AMREX_USE_HIP + rocfft_execution_info execinfo = nullptr; + rocfft_status result = rocfft_execution_info_create(&execinfo); + Assert_rocfft_status("rocfft_execution_info_create", result); + + std::size_t buffersize = 0; + result = rocfft_plan_get_work_buffer_size(forward_plan[i], &buffersize); + Assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); + + void* buffer = amrex::The_Arena()->alloc(buffersize); + result = rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize); + Assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); + + result = rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream()); + Assert_rocfft_status("rocfft_execution_info_set_stream", result); + + amrex::Real* vel_onegrid_ptr = vel_onegrid[mfi].dataPtr(); + FFTcomplex* spectral_field_ptr = reinterpret_cast(spectral_fieldx[i]->dataPtr()); + result = rocfft_execute(forward_plan[i], + (void**) &vel_onegrid_ptr, // in + (void**) &spectral_field_ptr, // out + execinfo); + Assert_rocfft_status("rocfft_execute", result); + amrex::Gpu::streamSynchronize(); + amrex::The_Arena()->free(buffer); + result = rocfft_execution_info_destroy(execinfo); + Assert_rocfft_status("rocfft_execution_info_destroy", result); +#else + fftw_execute(forward_plan[i]); +#endif + } + + // destroy fft plan + for (int i = 0; i < forward_plan.size(); ++i) { +#ifdef AMREX_USE_CUDA + cufftDestroy(forward_plan[i]); +#elif AMREX_USE_HIP + rocfft_plan_destroy(forward_plan[i]); +#else + fftw_destroy_plan(forward_plan[i]); +#endif + } + + } // end x-vel + + // y-velocity + { + Vector forward_plan; + vel_onegrid.ParallelCopy(vel,1,0,1); + for (MFIter mfi(vel_onegrid); mfi.isValid(); ++mfi) { + + // grab a single box including ghost cell range + Box realspace_bx = mfi.fabbox(); + + // size of box including ghost cell range + fft_size = realspace_bx.length(); // This will be different for hybrid FFT + + // this is the size of the box, except the 0th component is 'halved plus 1' + IntVect spectral_bx_size = fft_size; + spectral_bx_size[0] = fft_size[0]/2 + 1; + + // spectral box + Box spectral_bx = Box(IntVect(0), spectral_bx_size - IntVect(1)); + + spectral_fieldy.emplace_back(new BaseFab >(spectral_bx,1, + The_Device_Arena())); + spectral_fieldy.back()->setVal(0.0); // touch the memory + + spectral_field_Sy.emplace_back(new BaseFab >(spectral_bx,1, + The_Device_Arena())); + spectral_field_Sy.back()->setVal(0.0); // touch the memory + + spectral_field_Dy.emplace_back(new BaseFab >(spectral_bx,1, + The_Device_Arena())); + spectral_field_Dy.back()->setVal(0.0); // touch the memory + + FFTplan fplan; + +#ifdef AMREX_USE_CUDA // CUDA + cufftResult result = cufftPlan3d(&fplan, fft_size[2], fft_size[1], fft_size[0], CUFFT_D2Z); + if (result != CUFFT_SUCCESS) { + amrex::AllPrint() << " cufftplan3d forward failed! 
Error: " + << cufftError(result) << "\n"; + } +#elif AMREX_USE_HIP // HIP + const std::size_t lengths[] = {std::size_t(fft_size[0]),std::size_t(fft_size[1]),std::size_t(fft_size[2])}; + rocfft_status result = rocfft_plan_create(&fplan, rocfft_placement_notinplace, + rocfft_transform_type_real_forward, rocfft_precision_double, + 3, lengths, 1, nullptr); + Assert_rocfft_status("rocfft_plan_create", result); +#else // host + fplan = fftw_plan_dft_r2c_3d(fft_size[2], fft_size[1], fft_size[0], + vel_onegrid[mfi].dataPtr(), + reinterpret_cast + (spectral_fieldy.back()->dataPtr()), + FFTW_ESTIMATE); +#endif + forward_plan.push_back(fplan); + } + + ParallelDescriptor::Barrier(); + + // ForwardTransform + for (MFIter mfi(vel_onegrid); mfi.isValid(); ++mfi) { + int i = mfi.LocalIndex(); +#ifdef AMREX_USE_CUDA + cufftSetStream(forward_plan[i], amrex::Gpu::gpuStream()); + cufftResult result = cufftExecD2Z(forward_plan[i], + vel_onegrid[mfi].dataPtr(), + reinterpret_cast + (spectral_fieldy[i]->dataPtr())); + if (result != CUFFT_SUCCESS) { + amrex::AllPrint() << " forward transform using cufftExec failed! Error: " + << cufftError(result) << "\n"; + } +#elif AMREX_USE_HIP + rocfft_execution_info execinfo = nullptr; + rocfft_status result = rocfft_execution_info_create(&execinfo); + Assert_rocfft_status("rocfft_execution_info_create", result); + + std::size_t buffersize = 0; + result = rocfft_plan_get_work_buffer_size(forward_plan[i], &buffersize); + Assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); + + void* buffer = amrex::The_Arena()->alloc(buffersize); + result = rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize); + Assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); + + result = rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream()); + Assert_rocfft_status("rocfft_execution_info_set_stream", result); + + amrex::Real* vel_onegrid_ptr = vel_onegrid[mfi].dataPtr(); + FFTcomplex* spectral_field_ptr = reinterpret_cast(spectral_fieldy[i]->dataPtr()); + result = rocfft_execute(forward_plan[i], + (void**) &vel_onegrid_ptr, // in + (void**) &spectral_field_ptr, // out + execinfo); + Assert_rocfft_status("rocfft_execute", result); + amrex::Gpu::streamSynchronize(); + amrex::The_Arena()->free(buffer); + result = rocfft_execution_info_destroy(execinfo); + Assert_rocfft_status("rocfft_execution_info_destroy", result); +#else + fftw_execute(forward_plan[i]); +#endif + } + + // destroy fft plan + for (int i = 0; i < forward_plan.size(); ++i) { +#ifdef AMREX_USE_CUDA + cufftDestroy(forward_plan[i]); +#elif AMREX_USE_HIP + rocfft_plan_destroy(forward_plan[i]); +#else + fftw_destroy_plan(forward_plan[i]); +#endif + } + + } // end y-vel + + // z-velocity + { + Vector forward_plan; + vel_onegrid.ParallelCopy(vel,2,0,1); + for (MFIter mfi(vel_onegrid); mfi.isValid(); ++mfi) { + + // grab a single box including ghost cell range + Box realspace_bx = mfi.fabbox(); + + // size of box including ghost cell range + fft_size = realspace_bx.length(); // This will be different for hybrid FFT + + // this is the size of the box, except the 0th component is 'halved plus 1' + IntVect spectral_bx_size = fft_size; + spectral_bx_size[0] = fft_size[0]/2 + 1; + + // spectral box + Box spectral_bx = Box(IntVect(0), spectral_bx_size - IntVect(1)); + + spectral_fieldz.emplace_back(new BaseFab >(spectral_bx,1, + The_Device_Arena())); + spectral_fieldz.back()->setVal(0.0); // touch the memory + + spectral_field_Sz.emplace_back(new BaseFab >(spectral_bx,1, + The_Device_Arena())); + 
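+            // ("touch the memory" rationale, our assumption: setVal(0.0)
+            // forces the freshly arena-allocated complex FAB to be written
+            // once, so its pages are materialized and zero-initialized
+            // before the FFT backends write into it.)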
spectral_field_Sz.back()->setVal(0.0); // touch the memory + + spectral_field_Dz.emplace_back(new BaseFab >(spectral_bx,1, + The_Device_Arena())); + spectral_field_Dz.back()->setVal(0.0); // touch the memory + + FFTplan fplan; + +#ifdef AMREX_USE_CUDA // CUDA + cufftResult result = cufftPlan3d(&fplan, fft_size[2], fft_size[1], fft_size[0], CUFFT_D2Z); + if (result != CUFFT_SUCCESS) { + amrex::AllPrint() << " cufftplan3d forward failed! Error: " + << cufftError(result) << "\n"; + } +#elif AMREX_USE_HIP // HIP + const std::size_t lengths[] = {std::size_t(fft_size[0]),std::size_t(fft_size[1]),std::size_t(fft_size[2])}; + rocfft_status result = rocfft_plan_create(&fplan, rocfft_placement_notinplace, + rocfft_transform_type_real_forward, rocfft_precision_double, + 3, lengths, 1, nullptr); + Assert_rocfft_status("rocfft_plan_create", result); +#else // host + fplan = fftw_plan_dft_r2c_3d(fft_size[2], fft_size[1], fft_size[0], + vel_onegrid[mfi].dataPtr(), + reinterpret_cast + (spectral_fieldz.back()->dataPtr()), + FFTW_ESTIMATE); +#endif + forward_plan.push_back(fplan); + } + + ParallelDescriptor::Barrier(); + + // ForwardTransform + for (MFIter mfi(vel_onegrid); mfi.isValid(); ++mfi) { + int i = mfi.LocalIndex(); +#ifdef AMREX_USE_CUDA + cufftSetStream(forward_plan[i], amrex::Gpu::gpuStream()); + cufftResult result = cufftExecD2Z(forward_plan[i], + vel_onegrid[mfi].dataPtr(), + reinterpret_cast + (spectral_fieldz[i]->dataPtr())); + if (result != CUFFT_SUCCESS) { + amrex::AllPrint() << " forward transform using cufftExec failed! Error: " + << cufftError(result) << "\n"; + } +#elif AMREX_USE_HIP + rocfft_execution_info execinfo = nullptr; + rocfft_status result = rocfft_execution_info_create(&execinfo); + Assert_rocfft_status("rocfft_execution_info_create", result); + + std::size_t buffersize = 0; + result = rocfft_plan_get_work_buffer_size(forward_plan[i], &buffersize); + Assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); + + void* buffer = amrex::The_Arena()->alloc(buffersize); + result = rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize); + Assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); + + result = rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream()); + Assert_rocfft_status("rocfft_execution_info_set_stream", result); + + amrex::Real* vel_onegrid_ptr = vel_onegrid[mfi].dataPtr(); + FFTcomplex* spectral_field_ptr = reinterpret_cast(spectral_fieldz[i]->dataPtr()); + result = rocfft_execute(forward_plan[i], + (void**) &vel_onegrid_ptr, // in + (void**) &spectral_field_ptr, // out + execinfo); + Assert_rocfft_status("rocfft_execute", result); + amrex::Gpu::streamSynchronize(); + amrex::The_Arena()->free(buffer); + result = rocfft_execution_info_destroy(execinfo); + Assert_rocfft_status("rocfft_execution_info_destroy", result); +#else + fftw_execute(forward_plan[i]); +#endif + } + + // destroy fft plan + for (int i = 0; i < forward_plan.size(); ++i) { +#ifdef AMREX_USE_CUDA + cufftDestroy(forward_plan[i]); +#elif AMREX_USE_HIP + rocfft_plan_destroy(forward_plan[i]); +#else + fftw_destroy_plan(forward_plan[i]); +#endif + } + + } // end x-vel + + + // Decompose velocity field into solenoidal and dilatational + for ( MFIter mfi(vel_onegrid,TilingIfNotGPU()); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.tilebox(); + Array4< GpuComplex > spectral_tx = (*spectral_fieldx[0]) .array(); + Array4< GpuComplex > spectral_ty = (*spectral_fieldy[0]) .array(); + Array4< GpuComplex > spectral_tz = (*spectral_fieldz[0]) .array(); + Array4< 
GpuComplex > spectral_sx = (*spectral_field_Sx[0]).array(); + Array4< GpuComplex > spectral_sy = (*spectral_field_Sy[0]).array(); + Array4< GpuComplex > spectral_sz = (*spectral_field_Sz[0]).array(); + Array4< GpuComplex > spectral_dx = (*spectral_field_Dx[0]).array(); + Array4< GpuComplex > spectral_dy = (*spectral_field_Dy[0]).array(); + Array4< GpuComplex > spectral_dz = (*spectral_field_Dz[0]).array(); + + amrex::ParallelFor(bx, + [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + int nx = n_cells[0]; + int ny = n_cells[1]; + int nz = n_cells[2]; + + Real GxR = 0.0, GxC = 0.0, GyR = 0.0, GyC = 0.0, GzR = 0.0, GzC = 0.0; + + if (i <= nx/2) { + + // Get the wavevector + int ki = i; + int kj = j; + if (j >= ny/2) kj = ny - j; + int kk = k; + if (k >= nz/2) kk = nz - k; + + // Gradient Operators + GxR = (cos(2.0*M_PI*i/nx)-1.0)/dx[0]; + GxC = (sin(2.0*M_PI*i/nx)-0.0)/dx[0]; + GyR = (cos(2.0*M_PI*j/ny)-1.0)/dx[1]; + GyC = (sin(2.0*M_PI*j/ny)-0.0)/dx[1]; + GzR = (cos(2.0*M_PI*k/nz)-1.0)/dx[2]; + GzC = (sin(2.0*M_PI*k/nz)-0.0)/dx[2]; + + // Scale Total velocity FFT components + spectral_tx(i,j,k) *= (1.0/sqrtnpts); + spectral_ty(i,j,k) *= (1.0/sqrtnpts); + spectral_tz(i,j,k) *= (1.0/sqrtnpts); + + // Inverse Laplacian + Real Lap = GxR*GxR + GxC*GxC + GyR*GyR + GyC*GyC + GzR*GzR + GzC*GzC; + + // Divergence of vel + Real divR = spectral_tx(i,j,k).real()*GxR - spectral_tx(i,j,k).imag()*GxC + + spectral_ty(i,j,k).real()*GyR - spectral_ty(i,j,k).imag()*GyC + + spectral_tz(i,j,k).real()*GzR - spectral_tz(i,j,k).imag()*GzC ; + Real divC = spectral_tx(i,j,k).real()*GxC + spectral_tx(i,j,k).imag()*GxR + + spectral_ty(i,j,k).real()*GyC + spectral_ty(i,j,k).imag()*GyR + + spectral_tz(i,j,k).real()*GzC + spectral_tz(i,j,k).imag()*GzR ; + + if (Lap < 1.0e-12) { // zero mode for no bulk motion + spectral_dx(i,j,k) *= 0.0; + spectral_dy(i,j,k) *= 0.0; + spectral_dz(i,j,k) *= 0.0; + } + else { + // Dilatational velocity + GpuComplex copy_dx((divR*GxR + divC*GxC) / Lap, + (divC*GxR - divR*GxC) / Lap); + spectral_dx(i,j,k) = copy_dx; + + GpuComplex copy_dy((divR*GyR + divC*GyC) / Lap, + (divC*GyR - divR*GyC) / Lap); + spectral_dy(i,j,k) = copy_dy; + + GpuComplex copy_dz((divR*GzR + divC*GzC) / Lap, + (divC*GzR - divR*GzC) / Lap); + spectral_dz(i,j,k) = copy_dz; + } + + // Solenoidal velocity + spectral_sx(i,j,k) = spectral_tx(i,j,k) - spectral_dx(i,j,k); + spectral_sy(i,j,k) = spectral_ty(i,j,k) - spectral_dy(i,j,k); + spectral_sz(i,j,k) = spectral_tz(i,j,k) - spectral_dz(i,j,k); + } + }); + } + + ParallelDescriptor::Barrier(); + + // Integrate K spectrum for velocities + IntegrateKVelocity(spectral_fieldx, spectral_fieldy, spectral_fieldz, vel_onegrid, "vel_total" ,scaling,step); + IntegrateKVelocity(spectral_field_Sx, spectral_field_Sy, spectral_field_Sz, vel_onegrid, "vel_solenoidal",scaling,step); + IntegrateKVelocity(spectral_field_Dx, spectral_field_Dy, spectral_field_Dz, vel_onegrid, "vel_dilatational",scaling,step); + + + // Inverse Solenoidal and Dilatational Velocity Components + { // solenoidal x + MultiFab vel_decomp_onegrid; + vel_decomp_onegrid.define(ba_onegrid, dmap_onegrid, 1, 0); + vel_decomp_onegrid.setVal(0.0); + InverseFFTVel(spectral_field_Sx, vel_decomp_onegrid,fft_size); + // copy into external multifab + vel_decomp.ParallelCopy(vel_decomp_onegrid,0,0,1); + } + { // solenoidal y + MultiFab vel_decomp_onegrid; + vel_decomp_onegrid.define(ba_onegrid, dmap_onegrid, 1, 0); + vel_decomp_onegrid.setVal(0.0); + InverseFFTVel(spectral_field_Sy, vel_decomp_onegrid,fft_size); + // copy 
into external multifab + vel_decomp.ParallelCopy(vel_decomp_onegrid,0,1,1); + } + { // solenoidal z + MultiFab vel_decomp_onegrid; + vel_decomp_onegrid.define(ba_onegrid, dmap_onegrid, 1, 0); + vel_decomp_onegrid.setVal(0.0); + InverseFFTVel(spectral_field_Sz, vel_decomp_onegrid,fft_size); + // copy into external multifab + vel_decomp.ParallelCopy(vel_decomp_onegrid,0,2,1); + } + { // dilatational x + MultiFab vel_decomp_onegrid; + vel_decomp_onegrid.define(ba_onegrid, dmap_onegrid, 1, 0); + vel_decomp_onegrid.setVal(0.0); + InverseFFTVel(spectral_field_Dx, vel_decomp_onegrid,fft_size); + // copy into external multifab + vel_decomp.ParallelCopy(vel_decomp_onegrid,0,3,1); + } + { // dilatational y + MultiFab vel_decomp_onegrid; + vel_decomp_onegrid.define(ba_onegrid, dmap_onegrid, 1, 0); + vel_decomp_onegrid.setVal(0.0); + InverseFFTVel(spectral_field_Dy, vel_decomp_onegrid,fft_size); + // copy into external multifab + vel_decomp.ParallelCopy(vel_decomp_onegrid,0,4,1); + } + { // dilatational z + MultiFab vel_decomp_onegrid; + vel_decomp_onegrid.define(ba_onegrid, dmap_onegrid, 1, 0); + vel_decomp_onegrid.setVal(0.0); + InverseFFTVel(spectral_field_Dz, vel_decomp_onegrid,fft_size); + // copy into external multifab + vel_decomp.ParallelCopy(vel_decomp_onegrid,0,5,1); + } + vel_decomp.mult(1.0/sqrtnpts); +} + +void IntegrateKScalar(const Vector > > >& spectral_field, + const MultiFab& variables_onegrid, + const std::string& name, + const Real& scaling, + const Real& sqrtnpts, + const int& step) + +{ + int npts = n_cells[0]/2; + Gpu::DeviceVector phisum_device(npts); + Gpu::DeviceVector phicnt_device(npts); + + Gpu::HostVector phisum_host(npts); + + Real* phisum_ptr = phisum_device.dataPtr(); // pointer to data + int* phicnt_ptr = phicnt_device.dataPtr(); // pointer to data + + // Integrate spectra over k-shells + amrex::ParallelFor(npts, [=] AMREX_GPU_DEVICE (int d) noexcept + { + phisum_ptr[d] = 0.; + phicnt_ptr[d] = 0; + }); + + int nx = n_cells[0]; + int ny = n_cells[1]; + int nz = n_cells[2]; + for ( MFIter mfi(variables_onegrid,TilingIfNotGPU()); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.fabbox(); + + const Array4 > spectral = (*spectral_field[0]).const_array(); + + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + if (i <= bx.length(0)/2) { // only half of kx-domain + int ki = i; + int kj = j; + if (j >= ny/2) kj = ny - j; + int kk = k; + if (k >= nz/2) kk = nz - k; + + Real dist = (ki*ki + kj*kj + kk*kk); + dist = std::sqrt(dist); + + if ( dist <= n_cells[0]/2-0.5) { + dist = dist+0.5; + int cell = int(dist); + Real real = spectral(i,j,k).real(); + Real imag = spectral(i,j,k).imag(); + Real cov = (1.0/(scaling*sqrtnpts*sqrtnpts))*(real*real + imag*imag); + amrex::HostDevice::Atomic::Add(&(phisum_ptr[cell]), cov); + amrex::HostDevice::Atomic::Add(&(phicnt_ptr[cell]),1); + } + } + }); + } + + for (int d=1; d > > >& spectral_fieldx, + const Vector > > >& spectral_fieldy, + const Vector > > >& spectral_fieldz, + const MultiFab& vel_onegrid, + const std::string& name, + const Real& scaling, + const int& step) +{ + int npts = n_cells[0]/2; + + Gpu::DeviceVector phisum_device(npts); + Gpu::DeviceVector phicnt_device(npts); + Gpu::HostVector phisum_host(npts); + Real* phisum_ptr = phisum_device.dataPtr(); // pointer to data + int* phicnt_ptr = phicnt_device.dataPtr(); // pointer to data + + // Integrate spectra over k-shells + amrex::ParallelFor(npts, [=] AMREX_GPU_DEVICE (int d) noexcept + { + phisum_ptr[d] = 0.; + phicnt_ptr[d] = 0; + }); + + int nx = 
n_cells[0]; + int ny = n_cells[1]; + int nz = n_cells[2]; + for ( MFIter mfi(vel_onegrid,TilingIfNotGPU()); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.fabbox(); + + const Array4 > spectralx = (*spectral_fieldx[0]).const_array(); + const Array4 > spectraly = (*spectral_fieldy[0]).const_array(); + const Array4 > spectralz = (*spectral_fieldz[0]).const_array(); + + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + if (i <= bx.length(0)/2) { // only half of kx-domain + int ki = i; + int kj = j; + if (j >= ny/2) kj = ny - j; + int kk = k; + if (k >= nz/2) kk = nz - k; + + Real dist = (ki*ki + kj*kj + kk*kk); + dist = std::sqrt(dist); + + if ( dist <= n_cells[0]/2-0.5) { + dist = dist+0.5; + int cell = int(dist); + Real real, imag, cov_x, cov_y, cov_z, cov; + real = spectralx(i,j,k).real(); + imag = spectralx(i,j,k).imag(); + cov_x = (1.0/scaling)*(real*real + imag*imag); + real = spectraly(i,j,k).real(); + imag = spectraly(i,j,k).imag(); + cov_y = (1.0/scaling)*(real*real + imag*imag); + real = spectralz(i,j,k).real(); + imag = spectralz(i,j,k).imag(); + cov_z = (1.0/scaling)*(real*real + imag*imag); + cov = cov_x + cov_y + cov_z; + amrex::HostDevice::Atomic::Add(&(phisum_ptr[cell]), cov); + amrex::HostDevice::Atomic::Add(&(phicnt_ptr[cell]),1); + } + } + }); + } + + for (int d=1; d > > >& spectral_field, + MultiFab& vel_decomp_onegrid, const IntVect& fft_size) +{ + +#ifdef AMREX_USE_CUDA + using FFTplan = cufftHandle; + using FFTcomplex = cuDoubleComplex; +#elif AMREX_USE_HIP + using FFTplan = rocfft_plan; + using FFTcomplex = double2; +#else + using FFTplan = fftw_plan; + using FFTcomplex = fftw_complex; +#endif + + Vector backward_plan; + + for (MFIter mfi(vel_decomp_onegrid); mfi.isValid(); ++mfi) { + FFTplan fplan; +#ifdef AMREX_USE_CUDA // CUDA + cufftResult result = cufftPlan3d(&fplan, fft_size[2], fft_size[1], fft_size[0], CUFFT_Z2D); + if (result != CUFFT_SUCCESS) { + amrex::AllPrint() << " cufftplan3d forward failed! Error: " + << cufftError(result) << "\n"; + } +#elif AMREX_USE_HIP // HIP + const std::size_t lengths[] = {std::size_t(fft_size[0]),std::size_t(fft_size[1]),std::size_t(fft_size[2])}; + rocfft_status result = rocfft_plan_create(&fplan, rocfft_placement_notinplace, + rocfft_transform_type_real_inverse, rocfft_precision_double, + 3, lengths, 1, nullptr); + Assert_rocfft_status("rocfft_plan_create", result); +#else // host + fplan = fftw_plan_dft_c2r_3d(fft_size[2], fft_size[1], fft_size[0], + reinterpret_cast + (spectral_field.back()->dataPtr()), + vel_decomp_onegrid[mfi].dataPtr(), + FFTW_ESTIMATE); +#endif + backward_plan.push_back(fplan); + } + + ParallelDescriptor::Barrier(); + + // Backward Transform + for (MFIter mfi(vel_decomp_onegrid); mfi.isValid(); ++mfi) { + int i = mfi.LocalIndex(); +#ifdef AMREX_USE_CUDA + cufftSetStream(backward_plan[i], amrex::Gpu::gpuStream()); + cufftResult result = cufftExecZ2D(backward_plan[i], + reinterpret_cast + (spectral_field[i]->dataPtr()), + vel_decomp_onegrid[mfi].dataPtr()); + if (result != CUFFT_SUCCESS) { + amrex::AllPrint() << " forward transform using cufftExec failed! 
Error: " + << cufftError(result) << "\n"; + } +#elif AMREX_USE_HIP + rocfft_execution_info execinfo = nullptr; + rocfft_status result = rocfft_execution_info_create(&execinfo); + Assert_rocfft_status("rocfft_execution_info_create", result); + + std::size_t buffersize = 0; + result = rocfft_plan_get_work_buffer_size(backward_plan[i], &buffersize); + Assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); + + void* buffer = amrex::The_Arena()->alloc(buffersize); + result = rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize); + Assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); + + result = rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream()); + Assert_rocfft_status("rocfft_execution_info_set_stream", result); + + amrex::Real* vel_onegrid_ptr = vel_decomp_onegrid[mfi].dataPtr(); + FFTcomplex* spectral_field_ptr = reinterpret_cast(spectral_field[i]->dataPtr()); + result = rocfft_execute(backward_plan[i], + (void**) &vel_onegrid_ptr, // in + (void**) &spectral_field_ptr, // out + execinfo); + Assert_rocfft_status("rocfft_execute", result); + amrex::Gpu::streamSynchronize(); + amrex::The_Arena()->free(buffer); + result = rocfft_execution_info_destroy(execinfo); + Assert_rocfft_status("rocfft_execution_info_destroy", result); +#else + fftw_execute(backward_plan[i]); +#endif + } + + // destroy fft plan + for (int i = 0; i < backward_plan.size(); ++i) { +#ifdef AMREX_USE_CUDA + cufftDestroy(backward_plan[i]); +#elif AMREX_USE_HIP + rocfft_plan_destroy(backward_plan[i]); +#else + fftw_destroy_plan(backward_plan[i]); +#endif + } + +} + diff --git a/src_compressible/compressible_functions.cpp b/src_compressible/compressible_functions.cpp index 96bb6f50..3d818cfe 100644 --- a/src_compressible/compressible_functions.cpp +++ b/src_compressible/compressible_functions.cpp @@ -8,6 +8,7 @@ AMREX_GPU_MANAGED int compressible::do_1D; AMREX_GPU_MANAGED int compressible::do_2D; AMREX_GPU_MANAGED int compressible::all_correl; AMREX_GPU_MANAGED int compressible::nspec_surfcov = 0; +AMREX_GPU_MANAGED int compressible::turbRestartRun = 1; AMREX_GPU_MANAGED bool compressible::do_reservoir = false; AMREX_GPU_MANAGED amrex::Real compressible::zeta_ratio = -1.0; @@ -64,7 +65,14 @@ void InitializeCompressibleNamespace() all_correl = 0; pp.query("all_correl",all_correl); - + // restart for turbulence + // if 1: will advance time, if 0: only stats no advance time + pp.query("turbRestartRun",turbRestartRun); + if (turbRestartRun == 0) { + if (restart <= 0) amrex::Abort("turbRestartRun requires restarting from a checkpoint, restart > 0"); + if (max_step != restart+1) amrex::Abort("this is a single step run; max_step should be equal to restart+1"); + } + // do reservoir? 
if ((bc_mass_lo[0] == 4) or (bc_mass_lo[1] == 4) or (bc_mass_lo[2] == 4) or (bc_mass_hi[0] == 4) or (bc_mass_hi[1] == 4) or (bc_mass_hi[2] == 4)) { diff --git a/src_compressible/compressible_namespace.H b/src_compressible/compressible_namespace.H index e75c9935..d3195911 100644 --- a/src_compressible/compressible_namespace.H +++ b/src_compressible/compressible_namespace.H @@ -7,6 +7,7 @@ namespace compressible { extern AMREX_GPU_MANAGED int do_2D; extern AMREX_GPU_MANAGED int all_correl; extern AMREX_GPU_MANAGED int nspec_surfcov; + extern AMREX_GPU_MANAGED int turbRestartRun; extern AMREX_GPU_MANAGED bool do_reservoir; extern AMREX_GPU_MANAGED amrex::Real zeta_ratio; diff --git a/src_compressible_stag/Checkpoint.cpp b/src_compressible_stag/Checkpoint.cpp index caf1a722..e0b0201c 100644 --- a/src_compressible_stag/Checkpoint.cpp +++ b/src_compressible_stag/Checkpoint.cpp @@ -643,7 +643,7 @@ void ReadCheckPoint3D(int& step, dmap.define(ba, ParallelDescriptor::NProcs()); #if defined(TURB) - if (turbForcing > 1) { + if ((turbForcing > 1) and (turbRestartRun)) { turbforce.define(ba,dmap,turb_a,turb_b,turb_c,turb_d,turb_alpha); } #endif @@ -694,7 +694,7 @@ void ReadCheckPoint3D(int& step, #if defined(TURB) // Read in turbulent forcing - if (turbForcing > 1) { + if ((turbForcing > 1) and (turbRestartRun)) { Real fs_temp; Real fc_temp; for (int i=0; i<132; ++i) { @@ -1391,28 +1391,29 @@ void Read_Copy_MF_Checkpoint(amrex::MultiFab& mf, std::string mf_name, const std BoxArray& ba_old, DistributionMapping& dmap_old, int NVARS, int ghost, int nodal_flag) { - // define temporary MF - MultiFab mf_temp; - if (nodal_flag < 0) { - if (ghost) { - mf_temp.define(ba_old,dmap_old,NVARS,ngc); - } - else { - mf_temp.define(ba_old,dmap_old,NVARS,0); - } - - } - else { - if (ghost) { - mf_temp.define(convert(ba_old,nodal_flag_dir[nodal_flag]),dmap_old,NVARS,ngc); - } - else { - mf_temp.define(convert(ba_old,nodal_flag_dir[nodal_flag]),dmap_old,NVARS,0); - } - - } + //// define temporary MF + //MultiFab mf_temp; + //if (nodal_flag < 0) { + // if (ghost) { + // mf_temp.define(ba_old,dmap_old,NVARS,ngc); + // } + // else { + // mf_temp.define(ba_old,dmap_old,NVARS,0); + // } + + //} + //else { + // if (ghost) { + // mf_temp.define(convert(ba_old,nodal_flag_dir[nodal_flag]),dmap_old,NVARS,ngc); + // } + // else { + // mf_temp.define(convert(ba_old,nodal_flag_dir[nodal_flag]),dmap_old,NVARS,0); + // } + + //} // Read into temporary MF from file + MultiFab mf_temp; VisMF::Read(mf_temp,amrex::MultiFabFileFullPrefix(0, checkpointname, "Level_", mf_name)); // Copy temporary MF into the new MF diff --git a/src_compressible_stag/DeriveVelProp.cpp b/src_compressible_stag/DeriveVelProp.cpp index 87f3232d..c437cd38 100644 --- a/src_compressible_stag/DeriveVelProp.cpp +++ b/src_compressible_stag/DeriveVelProp.cpp @@ -8,6 +8,7 @@ void GetTurbQty(std::array< MultiFab, AMREX_SPACEDIM >& vel, std::array< MultiFab, AMREX_SPACEDIM >& cumom, MultiFab& prim, MultiFab& eta, + MultiFab& zeta, const amrex::Geometry& geom, Real& turbKE, Real& c_speed, Real& u_rms, @@ -26,11 +27,11 @@ void GetTurbQty(std::array< MultiFab, AMREX_SPACEDIM >& vel, // Setup temp MultiFabs std::array< MultiFab, AMREX_SPACEDIM > macTemp; MultiFab gradU; + MultiFab eta_bulk_diss; MultiFab sound_speed; MultiFab ccTemp; MultiFab ccTempA; MultiFab ccTempDiv; - MultiFab eta_kin; // kinematic viscosity std::array< MultiFab, NUM_EDGE > curlU; std::array< MultiFab, NUM_EDGE > eta_edge; std::array< MultiFab, NUM_EDGE > curlUtemp; @@ -42,7 +43,7 @@ void 
diff --git a/src_compressible_stag/DeriveVelProp.cpp b/src_compressible_stag/DeriveVelProp.cpp
index 87f3232d..c437cd38 100644
--- a/src_compressible_stag/DeriveVelProp.cpp
+++ b/src_compressible_stag/DeriveVelProp.cpp
@@ -8,6 +8,7 @@ void GetTurbQty(std::array< MultiFab, AMREX_SPACEDIM >& vel,
                 std::array< MultiFab, AMREX_SPACEDIM >& cumom,
                 MultiFab& prim,
                 MultiFab& eta,
+                MultiFab& zeta,
                 const amrex::Geometry& geom,
                 Real& turbKE, Real& c_speed,
                 Real& u_rms,
@@ -26,11 +27,11 @@ void GetTurbQty(std::array< MultiFab, AMREX_SPACEDIM >& vel,
     // Setup temp MultiFabs
     std::array< MultiFab, AMREX_SPACEDIM > macTemp;
     MultiFab gradU;
+    MultiFab eta_bulk_diss;
     MultiFab sound_speed;
     MultiFab ccTemp;
     MultiFab ccTempA;
     MultiFab ccTempDiv;
-    MultiFab eta_kin; // kinematic viscosity
     std::array< MultiFab, NUM_EDGE > curlU;
     std::array< MultiFab, NUM_EDGE > eta_edge;
     std::array< MultiFab, NUM_EDGE > curlUtemp;
@@ -42,7 +43,7 @@ void GetTurbQty(std::array< MultiFab, AMREX_SPACEDIM >& vel,
     ccTemp.define(prim.boxArray(),prim.DistributionMap(),1,0);
     ccTempA.define(prim.boxArray(),prim.DistributionMap(),1,0);
     ccTempDiv.define(prim.boxArray(),prim.DistributionMap(),1,0);
-    eta_kin.define(prim.boxArray(),prim.DistributionMap(),1,ngc);
+    if (visc_type == 3) eta_bulk_diss.define(prim.boxArray(),prim.DistributionMap(),1,0);
 #if (AMREX_SPACEDIM == 3)
     curlU[0].define(convert(prim.boxArray(),nodal_flag_xy), prim.DistributionMap(), 1, 0);
     curlU[1].define(convert(prim.boxArray(),nodal_flag_xz), prim.DistributionMap(), 1, 0);
@@ -63,19 +64,6 @@ void GetTurbQty(std::array< MultiFab, AMREX_SPACEDIM >& vel,
     curlUtemp[0].define(convert(prim.boxArray(),nodal_flag_xy), prim.DistributionMap(), 1, 0);
 #endif

-    // Get Kinematic Viscosity
-    for ( MFIter mfi(eta,TilingIfNotGPU()); mfi.isValid(); ++mfi ) {
-        // grow the box by ngc
-        const Box& bx = amrex::grow(mfi.tilebox(), ngc);
-        const Array4<Real>& eta_kin_fab = eta_kin.array(mfi);
-        const Array4<const Real>& eta_fab = eta.array(mfi);
-        const Array4<const Real>& prim_fab = prim.array(mfi);
-        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
-        {
-            eta_kin_fab(i,j,k) = eta_fab(i,j,k) / prim_fab(i,j,k,0);
-        });
-    }
-
     // Setup temp variables
     Real temp;
     Vector<Real> tempvec(3);
@@ -87,14 +75,38 @@ void GetTurbQty(std::array< MultiFab, AMREX_SPACEDIM >& vel,
     Vector<Real> eps_s_vec(3); // solenoidal dissipation

     // turbulent kinetic energy
-    StagInnerProd(cumom,0,vel,0,macTemp,rhouu);
+//    StagInnerProd(cumom,0,vel,0,macTemp,rhouu);
+    {
+        auto mask = cumom[0].OwnerMask(geom.periodicity());
+        rhouu[0] = MultiFab::Dot(cumom[0],0,vel[0],0,1,0);
+    }
+    {
+        auto mask = cumom[1].OwnerMask(geom.periodicity());
+        rhouu[1] = MultiFab::Dot(cumom[1],0,vel[1],0,1,0);
+    }
+    {
+        auto mask = cumom[2].OwnerMask(geom.periodicity());
+        rhouu[2] = MultiFab::Dot(cumom[2],0,vel[2],0,1,0);
+    }
     rhouu[0] /= (n_cells[0]+1)*n_cells[1]*n_cells[2];
     rhouu[1] /= (n_cells[1]+1)*n_cells[2]*n_cells[0];
     rhouu[2] /= (n_cells[2]+1)*n_cells[0]*n_cells[1];
     turbKE = 0.5*( rhouu[0] + rhouu[1] + rhouu[2] );

     // RMS velocity
-    StagInnerProd(vel,0,vel,0,macTemp,uu);
+//    StagInnerProd(vel,0,vel,0,macTemp,uu);
+    {
+        auto mask = vel[0].OwnerMask(geom.periodicity());
+        uu[0] = MultiFab::Dot(vel[0],0,vel[0],0,1,0);
+    }
+    {
+        auto mask = vel[1].OwnerMask(geom.periodicity());
+        uu[1] = MultiFab::Dot(vel[1],0,vel[1],0,1,0);
+    }
+    {
+        auto mask = vel[2].OwnerMask(geom.periodicity());
+        uu[2] = MultiFab::Dot(vel[2],0,vel[2],0,1,0);
+    }
     uu[0] /= (n_cells[0]+1)*n_cells[1]*n_cells[2];
     uu[1] /= (n_cells[1]+1)*n_cells[2]*n_cells[0];
     uu[2] /= (n_cells[2]+1)*n_cells[0]*n_cells[1];
@@ -167,28 +179,53 @@ void GetTurbQty(std::array< MultiFab, AMREX_SPACEDIM >& vel,
     // Compute \omega (curl)
     ComputeCurlFaceToEdge(vel,curlU,geom);

-    // Solenoidal dissipation: <eta_kin curlU, curlU> /
-    AverageCCToEdge(eta_kin,eta_edge,0,1,SPEC_BC_COMP,geom);
+    // Solenoidal dissipation: <eta curlU, curlU>
+    AverageCCToEdge(eta,eta_edge,0,1,SPEC_BC_COMP,geom);
     EdgeInnerProd(curlU,0,curlU,0,curlUtemp,tempvec);
-    EdgeInnerProd(curlUtemp,0,eta_edge,0,curlU,eps_s_vec);
+//    EdgeInnerProd(curlUtemp,0,eta_edge,0,curlU,eps_s_vec);
+    {
+        auto mask = curlUtemp[0].OwnerMask(geom.periodicity());
+        eps_s_vec[0] = MultiFab::Dot(curlUtemp[0],0,eta_edge[0],0,1,0);
+    }
+    {
+        auto mask = curlUtemp[1].OwnerMask(geom.periodicity());
+        eps_s_vec[1] = MultiFab::Dot(curlUtemp[1],0,eta_edge[1],0,1,0);
+    }
+    {
+        auto mask = curlUtemp[2].OwnerMask(geom.periodicity());
+        eps_s_vec[2] = MultiFab::Dot(curlUtemp[2],0,eta_edge[2],0,1,0);
+    }
     eps_s_vec[0] /= (n_cells[0]+1)*(n_cells[1]+1)*n_cells[2];
     eps_s_vec[1] /= (n_cells[0]+1)*(n_cells[2]+1)*n_cells[1];
     eps_s_vec[2] /= (n_cells[1]+1)*(n_cells[2]+1)*n_cells[0];
     eps_s = (eps_s_vec[0] + eps_s_vec[1] + eps_s_vec[2]);
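These hunks replace StagInnerProd/EdgeInnerProd with MultiFab::Dot over face- and edge-centered data. On staggered index types, points on shared box boundaries are duplicated across grids, which is what OwnerMask is for; note that the code above constructs the mask but then calls the unmasked Dot overload, so the mask is currently unused. A sketch of the masked variant (assuming the Dot overload that takes an iMultiFab mask, present in recent AMReX):

    #include <AMReX_MultiFab.H>
    #include <AMReX_iMultiFab.H>

    // Dot product of two staggered MultiFabs that counts each shared
    // face/edge point exactly once.
    amrex::Real StagDot (const amrex::MultiFab& a, const amrex::MultiFab& b,
                         const amrex::Geometry& geom)
    {
        auto mask = a.OwnerMask(geom.periodicity());   // std::unique_ptr<iMultiFab>
        return amrex::MultiFab::Dot(*mask, a, 0, b, 0, 1, 0);
    }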
-    // Dilational dissipation (4/3)*<eta_kin divU, divU> /
-    CCInnerProd(ccTempDiv,0,eta_kin,0,ccTemp,eps_d);
-    eps_d *= dProb*(4.0/3.0);
+    // Dilational dissipation (4/3)*<eta divU, divU>
+//    CCInnerProd(ccTempDiv,0,eta,0,ccTemp,eps_d);
+    if (visc_type == 3) {
+        // get eta_bulk_diss = zeta + 4/3 eta
+        MultiFab::LinComb(eta_bulk_diss, 1.0, zeta, 0,
+                          1.3333333333, eta, 0,
+                          0, 1, 0);
+        eps_d = MultiFab::Dot(eta_bulk_diss, 0, ccTempDiv, 0, 1, 0);
+        eps_d *= dProb;
+    }
+    else {
+        eps_d = MultiFab::Dot(eta, 0, ccTempDiv, 0, 1, 0);
+        eps_d *= dProb*(4.0/3.0);
+    }

     // Ratio of Dilational to Solenoidal dissipation
     eps_ratio = eps_d/eps_s;
     Real eps_t = eps_s + eps_d;

     // Kolmogorov scales
-    Real eta_kin_avg = ComputeSpatialMean(eta_kin, 0);
-    kolm_s = pow((eta_kin_avg*eta_kin_avg*eta_kin_avg/eps_s),0.25);
-    kolm_d = pow((eta_kin_avg*eta_kin_avg*eta_kin_avg/eps_d),0.25);
-    kolm_t = pow((eta_kin_avg*eta_kin_avg*eta_kin_avg/eps_t),0.25);
+    kolm_s = pow((eta_avg*eta_avg*eta_avg/(rho_avg*rho_avg*eps_s)),0.25);
+    kolm_d = pow((eta_avg*eta_avg*eta_avg/(rho_avg*rho_avg*eps_d)),0.25);
+    kolm_t = pow((eta_avg*eta_avg*eta_avg/(rho_avg*rho_avg*eps_t)),0.25);
+//    kolm_s = pow((eta_avg*eta_avg*eta_avg/eps_s),0.25);
+//    kolm_d = pow((eta_avg*eta_avg*eta_avg/eps_d),0.25);
+//    kolm_t = pow((eta_avg*eta_avg*eta_avg/eps_t),0.25);
 }
 #endif

@@ -485,9 +522,9 @@ void EvaluateWritePlotFileVelGrad(int step,

         const Box& bx = mfi.tilebox();

-        const Array4<Real> out = output.array(mfi);
+        const Array4< Real>& out = output.array(mfi);

-        const Array4<const Real> v_decomp = vel_decomp.array(mfi);
+        const Array4<const Real>& v_decomp = vel_decomp.array(mfi);

         AMREX_D_TERM(Array4<const Real> const& velx = vel[0].array(mfi);,
                      Array4<const Real> const& vely = vel[1].array(mfi);,
                      Array4<const Real> const& velz = vel[2].array(mfi););

@@ -495,11 +532,16 @@ void EvaluateWritePlotFileVelGrad(int step,
         amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
         {
-            for (int n=0;n<6;++n) {
-                out(i,j,k,n) = v_decomp(i,j,k,n);
-            }
-            out(i,j,k,6) = sqrt(out(i,j,k,0)*out(i,j,k,0) + out(i,j,k,1)*out(i,j,k,1) + out(i,j,k,2)*out(i,j,k,2)); // mag solenoidal
-            out(i,j,k,7) = sqrt(out(i,j,k,3)*out(i,j,k,3) + out(i,j,k,4)*out(i,j,k,4) + out(i,j,k,5)*out(i,j,k,5)); // mag dilatational
+
+            out(i,j,k,0) = v_decomp(i,j,k,0);
+            out(i,j,k,1) = v_decomp(i,j,k,1);
+            out(i,j,k,2) = v_decomp(i,j,k,2);
+            out(i,j,k,3) = v_decomp(i,j,k,3);
+            out(i,j,k,4) = v_decomp(i,j,k,4);
+            out(i,j,k,5) = v_decomp(i,j,k,5);
+
+            out(i,j,k,6) = std::sqrt(out(i,j,k,0)*out(i,j,k,0) + out(i,j,k,1)*out(i,j,k,1) + out(i,j,k,2)*out(i,j,k,2)); // mag solenoidal
+            out(i,j,k,7) = std::sqrt(out(i,j,k,3)*out(i,j,k,3) + out(i,j,k,4)*out(i,j,k,4) + out(i,j,k,5)*out(i,j,k,5)); // mag dilatational

             // divergence
             out(i,j,k,8) = (velx(i+1,j,k) - velx(i,j,k))/dx[0] +
@@ -549,7 +591,7 @@ void EvaluateWritePlotFileVelGrad(int step,
             Real w3_pp = u32_pp - u23_pp;

             // vorticity magnitude: sqrt(w1*w1 + w2*w2 + w3*w3)
-            out(i,j,k,9) = sqrt(0.25*(w1_mm*w1_mm + w1_mp*w1_mp + w1_pm*w1_pm + w1_pp*w1_pp +
+            out(i,j,k,9) = std::sqrt(0.25*(w1_mm*w1_mm + w1_mp*w1_mp + w1_pm*w1_pm + w1_pp*w1_pp +
                                            w2_mm*w2_mm + w2_mp*w2_mp + w2_pm*w2_pm + w2_pp*w2_pp +
                                            w3_mm*w3_mm + w3_mp*w3_mp + w3_pm*w3_pm + w3_pp*w3_pp));
         });
@@ -562,8 +604,8 @@ void EvaluateWritePlotFileVelGrad(int step,
     varNames[1] = "uy_s";
     varNames[2] = "uz_s";
     varNames[3] = "ux_d";
-    varNames[4] = "ux_d";
-    varNames[5] = "uy_d";
+    varNames[4] = "uy_d";
+    varNames[5] = "uz_d";
     varNames[6] = "umag_s";
     varNames[7] = "umag_d";
     varNames[8] = "div";
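A note on the rewritten dissipation and Kolmogorov expressions: for visc_type == 3 the dilational prefactor is the full $\zeta + \tfrac{4}{3}\eta$ (hence the LinComb with coefficients 1.0 and 1.3333333333); otherwise it reduces to $\tfrac{4}{3}\eta$, matching the two branches. The scales then fold density into the dynamic viscosity: with $\nu = \bar{\eta}/\bar{\rho}$,
$\eta_K = (\nu^3/\varepsilon)^{1/4} = \big(\bar{\eta}^{\,3}/(\bar{\rho}^{\,2}\varepsilon)\big)^{1/4}$,
which is exactly the pow(...,0.25) expressions above, with eta_avg and rho_avg presumably computed earlier in GetTurbQty.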
@@ -572,3 +614,123 @@ void EvaluateWritePlotFileVelGrad(int step,
 }
 #endif

+#if defined(TURB)
+void EvaluateWritePlotFileVelGradTiny(int step,
+                                      const amrex::Real time,
+                                      const amrex::Geometry& geom,
+                                      const std::array<MultiFab, AMREX_SPACEDIM>& vel,
+                                      const amrex::MultiFab& vel_decomp_in)
+{
+    BL_PROFILE_VAR("EvaluateWritePlotFileVelGradTiny()",EvaluateWritePlotFileVelGradTiny);
+
+    MultiFab output;
+
+    // 0: vorticity wx_shifted
+    // 1: vorticity wy_shifted
+    // 2: vorticity wz_shifted
+    // 3: vorticity wx_avg
+    // 4: vorticity wy_avg
+    // 5: vorticity wz_avg
+    // 6: vorticity_mag_shft_then_sq = sqrt(wx*wx + wy*wy + wz*wz)
+    // 7: vorticity_mag_avg_then_sq  = sqrt(wx*wx + wy*wy + wz*wz)
+    // 8: vorticity_mag_sq_then_avg  = sqrt(wx*wx + wy*wy + wz*wz)
+    // 9: divergence = u_1,1 + u_2,2 + u_3,3
+    output.define(convert(vel[0].boxArray(),IntVect(AMREX_D_DECL(0,0,0))), vel[0].DistributionMap(), 10, 0);
+    output.setVal(0.0);
+
+    const GpuArray<Real, AMREX_SPACEDIM> dx = geom.CellSizeArray();
+
+    for ( MFIter mfi(output,TilingIfNotGPU()); mfi.isValid(); ++mfi ) {
+
+        const Box& bx = mfi.tilebox();
+
+        const Array4< Real>& out = output.array(mfi);
+
+        AMREX_D_TERM(Array4<const Real> const& velx = vel[0].array(mfi);,
+                     Array4<const Real> const& vely = vel[1].array(mfi);,
+                     Array4<const Real> const& velz = vel[2].array(mfi););
+
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+        {
+
+            // divergence
+            out(i,j,k,9) = (velx(i+1,j,k) - velx(i,j,k))/dx[0] +
+                           (vely(i,j+1,k) - vely(i,j,k))/dx[1] +
+                           (velz(i,j,k+1) - velz(i,j,k))/dx[2] ;
+
+            // on edges: u_1,2 and u_2,1 and curl w1 = u_2,1 - u_1,2
+            Real u12_mm = (velx(i,j,k) - velx(i,j-1,k))/dx[1];
+            Real u21_mm = (vely(i,j,k) - vely(i-1,j,k))/dx[0];
+            Real w1_mm = u21_mm - u12_mm;
+            Real u12_mp = (velx(i,j+1,k) - velx(i,j,k))/dx[1];
+            Real u21_mp = (vely(i,j+1,k) - vely(i-1,j+1,k))/dx[0];
+            Real w1_mp = u21_mp - u12_mp;
+            Real u12_pm = (velx(i+1,j,k) - velx(i+1,j-1,k))/dx[1];
+            Real u21_pm = (vely(i+1,j,k) - vely(i,j,k))/dx[0];
+            Real w1_pm = u21_pm - u12_pm;
+            Real u12_pp = (velx(i+1,j+1,k) - velx(i+1,j,k))/dx[1];
+            Real u21_pp = (vely(i+1,j+1,k) - vely(i,j+1,k))/dx[0];
+            Real w1_pp = u21_pp - u12_pp;
+            out(i,j,k,0) = w1_mm;
+            out(i,j,k,3) = 0.5*(w1_mm+w1_mp+w1_pm+w1_pp);
+
+            // on edges: u_1,3 and u_3,1 and curl w2 = u_1,3 - u_3,1
+            Real u13_mm = (velx(i,j,k) - velx(i,j,k-1))/dx[2];
+            Real u31_mm = (velz(i,j,k) - velz(i-1,j,k))/dx[0];
+            Real w2_mm = u13_mm - u31_mm;
+            Real u13_mp = (velx(i,j,k+1) - velx(i,j,k))/dx[2];
+            Real u31_mp = (velz(i,j,k+1) - velz(i-1,j,k+1))/dx[0];
+            Real w2_mp = u13_mp - u31_mp;
+            Real u13_pm = (velx(i+1,j,k) - velx(i+1,j,k-1))/dx[2];
+            Real u31_pm = (velz(i+1,j,k) - velz(i,j,k))/dx[0];
+            Real w2_pm = u13_pm - u31_pm;
+            Real u13_pp = (velx(i+1,j,k+1) - velx(i+1,j,k))/dx[2];
+            Real u31_pp = (velz(i+1,j,k+1) - velz(i,j,k+1))/dx[0];
+            Real w2_pp = u13_pp - u31_pp;
+            out(i,j,k,1) = w2_mm;
+            out(i,j,k,4) = 0.5*(w2_mm+w2_mp+w2_pm+w2_pp);
+
+            // on edges: u_2,3 and u_3,2 and curl w3 = u_3,2 - u_2,3
+            Real u23_mm = (vely(i,j,k) - vely(i,j,k-1))/dx[2];
+            Real u32_mm = (velz(i,j,k) - velz(i,j-1,k))/dx[1];
+            Real w3_mm = u32_mm - u23_mm;
+            Real u23_mp = (vely(i,j,k+1) - vely(i,j,k))/dx[2];
+            Real u32_mp = (velz(i,j,k+1) - velz(i,j-1,k+1))/dx[1];
+            Real w3_mp = u32_mp - u23_mp;
+            Real u23_pm = (vely(i,j+1,k) - vely(i,j+1,k-1))/dx[2];
+            Real u32_pm = (velz(i,j+1,k) - velz(i,j,k))/dx[1];
+            Real w3_pm = u32_pm - u23_pm;
+            Real u23_pp = (vely(i,j+1,k+1) - vely(i,j+1,k))/dx[2];
+            Real u32_pp = (velz(i,j+1,k+1) - velz(i,j,k+1))/dx[1];
+            Real w3_pp = u32_pp - u23_pp;
+            out(i,j,k,2) = w3_mm;
+            out(i,j,k,5) = 0.5*(w3_mm+w3_mp+w3_pm+w3_pp);
+
+            // vorticity magnitudes: sqrt(w1*w1 + w2*w2 + w3*w3)
+            out(i,j,k,6) = std::sqrt(w1_mm*w1_mm + w2_mm*w2_mm + w3_mm*w3_mm);
+            out(i,j,k,7) = std::sqrt(out(i,j,k,3)*out(i,j,k,3) + out(i,j,k,4)*out(i,j,k,4) +
+                                     out(i,j,k,5)*out(i,j,k,5));
+            out(i,j,k,8) = std::sqrt(0.25*(w1_mm*w1_mm + w1_mp*w1_mp + w1_pm*w1_pm + w1_pp*w1_pp +
+                                           w2_mm*w2_mm + w2_mp*w2_mp + w2_pm*w2_pm + w2_pp*w2_pp +
+                                           w3_mm*w3_mm + w3_mp*w3_mp + w3_pm*w3_pm + w3_pp*w3_pp));
+        });
+    }
+
+    // Write on a plotfile
+    std::string plotfilename = amrex::Concatenate("vort_div",step,9);
+    amrex::Vector<std::string> varNames(10);
+    varNames[0] = "w1_shift";
+    varNames[1] = "w2_shift";
+    varNames[2] = "w3_shift";
+    varNames[3] = "w1_avg";
+    varNames[4] = "w2_avg";
+    varNames[5] = "w3_avg";
+    varNames[6] = "vort_mag_shft";
+    varNames[7] = "vort_mag_shft_avg";
+    varNames[8] = "vort_mag_avg";
+    varNames[9] = "div";
+    WriteSingleLevelPlotfile(plotfilename,output,varNames,geom,time,step);
+}
+#endif
+
+
diff --git a/src_compressible_stag/compressible_functions_stag.H b/src_compressible_stag/compressible_functions_stag.H
index 10143477..f0b628ae 100644
--- a/src_compressible_stag/compressible_functions_stag.H
+++ b/src_compressible_stag/compressible_functions_stag.H
@@ -54,6 +54,11 @@ void EvaluateWritePlotFileVelGrad(int step,
                                   const amrex::Geometry& geom,
                                   const std::array<MultiFab, AMREX_SPACEDIM>& vel,
                                   const amrex::MultiFab& vel_decomp);
+void EvaluateWritePlotFileVelGradTiny(int step,
+                                      const amrex::Real time,
+                                      const amrex::Geometry& geom,
+                                      const std::array<MultiFab, AMREX_SPACEDIM>& vel,
+                                      const amrex::MultiFab& vel_decomp);
 #endif

 void conservedToPrimitiveStag(MultiFab& prim_in, std::array<MultiFab, AMREX_SPACEDIM>& velStag_in,
@@ -434,6 +439,7 @@ void GetTurbQty(std::array< MultiFab, AMREX_SPACEDIM >& vel,
                 std::array< MultiFab, AMREX_SPACEDIM >& cumom,
                 MultiFab& prim,
                 MultiFab& eta,
+                MultiFab& zeta,
                 const amrex::Geometry& geom,
                 Real& turbKE, Real& c_speed,
                 Real& u_rms,
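EvaluateWritePlotFileVelGradTiny writes three vorticity-magnitude estimators that differ only in where the nonlinearity sits relative to the edge-to-center averaging. A self-contained sketch of the two averaged variants for one component (names hypothetical; note the patch itself sums the four edges with a 0.5 prefactor rather than the 0.25 of a plain average):

    #include <cmath>

    // Four edge-centered vorticity values around one cell center.
    // "avg then square": average the edges, then square (component entering out(..,7) above).
    double avg_then_sq (double wmm, double wmp, double wpm, double wpp)
    {
        double wavg = 0.25*(wmm + wmp + wpm + wpp);
        return wavg*wavg;
    }

    // "square then avg": average the squares, i.e. an RMS over edges (out(..,8) above).
    double sq_then_avg (double wmm, double wmp, double wpm, double wpp)
    {
        return 0.25*(wmm*wmm + wmp*wmp + wpm*wpm + wpp*wpp);
    }

By Jensen's inequality avg_then_sq <= sq_then_avg, so the "avg" magnitude is a lower bound on the RMS one; comparing the two fields in the plotfile shows how much cancellation the edge averaging introduces.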
diff --git a/src_compressible_stag/main_driver.cpp b/src_compressible_stag/main_driver.cpp
index 98267a1e..6dc86a31 100644
--- a/src_compressible_stag/main_driver.cpp
+++ b/src_compressible_stag/main_driver.cpp
@@ -1,3 +1,4 @@
+#include "TurbSpectra.H"
 #include "common_functions.H"
 #include "compressible_functions.H"
 #include "compressible_functions_stag.H"
@@ -260,10 +261,16 @@ void main_driver(const char* argv)

 #if defined(TURB)
     // data structure for turbulence diagnostics
+    MultiFab MFTurbScalar;
+    MultiFab MFTurbVel;
+    MultiFab vel_decomp;
     std::string turbfilename = "turbstats";
     std::ofstream turboutfile;
     std::string turbfilenamedecomp = "turbstatsdecomp";
     std::ofstream turboutfiledecomp;
+    // need to use dVol for scaling
+    Real dVol = (AMREX_SPACEDIM==2) ? dx[0]*dx[1]*cell_depth : dx[0]*dx[1]*dx[2];
+    Real dVolinv = 1.0/dVol;
 #endif

     /////////////////////////////////////////////
@@ -434,40 +441,6 @@ void main_driver(const char* argv)
         var_scaling_cons[d] = 1./(dx[0]*dx[1]*dx[2]);
     }

-#if defined(TURB)
-    //////////////////////////////////////////////////////////////
-    // structure factor variables names and scaling for turbulence
-    // variables are velocities, density, pressure and temperature
-    //////////////////////////////////////////////////////////////
-    // need to use dVol for scaling
-    Real dVol = (AMREX_SPACEDIM==2) ? dx[0]*dx[1]*cell_depth : dx[0]*dx[1]*dx[2];
-    Real dVolinv = 1.0/dVol;
-
-    MultiFab structFactMFTurbVel;
-    MultiFab structFactMFTurbScalar;
-    MultiFab vel_decomp;
-
-    Vector< std::string > var_names_turbVelTotal{"ux","uy","uz"};
-    Vector<Real> var_scaling_turbVelTotal(3, dVolinv);
-    amrex::Vector< int > s_pairA_turbVelTotal(3);
-    amrex::Vector< int > s_pairB_turbVelTotal(3);
-    for (int d=0; d<3; ++d) {
-        s_pairA_turbVelTotal[d] = d;
-        s_pairB_turbVelTotal[d] = d;
-    }
-
-    Vector<Real> var_scaling_turbVelDecomp(6, dVolinv);
-
-    Vector< std::string > var_names_turbScalar{"rho","tenp","press"};
-    Vector<Real> var_scaling_turbScalar(3, dVolinv);
-    amrex::Vector< int > s_pairA_turbScalar(3);
-    amrex::Vector< int > s_pairB_turbScalar(3);
-    for (int d=0; d<3; ++d) {
-        s_pairA_turbScalar[d] = d;
-        s_pairB_turbScalar[d] = d;
-    }
-#endif
-
     // object for turbulence forcing
     TurbForcingComp turbforce;

@@ -731,6 +704,7 @@ void main_driver(const char* argv)
 #if defined(TURB)
     if (turbForcing > 0) {
         EvaluateWritePlotFileVelGrad(0, 0.0, geom, vel, vel_decomp);
+        EvaluateWritePlotFileVelGradTiny(0, 0.0, geom, vel, vel_decomp);
     }
 #endif

@@ -764,6 +738,15 @@ void main_driver(const char* argv)

     } // end t=0 setup

+
+#if defined(TURB)
+    if (turbForcing >= 1) {
+        MFTurbVel.define(ba, dmap, 3, 0);
+        MFTurbScalar.define(ba, dmap, 3, 0);
+        vel_decomp.define(ba, dmap, 6, 0);
+        vel_decomp.setVal(0.0);
+    }
+#endif

     ///////////////////////////////////////////
     // Setup Structure factor
@@ -900,26 +883,6 @@ void main_driver(const char* argv)
         }
     }

-
-#if defined(TURB)
-    if (turbForcing >= 1) {
-
-        structFactMFTurbVel.define(ba, dmap, 3, 0);
-        structFactMFTurbScalar.define(ba, dmap, 6, 0);
-        vel_decomp.define(ba, dmap, 6, 0);
-        vel_decomp.setVal(0.0);
-
-        turbStructFactVelTotal.define(ba,dmap,
-                                      var_names_turbVelTotal,var_scaling_turbVelTotal,
-                                      s_pairA_turbVelTotal,s_pairB_turbVelTotal);
-        turbStructFactScalar.define(ba,dmap,
-                                    var_names_turbScalar,var_scaling_turbScalar,
-                                    s_pairA_turbScalar,s_pairB_turbScalar);
-        turbStructFactVelDecomp.defineDecomp(ba,dmap,
-                                             var_names_turbVelTotal,var_scaling_turbVelDecomp,
-                                             s_pairA_turbVelTotal,s_pairB_turbVelTotal);
-    }
-#endif

     if (n_ads_spec>0) {

@@ -1080,7 +1043,7 @@ void main_driver(const char* argv)

 #if defined(TURB)
     // Initialize Turbulence Forcing Object
-    if (turbForcing > 1) {
+    if ((turbForcing > 1) and (turbRestartRun)) {
         turbforce.Initialize(geom);
     }
 #endif

@@ -1127,8 +1090,13 @@ void main_driver(const char* argv)
         }

         // FHD
-        RK3stepStag(cu, cumom, prim, vel, source, eta, zeta, kappa, chi, D,
-                    faceflux, edgeflux_x, edgeflux_y, edgeflux_z, cenflux, ranchem, geom, dt, step, turbforce);
+        if (turbRestartRun) {
+            RK3stepStag(cu, cumom, prim, vel, source, eta, zeta, kappa, chi, D,
+                        faceflux, edgeflux_x, edgeflux_y, edgeflux_z, cenflux, ranchem, geom, dt, step, turbforce);
+        }
+        else {
+            calculateTransportCoeffs(prim, eta, zeta, kappa, chi, D);
+        }

         if (n_ads_spec>0 && splitting_MFsurfchem == 1) sample_MFsurfchem(cu, prim, surfcov, dNadsdes, geom, dt/2.0);

@@ -1277,6 +1245,9 @@ void main_driver(const char* argv)
                 writePlt = ((step+1)%plot_int == 0);
             }
         }
+#if defined(TURB)
+        if ((turbRestartRun == 0) and (turbForcing >= 1)) writePlt = true;
+#endif

         if (writePlt) {
             //yzAverage(cuMeans, cuVars, primMeans, primVars, spatialCross,
@@ -1284,12 +1255,6 @@ void main_driver(const char* argv)
             WritePlotFileStag(step, time, geom, cu, cuMeans, cuVars, cumom, cumomMeans, cumomVars,
                               prim, primMeans, primVars, vel, velMeans, velVars, coVars,
                               surfcov, surfcovMeans, surfcovVars, surfcovcoVars, eta, kappa, zeta);
-#if defined(TURB)
-            if (turbForcing > 0) {
-                EvaluateWritePlotFileVelGrad(step, time, geom, vel, vel_decomp);
-            }
-#endif
-
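With turbRestartRun == 0 the main loop above degenerates to a single analysis pass: no RK3 advance, only a refresh of the transport coefficients so the turbulence diagnostics see eta/zeta/kappa consistent with the checkpointed state, with writePlt forced on. A schematic of the resulting step (placeholder functions stand in for the solver calls; this is not the actual driver code):

    // Placeholder bodies; in the driver these are RK3stepStag(...),
    // calculateTransportCoeffs(...), and the TURB output blocks.
    void AdvanceOneRK3Step ()      { /* advance cu, cumom, prim, vel */ }
    void RefreshTransportCoeffs () { /* recompute eta, zeta, kappa, chi, D from prim */ }
    void WriteTurbDiagnostics ()   { /* GetTurbQty(...), spectra, plotfiles */ }

    void Step (int turbRestartRun)
    {
        if (turbRestartRun) {
            AdvanceOneRK3Step();        // normal run: time actually advances
        } else {
            RefreshTransportCoeffs();   // stats-only restart: state is frozen
        }
        WriteTurbDiagnostics();         // forced every step when turbRestartRun == 0
    }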
             if (plot_cross) {
                 if (do_1D) {
                     WriteSpatialCross1D(spatialCross1D, step, geom, ncross);

@@ -1315,27 +1280,26 @@ void main_driver(const char* argv)
-            // copy velocities into structFactMFTurb
-            for(int d=0; d<AMREX_SPACEDIM; ++d) {
-                ShiftFaceToCC(vel[d],0,structFactMFTurbVel,d,1);
-            }
+            // copy velocities into MFTurbVel
+            for(int d=0; d<AMREX_SPACEDIM; ++d) {
+                ShiftFaceToCC(vel[d],0,MFTurbVel,d,1);
+            }
+            Vector< std::string > var_names_turbVel{"vel_total","vel_solenoidal","vel_dilation"};
+            Real scaling_turb_veldecomp = dVolinv;
+            TurbSpectrumVelDecomp(MFTurbVel, vel_decomp, geom, step, scaling_turb_veldecomp, var_names_turbVel);

-            // total velocity
-            turbStructFactVelTotal.FortStructure(structFactMFTurbVel,geom,1);
-            turbStructFactVelTotal.CallFinalize(geom);
-            turbStructFactVelTotal.IntegratekShells(step,geom,"vel_total");
-
-            // scalars
-            turbStructFactScalar.FortStructure(structFactMFTurbScalar,geom,1);
-            turbStructFactScalar.CallFinalize(geom);
-            turbStructFactScalar.IntegratekShellsScalar(step,geom,var_names_turbScalar);
+            // scalars
+            Vector< std::string > var_names_turbScalar{"rho","temp","press"};
+            Vector<Real> scaling_turb_scalar(3, dVolinv);
+            TurbSpectrumScalar(MFTurbScalar, geom, step, scaling_turb_scalar, var_names_turbScalar);
+        }
+
+        if (turbForcing > 0) {
+            EvaluateWritePlotFileVelGrad(step, time, geom, vel, vel_decomp);
+            EvaluateWritePlotFileVelGradTiny(step, time, geom, vel, vel_decomp);
+        }
 #endif
         }

@@ -1343,7 +1307,8 @@ void main_driver(const char* argv)

 #if defined(TURB)
         // turbulence outputs
-        if ((turbForcing >= 1) and (step%1000 == 0)) {
+        if (((turbForcing >= 1) and (step%1000 == 0)) or
+            ((turbForcing >= 1) and (turbRestartRun == 0))) {

             Real turbKE, c_speed, u_rms,
                  taylor_len, taylor_Re, taylor_Ma,
                  skew, kurt,
                  eps_s, eps_d, eps_ratio,
                  kolm_s, kolm_d, kolm_t;
@@ -1351,7 +1316,7 @@ void main_driver(const char* argv)
                 vel[i].FillBoundary(geom.periodicity());
                 cumom[i].FillBoundary(geom.periodicity());
             }
-            GetTurbQty(vel, cumom, prim, eta, geom,
+            GetTurbQty(vel, cumom, prim, eta, zeta, geom,
                        turbKE, c_speed, u_rms,
                        taylor_len, taylor_Re, taylor_Ma,
                        skew, kurt,
@@ -1377,7 +1342,9 @@ void main_driver(const char* argv)
             turboutfile << std::endl;
         }

-        if ((turbForcing >= 1) and (writePlt)) {
+        if (((turbForcing >= 1) and (writePlt)) or
+            ((turbForcing >= 1) and (turbRestartRun == 0))) {
+
             Real turbKE_s, turbKE_d, delta_turbKE;
             Real u_rms_s, u_rms_d, delta_u_rms;
             Real taylor_Ma_d;
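The FortStructure/IntegratekShells path is replaced here by two free functions from the new TurbSpectra.H. Their signatures below are inferred from these call sites only; the authoritative declarations live in TurbSpectra.H:

    #include <string>
    #include <AMReX_MultiFab.H>
    #include <AMReX_Geometry.H>
    #include <AMReX_Vector.H>

    // Scalar spectra: one scaling factor and one name per component of 'scalars'.
    void TurbSpectrumScalar (const amrex::MultiFab& scalars,
                             const amrex::Geometry& geom, int step,
                             const amrex::Vector<amrex::Real>& scaling,
                             const amrex::Vector<std::string>& var_names);

    // Velocity spectra: fills 'vel_decomp' with the Helmholtz decomposition and
    // writes radially integrated spectra for the total/solenoidal/dilatational parts.
    void TurbSpectrumVelDecomp (const amrex::MultiFab& vel,
                                amrex::MultiFab& vel_decomp,
                                const amrex::Geometry& geom, int step,
                                amrex::Real scaling,
                                const amrex::Vector<std::string>& var_names);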