From f67154937f457369a5ce9006f72a5ad9ef7e1b89 Mon Sep 17 00:00:00 2001 From: Ishan Srivastava Date: Fri, 26 Jan 2024 14:19:42 -0800 Subject: [PATCH] spectral calculations for turbulence output --- .../SPECTRAL_FILTER/GNUmakefile | 65 ++ .../SPECTRAL_FILTER/Make.package | 5 + .../SPECTRAL_FILTER/build_perlmutter.sh | 30 + .../SPECTRAL_FILTER/main.cpp | 28 + .../SPECTRAL_FILTER/main_driver.cpp | 358 ++++ .../SPECTRAL_FILTER/spectral_functions.H | 100 ++ .../SPECTRAL_FILTER/spectral_functions.cpp | 1005 +++++++++++++++++ exec/compressible_stag/build_perlmutter.sh | 30 + 8 files changed, 1621 insertions(+) create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/GNUmakefile create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/Make.package create mode 100755 exec/compressible_stag/SPECTRAL_FILTER/build_perlmutter.sh create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/main.cpp create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/main_driver.cpp create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.H create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.cpp create mode 100755 exec/compressible_stag/build_perlmutter.sh diff --git a/exec/compressible_stag/SPECTRAL_FILTER/GNUmakefile b/exec/compressible_stag/SPECTRAL_FILTER/GNUmakefile new file mode 100644 index 00000000..141c3a62 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/GNUmakefile @@ -0,0 +1,65 @@ +AMREX_HOME ?= ../../../../amrex/ + +DEBUG = FALSE +USE_MPI = TRUE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +COMP = gnu +DIM = 3 +TINY_PROFILE = FALSE + +USE_HEFFTE_FFTW = FALSE +USE_HEFFTE_CUFFT = FALSE +USE_HEFFTE_ROCFFT = FALSE + +ifeq ($(USE_HEFFTE_FFTW),TRUE) + HEFFTE_HOME ?= ../../../../heffte/ +else ifeq ($(USE_HEFFTE_CUFFT),TRUE) + HEFFTE_HOME ?= ../../../../heffte/ +else ifeq ($(USE_HEFFTE_ROCFFT),TRUE) + HEFFTE_HOME ?= ../../../../heffte/ +endif + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +VPATH_LOCATIONS += . 
+INCLUDE_LOCATIONS += . + +ifeq ($(USE_HEFFTE_FFTW),TRUE) + include $(HEFFTE_HOME)/src/Make.package +else ifeq ($(USE_HEFFTE_CUFFT),TRUE) + include $(HEFFTE_HOME)/src/Make.package +else ifeq ($(USE_HEFFTE_ROCFFT),TRUE) + include $(HEFFTE_HOME)/src/Make.package +endif + +include ./Make.package + +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules + +ifeq ($(USE_CUDA),TRUE) + LIBRARIES += -lcufft +else ifeq ($(USE_HIP),TRUE) + # Use rocFFT. ROC_PATH is defined in amrex + INCLUDE_LOCATIONS += $(ROC_PATH)/rocfft/include + LIBRARY_LOCATIONS += $(ROC_PATH)/rocfft/lib + LIBRARIES += -L$(ROC_PATH)/rocfft/lib -lrocfft +else + LIBRARIES += -L$(FFTW_DIR) -lfftw3_mpi -lfftw3 -lfftw3f +endif + +ifeq ($(DO_TURB), TRUE) + DEFINES += -DTURB +endif + +ifeq ($(USE_HEFFTE_FFTW),TRUE) + DEFINES += -DHEFFTE_FFTW + LIBRARIES += -L$(FFTW_DIR) -lfftw3_mpi -lfftw3 -lfftw3f +else ifeq ($(USE_HEFFTE_CUFFT),TRUE) + DEFINES += -DHEFFTE_CUFFT +else ifeq ($(USE_HEFFTE_ROCFFT),TRUE) + DEFINES += -DHEFFTE_ROCFFT +endif diff --git a/exec/compressible_stag/SPECTRAL_FILTER/Make.package b/exec/compressible_stag/SPECTRAL_FILTER/Make.package new file mode 100644 index 00000000..e0391922 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/Make.package @@ -0,0 +1,5 @@ +CEXE_sources += main.cpp +CEXE_sources += main_driver.cpp +CEXE_sources += spectral_functions.cpp + +CEXE_headers += spectral_functions.H diff --git a/exec/compressible_stag/SPECTRAL_FILTER/build_perlmutter.sh b/exec/compressible_stag/SPECTRAL_FILTER/build_perlmutter.sh new file mode 100755 index 00000000..f6becf08 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/build_perlmutter.sh @@ -0,0 +1,30 @@ +#!/usr/bin/bash + +# required dependencies +module load gpu +module load PrgEnv-gnu +module load craype +module load craype-x86-milan +module load craype-accel-nvidia80 +module load cudatoolkit +module load cmake/3.24.3 + +# necessary to use CUDA-Aware MPI and run a job +export 
CRAY_ACCEL_TARGET=nvidia80 + +# optimize CUDA compilation for A100 +export AMREX_CUDA_ARCH=8.0 + +# optimize CPU microarchitecture for AMD EPYC 3rd Gen (Milan/Zen3) +# note: the cc/CC/ftn wrappers below add those +export CXXFLAGS="-march=znver3" +export CFLAGS="-march=znver3" + +# compiler environment hints +export CC=cc +export CXX=CC +export FC=ftn +export CUDACXX=$(which nvcc) +export CUDAHOSTCXX=CC + +make -j10 USE_CUDA=TRUE USE_HEFFTE_CUFFT=TRUE USE_ASSERTION=TRUE diff --git a/exec/compressible_stag/SPECTRAL_FILTER/main.cpp b/exec/compressible_stag/SPECTRAL_FILTER/main.cpp new file mode 100644 index 00000000..95c149e5 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/main.cpp @@ -0,0 +1,28 @@ +#include +//#include + +// function declaration +void main_driver (const char* argv); + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc,argv); + +// this specific part has been moved to Flagellum/main_driver.cpp +// { +// amrex::ParmParse pp("particles"); +//#ifdef AMREX_USE_GPU +// bool particles_do_tiling = true; +//#else +// bool particles_do_tiling = false; +//#endif +// pp.queryAdd("do_tiling", particles_do_tiling); +// } + + // argv[1] contains the name of the inputs file entered at the command line + main_driver(argv[1]); + + amrex::Finalize(); + + return 0; +} diff --git a/exec/compressible_stag/SPECTRAL_FILTER/main_driver.cpp b/exec/compressible_stag/SPECTRAL_FILTER/main_driver.cpp new file mode 100644 index 00000000..2c8c7fab --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/main_driver.cpp @@ -0,0 +1,358 @@ +#include "spectral_functions.H" +#include +#include +#include "AMReX_ParmParse.H" + +#include "chrono" + +using namespace std::chrono; +using namespace amrex; + +// argv contains the name of the inputs file entered at the command line +void main_driver(const char* argv) +{ + BL_PROFILE_VAR("main_driver()",main_driver); + + amrex::Vector nodal_flag_dir; + amrex::IntVect nodal_flag_x; + amrex::IntVect nodal_flag_y; + 
amrex::IntVect nodal_flag_z; + nodal_flag_dir.resize(3); + + for (int i=0; i<3; ++i) { + nodal_flag_x[i] = int(i==0); + nodal_flag_y[i] = int(i==1); + nodal_flag_z[i] = int(i==2); + AMREX_D_TERM(nodal_flag_dir[0][i] = nodal_flag_x[i];, + nodal_flag_dir[1][i] = nodal_flag_y[i];, + nodal_flag_dir[2][i] = nodal_flag_z[i];); + } + + // timer + Real ts1 = ParallelDescriptor::second(); + + std::string inputs_file = argv; + + ParmParse pp; + amrex::Vector temp_real(3,0.); + amrex::Vector temp_int (3,0 ); + + amrex::Vector max_grid_size(3,1 ); + amrex::Vector n_cells(3,0 ); + amrex::Vector prob_lo(3,0 ); + amrex::Vector prob_hi(3,0 ); + + if (pp.queryarr("n_cells",temp_int,0,3)) { + for (int i=0; i<3; ++i) { + n_cells[i] = temp_int[i]; + } + } + int npts = n_cells[0]*n_cells[1]*n_cells[2]; + if (pp.queryarr("prob_lo",temp_real,0,3)) { + for (int i=0; i<3; ++i) { + prob_lo[i] = temp_real[i]; + } + } + if (pp.queryarr("prob_hi",temp_real,0,3)) { + for (int i=0; i<3; ++i) { + prob_hi[i] = temp_real[i]; + } + } + pp.queryarr("max_grid_size",max_grid_size,0,3); + + int restart; + pp.query("restart",restart); + + int nprimvars; + pp.query("nprimvars",nprimvars); + + amrex::IntVect ngc; + for (int i=0; i<3; ++i) { + ngc[i] = 1; // number of ghost cells + } + if (pp.queryarr("ngc",temp_int,0,3)) { + for (int i=0; i<3; ++i) { + ngc[i] = temp_int[i]; + } + } + + amrex::Real kmin; + pp.query("kmin",kmin); + + amrex::Real kmax; + pp.query("kmax",kmax); + + std::array< MultiFab, 3 > vel; + MultiFab prim; + + // make BoxArray and Geometry + BoxArray ba; + Geometry geom; + DistributionMapping dmap; + + IntVect dom_lo(AMREX_D_DECL( 0, 0, 0)); + IntVect dom_hi(AMREX_D_DECL(n_cells[0]-1, n_cells[1]-1, n_cells[2]-1)); + Box domain(dom_lo, dom_hi); + + // This defines the physical box, [-1,1] in each direction. 
+ RealBox real_box({AMREX_D_DECL(prob_lo[0],prob_lo[1],prob_lo[2])}, + {AMREX_D_DECL(prob_hi[0],prob_hi[1],prob_hi[2])}); + + // This defines a Geometry object + Vector is_periodic(3,1); // force to be periodic -- can change later + geom.define(domain,&real_box,CoordSys::cartesian,is_periodic.data()); + + const Real* dx = geom.CellSize(); + const RealBox& realDomain = geom.ProbDomain(); + + SpectralReadCheckPoint(geom, domain, prim, vel, ba, dmap, n_cells, nprimvars, max_grid_size, ngc, restart); + + MultiFab MFTurbScalar; + MultiFab MFTurbVel; + MultiFab vel_decomp_filter; + MultiFab scalar_filter; + MFTurbVel.define(ba, dmap, 3, 0); + MFTurbScalar.define(ba, dmap, 1, 0); + vel_decomp_filter.define(ba, dmap, 9, 0); + scalar_filter.define(ba, dmap, 1, 0); + vel_decomp_filter.setVal(0.0); + scalar_filter.setVal(0.0); + + // Set BC: 1) fill boundary 2) physical + for (int d=0; d<3; d++) { + vel[d].FillBoundary(geom.periodicity()); + } + prim.FillBoundary(geom.periodicity()); + + for(int d=0; d<3; d++) { + ShiftFaceToCC(vel[d], 0, MFTurbVel, d, 1); + } + MultiFab::Copy(MFTurbScalar, prim, 0, 0, 1, 0); + + SpectralVelDecomp(MFTurbVel, vel_decomp_filter, kmin, kmax, geom, n_cells); + SpectralScalarDecomp(MFTurbScalar, scalar_filter, kmin, kmax, geom, n_cells); + + SpectralWritePlotFile(restart, kmin, kmax, geom, vel_decomp_filter, scalar_filter); + + // Turbulence Diagnostics + Real u_rms, u_rms_s, u_rms_d, delta_u_rms; + Real taylor_len, taylor_Re_eta; + Real skew, skew_s, skew_d, kurt, kurt_s, kurt_d; + { + vel_decomp_filter.FillBoundary(geom.periodicity()); + + Vector dProb(3); + dProb[0] = 1.0/((n_cells[0]+1)*n_cells[1]*n_cells[2]); + dProb[1] = 1.0/((n_cells[1]+1)*n_cells[2]*n_cells[0]); + dProb[2] = 1.0/((n_cells[2]+1)*n_cells[0]*n_cells[1]); + + // Setup temp MultiFabs + std::array< MultiFab, AMREX_SPACEDIM > gradU; + std::array< MultiFab, AMREX_SPACEDIM > faceTemp; + MultiFab sound_speed; + MultiFab ccTemp; + MultiFab ccTempA; + 
AMREX_D_TERM(gradU[0].define(convert(prim.boxArray(),nodal_flag_x), prim.DistributionMap(), 6, 0);, + gradU[1].define(convert(prim.boxArray(),nodal_flag_y), prim.DistributionMap(), 6, 0);, + gradU[2].define(convert(prim.boxArray(),nodal_flag_z), prim.DistributionMap(), 6, 0);); + AMREX_D_TERM(faceTemp[0].define(convert(prim.boxArray(),nodal_flag_x), prim.DistributionMap(), 1, 0);, + faceTemp[1].define(convert(prim.boxArray(),nodal_flag_y), prim.DistributionMap(), 1, 0);, + faceTemp[2].define(convert(prim.boxArray(),nodal_flag_z), prim.DistributionMap(), 1, 0);); + sound_speed.define(prim.boxArray(),prim.DistributionMap(),1,0); + ccTemp.define(prim.boxArray(),prim.DistributionMap(),1,0); + ccTempA.define(prim.boxArray(),prim.DistributionMap(),1,0); + + // Setup temp variables + Vector gradU2_temp(3); + Vector gradU2(3); + Vector gradU3(3); + Vector gradU4(3); + Vector gradU2_s(3); + Vector gradU3_s(3); + Vector gradU4_s(3); + Vector gradU2_d(3); + Vector gradU3_d(3); + Vector gradU4_d(3); + + Vector comps {0,1,2}; + Vector comps_s{3,4,5}; + Vector comps_d{6,7,8}; + + // turbulent kinetic energy (total) + ccTemp.setVal(0.0); + MultiFab::AddProduct(ccTemp,vel_decomp_filter,0,vel_decomp_filter,0,0,1,0); //uu + MultiFab::AddProduct(ccTemp,vel_decomp_filter,1,vel_decomp_filter,1,0,1,0); //vv + MultiFab::AddProduct(ccTemp,vel_decomp_filter,2,vel_decomp_filter,2,0,1,0); //ww + u_rms = ccTemp.sum(0)/npts; + u_rms = sqrt(u_rms/3.0); + MultiFab::Multiply(ccTemp,prim,0,0,1,0); // rho*(uu+vv+ww) + + // turbulent kinetic energy (solenoidal) + ccTemp.setVal(0.0); + MultiFab::AddProduct(ccTemp,vel_decomp_filter,3,vel_decomp_filter,0,0,1,0); //uu + MultiFab::AddProduct(ccTemp,vel_decomp_filter,4,vel_decomp_filter,1,0,1,0); //vv + MultiFab::AddProduct(ccTemp,vel_decomp_filter,5,vel_decomp_filter,2,0,1,0); //ww + u_rms_s = ccTemp.sum(0)/npts; + u_rms_s = sqrt(u_rms_s/3.0); + MultiFab::Multiply(ccTemp,prim,0,0,1,0); // rho*(uu+vv+ww) + + // turbulent kinetic energy (dilatational) + 
ccTemp.setVal(0.0); + MultiFab::AddProduct(ccTemp,vel_decomp_filter,6,vel_decomp_filter,0,0,1,0); //uu + MultiFab::AddProduct(ccTemp,vel_decomp_filter,7,vel_decomp_filter,1,0,1,0); //vv + MultiFab::AddProduct(ccTemp,vel_decomp_filter,8,vel_decomp_filter,2,0,1,0); //ww + u_rms_d = ccTemp.sum(0)/npts; + u_rms_d = sqrt(u_rms_d/3.0); + MultiFab::Multiply(ccTemp,prim,0,0,1,0); // rho*(uu+vv+ww) + + // ratio of turbulent kinetic energies + delta_u_rms = u_rms_d/u_rms_s; + + // compute gradU = [du/dx dv/dy dw/dz] at cell-centers + ComputeGrad(vel_decomp_filter,gradU,0,0,9,-1,geom,0); + + // Compute Velocity gradient moment sum + // 2nd moment (total) + FCMoments(gradU,comps,faceTemp,2,gradU2_temp); + gradU2[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU2[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU2[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + ccTemp.setVal(0.0); + ccTempA.setVal(0.0); + ShiftFaceToCC(faceTemp[0],0,ccTempA,0,1); + MultiFab::Add(ccTemp,ccTempA,0,0,1,0); + ShiftFaceToCC(faceTemp[1],0,ccTempA,0,1); + MultiFab::Add(ccTemp,ccTempA,0,0,1,0); + ShiftFaceToCC(faceTemp[2],0,ccTempA,0,1); + MultiFab::Add(ccTemp,ccTempA,0,0,1,0); + Real avg_mom2 = ccTemp.sum(0)/npts; + // 2nd moment (solenoidal) + FCMoments(gradU,comps_s,faceTemp,2,gradU2_temp); + gradU2_s[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU2_s[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU2_s[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + // 2nd moment (dilatational) + FCMoments(gradU,comps_d,faceTemp,2,gradU2_temp); + gradU2_d[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU2_d[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU2_d[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + + // Taylor Mach + //ComputeSoundSpeed(sound_speed,prim,2); + //Real c_speed = 
sound_speed.sum(0)/npts; + Real rho_avg = prim.sum(0)/npts; + // Taylor Ma + //taylor_Ma = sqrt(3.0)*u_rms/c_speed; + // Taylor Microscale + taylor_len = sqrt(3.0)*u_rms/sqrt(avg_mom2); // from Wang et al., JFM, 2012 + taylor_Re_eta = rho_avg*taylor_len*u_rms; // from from John, Donzis, Sreenivasan, PRL 2019 + + // Compute Velocity gradient moment sum + // 3rd moment (total) + FCMoments(gradU,comps,faceTemp,3,gradU2_temp); + gradU3[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU3[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU3[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + // 3rd moment (solenoidal) + FCMoments(gradU,comps_s,faceTemp,3,gradU2_temp); + gradU3_s[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU3_s[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU3_s[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + // 3rd moment (dilatational) + FCMoments(gradU,comps_d,faceTemp,3,gradU2_temp); + gradU3_d[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU3_d[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU3_d[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + + // Compute Velocity gradient moment sum + // 4th moment (total) + FCMoments(gradU,comps,faceTemp,4,gradU2_temp); + gradU4[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU4[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU4[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + // 4th moment (solenoidal) + FCMoments(gradU,comps_s,faceTemp,4,gradU2_temp); + gradU4_s[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU4_s[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU4_s[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + // 4th moment (dilatational) + 
FCMoments(gradU,comps_d,faceTemp,4,gradU2_temp); + gradU4_d[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU4_d[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU4_d[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + + // Skewness + // <\sum_i (du_i/dx_i)^3> / (\sum_i <(du_i/dx_i)^2>^1.5) + skew = (gradU3[0] + gradU3[1] + gradU3[2])/ + (pow(gradU2[0],1.5) + pow(gradU2[1],1.5) + pow(gradU2[2],1.5)); + skew_s = (gradU3_s[0] + gradU3_s[1] + gradU3_s[2])/ + (pow(gradU2_s[0],1.5) + pow(gradU2_s[1],1.5) + pow(gradU2_s[2],1.5)); + skew_d = (gradU3_d[0] + gradU3_d[1] + gradU3_d[2])/ + (pow(gradU2_d[0],1.5) + pow(gradU2_d[1],1.5) + pow(gradU2_d[2],1.5)); + + // Kurtosis + // <\sum_i (du_i/dx_i)^4> / (\sum_i <(du_i/dx_i)^2>^2) + kurt = (gradU4[0] + gradU4[1] + gradU4[2])/ + (pow(gradU2[0],2.0) + pow(gradU2[1],2.0) + pow(gradU2[2],2.0)); + kurt_s = (gradU4_s[0] + gradU4_s[1] + gradU4_s[2])/ + (pow(gradU2_s[0],2.0) + pow(gradU2_s[1],2.0) + pow(gradU2_s[2],2.0)); + kurt_d = (gradU4_d[0] + gradU4_d[1] + gradU4_d[2])/ + (pow(gradU2_d[0],2.0) + pow(gradU2_d[1],2.0) + pow(gradU2_d[2],2.0)); + } + std::string turbfilename = "turbstats_"; + std::ostringstream os; + os << std::setprecision(3) << kmin; + turbfilename += os.str();; + std::ostringstream oss; + oss << std::setprecision(3) << kmax; + turbfilename += oss.str(); + + std::ofstream turboutfile; + turboutfile.open(turbfilename, std::ios::app); + turboutfile << "u_rms " << "u_rms_s " << "u_rms_d " << "delta_u_rms " + << "TaylorLen " << "TaylorRe*Eta " + << "skew " << "skew_s " << "skew_d " + << "kurt " << "kurt_s " << "kurt_d " + << std::endl; + turboutfile << u_rms << " "; + turboutfile << u_rms_s << " "; + turboutfile << u_rms_d << " "; + turboutfile << delta_u_rms << " "; + turboutfile << taylor_len << " "; + turboutfile << taylor_Re_eta << " "; + turboutfile << skew << " "; + turboutfile << skew_s << " "; + turboutfile << skew_d << " "; + turboutfile << 
kurt << " "; + turboutfile << kurt_s << " "; + turboutfile << kurt_d << " "; + turboutfile << std::endl; + + // timer + Real ts2 = ParallelDescriptor::second() - ts1; + ParallelDescriptor::ReduceRealMax(ts2, ParallelDescriptor::IOProcessorNumber()); + amrex::Print() << "Time (spectral filtering) " << ts2 << " seconds\n"; + + // MultiFab memory usage + const int IOProc = ParallelDescriptor::IOProcessorNumber(); + + amrex::Long min_fab_megabytes = amrex::TotalBytesAllocatedInFabsHWM()/1048576; + amrex::Long max_fab_megabytes = min_fab_megabytes; + + ParallelDescriptor::ReduceLongMin(min_fab_megabytes, IOProc); + ParallelDescriptor::ReduceLongMax(max_fab_megabytes, IOProc); + + amrex::Print() << "High-water FAB megabyte spread across MPI nodes: [" + << min_fab_megabytes << " ... " << max_fab_megabytes << "]\n"; + + min_fab_megabytes = amrex::TotalBytesAllocatedInFabs()/1048576; + max_fab_megabytes = min_fab_megabytes; + + ParallelDescriptor::ReduceLongMin(min_fab_megabytes, IOProc); + ParallelDescriptor::ReduceLongMax(max_fab_megabytes, IOProc); + + amrex::Print() << "Curent FAB megabyte spread across MPI nodes: [" + << min_fab_megabytes << " ... 
" << max_fab_megabytes << "]\n"; + + turboutfile.close(); +} diff --git a/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.H b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.H new file mode 100644 index 00000000..5cd30f67 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.H @@ -0,0 +1,100 @@ +#ifndef _spectral_functions_stag_H_ +#define _spectral_functions_stag_H_ + +#include +#include +#include +#include + +#include +#include + + +#ifdef AMREX_USE_CUDA +#include +#elif AMREX_USE_HIP +# if __has_include() // ROCm 5.3+ +# include +# else +# include +# endif +#else +#include +#include +#endif + +#include + +#include + +#define ALIGN 16 + +using namespace amrex; + +#if !defined(HEFFTE_FFTW) && !defined(HEFFTE_CUFFT) && !defined(HEFFTE_ROCFFT) +#ifdef AMREX_USE_CUDA +std::string cufftError (const cufftResult& err); +#endif +#ifdef AMREX_USE_HIP +std::string rocfftError (const rocfft_status err); +void Assert_rocfft_status (std::string const& name, rocfft_status status); +#endif +#endif + +void SpectralReadCheckPoint(amrex::Geometry& geom, + const amrex::Box& domain, + amrex::MultiFab& prim, + std::array& vel, + BoxArray& ba, DistributionMapping& dmap, + const amrex::Vector n_cells, + const int nprimvars, + const amrex::Vector max_grid_size, + const amrex::IntVect ngc, + const int restart); + +void SpectralVelDecomp(const MultiFab& vel, + MultiFab& vel_decomp_filter, + const amrex::Real kmin, + const amrex::Real kmax, + const amrex::Geometry& geom, + const amrex::Vector n_cells); + +void SpectralScalarDecomp(const MultiFab& scalar, + MultiFab& scalar_filter, + const amrex::Real kmin, + const amrex::Real kmax, + const amrex::Geometry& geom, + const amrex::Vector n_cells); + +void SpectralWritePlotFile(const int step, + const amrex::Real& kmin, + const amrex::Real& kmax, + const amrex::Geometry& geom, + const amrex::MultiFab& vel_decomp_in, + const amrex::MultiFab& scalar_in); + +void Read_Copy_MF_Checkpoint(amrex::MultiFab& 
mf, std::string mf_name, + const std::string& checkpointname, + BoxArray& ba_old, DistributionMapping& dmap_old, + int NVARS, int NGC, const amrex::IntVect ngc, + int nodal_flag=-1); + +void ShiftFaceToCC(const MultiFab& face_in, int face_in_comp, + MultiFab& cc_in, int cc_in_comp, + int ncomp); + +void ComputeGrad(const MultiFab & phi_in, std::array & gphi, + int start_incomp, int start_outcomp, int ncomp, int bccomp, const Geometry & geom, + int increment); + +void SumStag(const std::array& m1, + amrex::Vector& sum); + +void FCMoments(const std::array& m1, + const amrex::Vector& comps, + std::array& mscr, + const int& power, + amrex::Vector& prod_val); + +#endif + diff --git a/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.cpp b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.cpp new file mode 100644 index 00000000..1569b4e4 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.cpp @@ -0,0 +1,1005 @@ +#include "spectral_functions.H" +#include "AMReX_PlotFileUtil.H" +#include "AMReX_PlotFileDataImpl.H" + +#include + +#include "chrono" +#include +#include "AMReX_PlotFileUtil.H" +#include "AMReX_BoxArray.H" + +using namespace std::chrono; + +namespace { + void GotoNextLine (std::istream& is) + { + constexpr std::streamsize bl_ignore_max { 100000 }; + is.ignore(bl_ignore_max, '\n'); + } +} + +void SpectralReadCheckPoint(amrex::Geometry& geom, + const amrex::Box& domain, + amrex::MultiFab& prim, + std::array& vel, + BoxArray& ba, DistributionMapping& dmap, + const amrex::Vector n_cells, + const int nprimvars, + const amrex::Vector max_grid_size, + const amrex::IntVect ngc, + const int restart) +{ + // timer for profiling + BL_PROFILE_VAR("SpectralReadCheckPoint()",SpectralReadCheckPoint); + + // checkpoint file name, e.g., chk0000010 + const std::string& checkpointname = amrex::Concatenate("chk",restart,9); + + amrex::Print() << "Restart from checkpoint " << checkpointname << "\n"; + + VisMF::IO_Buffer 
io_buffer(VisMF::GetIOBufferSize()); + + std::string line, word; + + // read in old boxarray, and create old distribution map (this is to read in MFabs) + BoxArray ba_old; + DistributionMapping dmap_old; + + // initialize new boxarray + ba.define(domain); + ba.maxSize(IntVect(max_grid_size)); + dmap.define(ba, ParallelDescriptor::NProcs()); + + amrex::Vector nodal_flag_dir; + amrex::IntVect nodal_flag_x; + amrex::IntVect nodal_flag_y; + amrex::IntVect nodal_flag_z; + nodal_flag_dir.resize(3); + + for (int i=0; i<3; ++i) { + nodal_flag_x[i] = int(i==0); + nodal_flag_y[i] = int(i==1); + nodal_flag_z[i] = int(i==2); + AMREX_D_TERM(nodal_flag_dir[0][i] = nodal_flag_x[i];, + nodal_flag_dir[1][i] = nodal_flag_y[i];, + nodal_flag_dir[2][i] = nodal_flag_z[i];); + } + + // Header + { + std::string File(checkpointname + "/Header"); + Vector fileCharPtr; + ParallelDescriptor::ReadAndBcastFile(File, fileCharPtr); + std::string fileCharPtrString(fileCharPtr.dataPtr()); + std::istringstream is(fileCharPtrString, std::istringstream::in); + + // read in title line + std::getline(is, line); + + // read in time step number + int step; + is >> step; + GotoNextLine(is); + + // read in time + Real time; + is >> time; + GotoNextLine(is); + + // read in statsCount + int statsCount; + is >> statsCount; + GotoNextLine(is); + + // read in BoxArray (fluid) from Header + ba_old.readFrom(is); + GotoNextLine(is); + + // create old distribution mapping + dmap_old.define(ba_old, ParallelDescriptor::NProcs()); + + prim.define(ba,dmap,nprimvars,ngc); + // velocity and momentum (instantaneous, means, variances) + for (int d=0; d n_cells) +{ + BL_PROFILE_VAR("SpectralVelDecomp()",SpectralVelDecomp); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(vel.nComp() == 3, + "SpectralVelDecomp: must have 3 components of input vel MultiFab"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(vel.local_size() == 1, + "SpectralVelDecomp: Must have one Box per MPI process when using heFFTe"); + + const GpuArray dx = geom.CellSizeArray(); + 
+ long npts; + Box domain = geom.Domain(); + npts = (domain.length(0)*domain.length(1)*domain.length(2)); + Real sqrtnpts = std::sqrt(npts); + + // get box array and distribution map of vel + DistributionMapping dm = vel.DistributionMap(); + BoxArray ba = vel.boxArray(); + + // since there is 1 MPI rank per box, each MPI rank obtains its local box and the associated boxid + Box local_box; + int local_boxid; + { + for (int i = 0; i < ba.size(); ++i) { + Box b = ba[i]; + // each MPI rank has its own local_box Box and local_boxid ID + if (ParallelDescriptor::MyProc() == dm[i]) { + local_box = b; + local_boxid = i; + } + } + } + + // now each MPI rank works on its own box + // for real->complex fft's, the fft is stored in an (nx/2+1) x ny x nz dataset + + // start by coarsening each box by 2 in the x-direction + Box c_local_box = amrex::coarsen(local_box, IntVect(AMREX_D_DECL(2,1,1))); + + // if the coarsened box's high-x index is even, we shrink the size in 1 in x + // this avoids overlap between coarsened boxes + if (c_local_box.bigEnd(0) * 2 == local_box.bigEnd(0)) { + c_local_box.setBig(0,c_local_box.bigEnd(0)-1); + } + // for any boxes that touch the hi-x domain we + // increase the size of boxes by 1 in x + // this makes the overall fft dataset have size (Nx/2+1 x Ny x Nz) + if (local_box.bigEnd(0) == geom.Domain().bigEnd(0)) { + c_local_box.growHi(0,1); + } + + // each MPI rank gets storage for its piece of the fft + BaseFab > spectral_field_Tx(c_local_box, 1, The_Device_Arena()); // totalx + BaseFab > spectral_field_Ty(c_local_box, 1, The_Device_Arena()); // totaly + BaseFab > spectral_field_Tz(c_local_box, 1, The_Device_Arena()); // totalz + BaseFab > spectral_field_Sx(c_local_box, 1, The_Device_Arena()); // solenoidalx + BaseFab > spectral_field_Sy(c_local_box, 1, The_Device_Arena()); // solenoidaly + BaseFab > spectral_field_Sz(c_local_box, 1, The_Device_Arena()); // solenoidalz + BaseFab > spectral_field_Dx(c_local_box, 1, The_Device_Arena()); // 
dilatationalx + BaseFab > spectral_field_Dy(c_local_box, 1, The_Device_Arena()); // dilatationaly + BaseFab > spectral_field_Dz(c_local_box, 1, The_Device_Arena()); // dilatationalz + MultiFab vel_single(ba, dm, 1, 0); + + int r2c_direction = 0; + + // ForwardTransform + // X + using heffte_complex = typename heffte::fft_output::type; + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + vel_single.ParallelCopy(vel, 0, 0, 1); + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tx.dataPtr(); + fft.forward(vel_single[local_boxid].dataPtr(),spectral_data); + } + // Y + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + vel_single.ParallelCopy(vel, 1, 0, 1); + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Ty.dataPtr(); + fft.forward(vel_single[local_boxid].dataPtr(),spectral_data); + } + // Z + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + 
({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + vel_single.ParallelCopy(vel, 2, 0, 1); + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tz.dataPtr(); + fft.forward(vel_single[local_boxid].dataPtr(),spectral_data); + } + + Gpu::streamSynchronize(); + + int nx = n_cells[0]; + int ny = n_cells[1]; + int nz = n_cells[2]; + + // Decompose velocity field into solenoidal and dilatational + Array4< GpuComplex > spectral_tx = spectral_field_Tx.array(); + Array4< GpuComplex > spectral_ty = spectral_field_Ty.array(); + Array4< GpuComplex > spectral_tz = spectral_field_Tz.array(); + Array4< GpuComplex > spectral_sx = spectral_field_Sx.array(); + Array4< GpuComplex > spectral_sy = spectral_field_Sy.array(); + Array4< GpuComplex > spectral_sz = spectral_field_Sz.array(); + Array4< GpuComplex > spectral_dx = spectral_field_Dx.array(); + Array4< GpuComplex > spectral_dy = spectral_field_Dy.array(); + Array4< GpuComplex > spectral_dz = spectral_field_Dz.array(); + ParallelFor(c_local_box, [=] AMREX_GPU_DEVICE(int i, int j, int k) + { + + Real GxR = 0.0, GxC = 0.0, GyR = 0.0, GyC = 0.0, GzR = 0.0, GzC = 0.0; + + if (i <= nx/2) { + // Gradient Operators + GxR = (cos(2.0*M_PI*i/nx)-1.0)/dx[0]; + GxC = (sin(2.0*M_PI*i/nx)-0.0)/dx[0]; + GyR = (cos(2.0*M_PI*j/ny)-1.0)/dx[1]; + GyC = (sin(2.0*M_PI*j/ny)-0.0)/dx[1]; + GzR = (cos(2.0*M_PI*k/nz)-1.0)/dx[2]; + GzC = (sin(2.0*M_PI*k/nz)-0.0)/dx[2]; + } + else { // conjugate + amrex::Abort("check the code; i should not go beyond bx.length(0)/2"); + } + + // Get the wavenumber + int ki = i; + int kj = j; + int kk = k; + Real knum = (ki*ki + kj*kj + kk*kk); + knum = std::sqrt(knum); + + // Scale Total velocity FFT components with 
Filtering + if ((knum >= kmin) and (knum <= kmax)) { + spectral_tx(i,j,k) *= (1.0/sqrtnpts); + spectral_ty(i,j,k) *= (1.0/sqrtnpts); + spectral_tz(i,j,k) *= (1.0/sqrtnpts); + } + else { + spectral_tx(i,j,k) *= 0.0; + spectral_ty(i,j,k) *= 0.0; + spectral_tz(i,j,k) *= 0.0; + } + + // Inverse Laplacian + Real Lap = GxR*GxR + GxC*GxC + GyR*GyR + GyC*GyC + GzR*GzR + GzC*GzC; + + // Divergence of vel + Real divR = spectral_tx(i,j,k).real()*GxR - spectral_tx(i,j,k).imag()*GxC + + spectral_ty(i,j,k).real()*GyR - spectral_ty(i,j,k).imag()*GyC + + spectral_tz(i,j,k).real()*GzR - spectral_tz(i,j,k).imag()*GzC ; + Real divC = spectral_tx(i,j,k).real()*GxC + spectral_tx(i,j,k).imag()*GxR + + spectral_ty(i,j,k).real()*GyC + spectral_ty(i,j,k).imag()*GyR + + spectral_tz(i,j,k).real()*GzC + spectral_tz(i,j,k).imag()*GzR ; + + if (Lap < 1.0e-12) { // zero mode for no bulk motion + spectral_dx(i,j,k) *= 0.0; + spectral_dy(i,j,k) *= 0.0; + spectral_dz(i,j,k) *= 0.0; + } + else { + + // Dilatational velocity + GpuComplex copy_dx((divR*GxR + divC*GxC) / Lap, + (divC*GxR - divR*GxC) / Lap); + spectral_dx(i,j,k) = copy_dx; + + GpuComplex copy_dy((divR*GyR + divC*GyC) / Lap, + (divC*GyR - divR*GyC) / Lap); + spectral_dy(i,j,k) = copy_dy; + + GpuComplex copy_dz((divR*GzR + divC*GzC) / Lap, + (divC*GzR - divR*GzC) / Lap); + spectral_dz(i,j,k) = copy_dz; + } + + // Solenoidal velocity + spectral_sx(i,j,k) = spectral_tx(i,j,k) - spectral_dx(i,j,k); + spectral_sy(i,j,k) = spectral_ty(i,j,k) - spectral_dy(i,j,k); + spectral_sz(i,j,k) = spectral_tz(i,j,k) - spectral_dz(i,j,k); + + }); + + Gpu::streamSynchronize(); + + MultiFab vel_decomp_filter_single(ba, dm, 1, 0); + // inverse Fourier transform filtered total velocity + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) 
,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tx.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 0, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Ty.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 1, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tz.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + 
Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 2, 1); + } + // inverse Fourier transform filtered solenoidal and dilatational components + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sx.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 3, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sy.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 4, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + 
{local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sz.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 5, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Dx.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 6, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Dy.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); 
+ + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 7, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Dz.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 8, 1); + } + + + vel_decomp_filter.mult(1.0/sqrtnpts); + +} + + +void SpectralScalarDecomp(const MultiFab& scalar, + MultiFab& scalar_filter, + const amrex::Real kmin, + const amrex::Real kmax, + const amrex::Geometry& geom, + const amrex::Vector n_cells) +{ + BL_PROFILE_VAR("SpectralScalarDecomp()",SpectralScalarDecomp); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(scalar.nComp() == 1, + "SpectralScalarDecomp: must have 1 components of input scalar MultiFab"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(scalar.local_size() == 1, + "SpectralScalarDecomp: Must have one Box per MPI process when using heFFTe"); + + const GpuArray dx = geom.CellSizeArray(); + + long npts; + Box domain = geom.Domain(); + npts = (domain.length(0)*domain.length(1)*domain.length(2)); + Real sqrtnpts = std::sqrt(npts); + + // get box array and distribution map of vel + DistributionMapping dm = scalar.DistributionMap(); + BoxArray ba = scalar.boxArray(); + + // since there is 1 MPI rank per box, each MPI rank obtains its local box and the associated boxid + Box local_box; + int local_boxid; + { + for (int i = 0; i < ba.size(); ++i) { + Box b = ba[i]; + 
// each MPI rank has its own local_box Box and local_boxid ID + if (ParallelDescriptor::MyProc() == dm[i]) { + local_box = b; + local_boxid = i; + } + } + } + + // now each MPI rank works on its own box + // for real->complex fft's, the fft is stored in an (nx/2+1) x ny x nz dataset + + // start by coarsening each box by 2 in the x-direction + Box c_local_box = amrex::coarsen(local_box, IntVect(AMREX_D_DECL(2,1,1))); + + // if the coarsened box's high-x index is even, we shrink the size in 1 in x + // this avoids overlap between coarsened boxes + if (c_local_box.bigEnd(0) * 2 == local_box.bigEnd(0)) { + c_local_box.setBig(0,c_local_box.bigEnd(0)-1); + } + // for any boxes that touch the hi-x domain we + // increase the size of boxes by 1 in x + // this makes the overall fft dataset have size (Nx/2+1 x Ny x Nz) + if (local_box.bigEnd(0) == geom.Domain().bigEnd(0)) { + c_local_box.growHi(0,1); + } + + // each MPI rank gets storage for its piece of the fft + BaseFab > spectral_field(c_local_box, 1, The_Device_Arena()); + MultiFab scalar_single(ba, dm, 1, 0); + + int r2c_direction = 0; + + // ForwardTransform + using heffte_complex = typename heffte::fft_output::type; + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + scalar_single.ParallelCopy(scalar, 0, 0, 1); + heffte_complex* spectral_data = (heffte_complex*) spectral_field.dataPtr(); + fft.forward(scalar_single[local_boxid].dataPtr(),spectral_data); + } + + Gpu::streamSynchronize(); + + // filtering + Array4< GpuComplex > spectral = 
spectral_field.array(); + int nx = n_cells[0]; + int ny = n_cells[1]; + int nz = n_cells[2]; + ParallelFor(c_local_box, [=] AMREX_GPU_DEVICE(int i, int j, int k) + { + + if (i <= nx/2) { + } + else { // conjugate + amrex::Abort("check the code; i should not go beyond bx.length(0)/2"); + } + + // Get the wavenumber + int ki = i; + int kj = j; + int kk = k; + Real knum = (ki*ki + kj*kj + kk*kk); + knum = std::sqrt(knum); + + // Scale Scalar FFT components with Filtering + if ((knum >= kmin) and (knum <= kmax)) { + spectral(i,j,k) *= (1.0/sqrtnpts); + spectral(i,j,k) *= (1.0/sqrtnpts); + spectral(i,j,k) *= (1.0/sqrtnpts); + } + else { + spectral(i,j,k) *= 0.0; + spectral(i,j,k) *= 0.0; + spectral(i,j,k) *= 0.0; + } + }); + + Gpu::streamSynchronize(); + + MultiFab scalar_filter_single(ba, dm, 1, 0); + // inverse Fourier transform filtered scalar + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field.dataPtr(); + fft.backward(spectral_data, scalar_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + scalar_filter.ParallelCopy(scalar_filter_single, 0, 0, 1); + } + + scalar_filter.mult(1.0/sqrtnpts); + +} + +void SpectralWritePlotFile(const int step, + const amrex::Real& kmin, + const amrex::Real& kmax, + const amrex::Geometry& geom, + const amrex::MultiFab& vel_decomp_in, + const amrex::MultiFab& scalar_in) +{ + + MultiFab output; + + // Cell-Centered Velocity Gradient Stats (1,2,3 are directions) + // 0: ux + // 1: uy + // 2: uz + 
// 3: ux_s + // 4: uy_s + // 5: uz_s + // 6: ux_d + // 7: uy_d + // 8: uz_d + // 9: umag + // 10: umag_s + // 11: umag_d + // 12: scalar + // 13: divergence = u_1,1 + u_2,2 + u_3,3 + // 14: vorticity w1 + // 15: vorticity w2 + // 16: vorticity w3 + // 17: vorticity mag: sqrt(w1**2 + w2**2 + w3**2) + output.define(vel_decomp_in.boxArray(), vel_decomp_in.DistributionMap(), 18, 0); + output.setVal(0.0); + + const GpuArray dx = geom.CellSizeArray(); + + for ( MFIter mfi(output,TilingIfNotGPU()); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.tilebox(); + + const Array4< Real>& out = output.array(mfi); + + const Array4& v_decomp = vel_decomp_in.array(mfi); + + const Array4& sca = scalar_in.array(mfi); + + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + + out(i,j,k,0) = v_decomp(i,j,k,0); + out(i,j,k,1) = v_decomp(i,j,k,1); + out(i,j,k,2) = v_decomp(i,j,k,2); + out(i,j,k,3) = v_decomp(i,j,k,3); + out(i,j,k,4) = v_decomp(i,j,k,4); + out(i,j,k,5) = v_decomp(i,j,k,5); + out(i,j,k,6) = v_decomp(i,j,k,6); + out(i,j,k,7) = v_decomp(i,j,k,7); + out(i,j,k,8) = v_decomp(i,j,k,8); + + out(i,j,k,9) = std::sqrt(out(i,j,k,0)*out(i,j,k,0) + out(i,j,k,1)*out(i,j,k,1) + out(i,j,k,2)*out(i,j,k,2)); // mag + out(i,j,k,10) = std::sqrt(out(i,j,k,3)*out(i,j,k,3) + out(i,j,k,4)*out(i,j,k,4) + out(i,j,k,5)*out(i,j,k,5)); // mag solednoidal + out(i,j,k,11) = std::sqrt(out(i,j,k,6)*out(i,j,k,6) + out(i,j,k,7)*out(i,j,k,7) + out(i,j,k,8)*out(i,j,k,8)); // mag solednoidal + + out(i,j,k,12) = sca(i,j,k,0); + + // divergence + out(i,j,k,13) = 0.5*( (v_decomp(i+1,j,k,0) - v_decomp(i-1,j,k,0))/dx[0] + + (v_decomp(i,j+1,k,1) - v_decomp(i,j-1,k,1))/dx[1] + + (v_decomp(i,j,k+1,2) - v_decomp(i,j,k-1,2))/dx[2] ); + + // curl w1 = u_2,1 - u_1,2 + out(i,j,k,14) = 0.5*( (v_decomp(i+1,j,k,1) - v_decomp(i-1,j,k,1))/dx[0] - + (v_decomp(i,j+1,k,0) - v_decomp(i,j-1,k,0))/dx[1] ); + + // curl w2 = u_1,3 - u_3,1 + out(i,j,k,15) = 0.5*( (v_decomp(i,j,k+1,0) - 
v_decomp(i,j,k-1,0))/dx[2] - + (v_decomp(i+1,j,k,2) - v_decomp(i-1,j,k,2))/dx[0] ); + + // curl w2 = u_3,2 - u_2,3 + out(i,j,k,16) = 0.5*( (v_decomp(i,j+1,k,2) - v_decomp(i,j-1,k,2))/dx[1] - + (v_decomp(i,j,k+1,1) - v_decomp(i,j,k-1,1))/dx[2] ); + + // vorticity magnitude: sqrt(w1*w1 + w2*w2 + w3*w3) + out(i,j,k,17) = std::sqrt( out(i,j,k,14)*out(i,j,k,14) + out(i,j,k,15)*out(i,j,k,15) + out(i,j,k,16)*out(i,j,k,16) ); + }); + } + + // Write on a plotfile + std::string plotfilename = amrex::Concatenate("filtered_",step,9); + std::ostringstream os; + os << std::setprecision(3) << kmin; + plotfilename += os.str();; + std::ostringstream oss; + oss << std::setprecision(3) << kmax; + plotfilename += oss.str(); + + amrex::Vector varNames(18); + varNames[0] = "ux"; + varNames[1] = "uy"; + varNames[2] = "uz"; + varNames[3] = "ux_s"; + varNames[4] = "uy_s"; + varNames[5] = "uz_s"; + varNames[6] = "ux_d"; + varNames[7] = "uy_d"; + varNames[8] = "uz_d"; + varNames[9] = "umag"; + varNames[10] = "umag_s"; + varNames[11] = "umag_d"; + varNames[12] = "rho"; + varNames[13] = "div"; + varNames[14] = "w1"; + varNames[15] = "w2"; + varNames[16] = "w3"; + varNames[17] = "vort"; + WriteSingleLevelPlotfile(plotfilename,output,varNames,geom,0.0,step); +} + +void Read_Copy_MF_Checkpoint(amrex::MultiFab& mf, std::string mf_name, const std::string& checkpointname, + BoxArray& ba_old, DistributionMapping& dmap_old, + int NVARS, int ghost, const amrex::IntVect ngc, + int nodal_flag) +{ + // Read into temporary MF from file + MultiFab mf_temp; + VisMF::Read(mf_temp,amrex::MultiFabFileFullPrefix(0, checkpointname, "Level_", mf_name)); + + // Copy temporary MF into the new MF + if (ghost) { + mf.ParallelCopy(mf_temp, 0, 0, NVARS, ngc, ngc); + } + else { + mf.ParallelCopy(mf_temp, 0, 0, NVARS, 0, 0); + } +} + +void ShiftFaceToCC(const MultiFab& face_in, int face_comp, + MultiFab& cc_in, int cc_comp, int ncomp) +{ + + BL_PROFILE_VAR("ShiftFaceToCC()",ShiftFaceToCC); + + if (!face_in.is_nodal(0) && 
!face_in.is_nodal(1) && !face_in.is_nodal(2)) { + Abort("ShiftFaceToCC requires a face-centered MultiFab"); + } + + // Loop over boxes (note that mfi takes a cell-centered multifab as an argument) + for (MFIter mfi(cc_in,TilingIfNotGPU()); mfi.isValid(); ++mfi) { + + const Box& bx = mfi.tilebox(); + + Array4 const& face = face_in.array(mfi); + + Array4 const& cc = cc_in.array(mfi); + + amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept + { + cc(i,j,k,cc_comp+n) = face(i,j,k,face_comp+n); + }); + } +} + +void ComputeGrad(const MultiFab & phi_in, std::array & gphi, + int start_incomp, int start_outcomp, int ncomp, int bccomp, const Geometry & geom, + int increment) +{ + BL_PROFILE_VAR("ComputeGrad()",ComputeGrad); + + // Physical Domain + Box dom(geom.Domain()); + + const GpuArray dx = geom.CellSizeArray(); + + // if not incrementing, initialize data to zero + if (increment == 0) { + for (int dir=0; dir & phi = phi_in.array(mfi); + + AMREX_D_TERM(const Array4 & gphix = gphi[0].array(mfi);, + const Array4 & gphiy = gphi[1].array(mfi);, + const Array4 & gphiz = gphi[2].array(mfi);); + + AMREX_D_TERM(const Box & bx_x = mfi.nodaltilebox(0);, + const Box & bx_y = mfi.nodaltilebox(1);, + const Box & bx_z = mfi.nodaltilebox(2);); + + amrex::ParallelFor(bx_x, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept + { + gphix(i,j,k,start_outcomp+n) += (phi(i,j,k,start_incomp+n)-phi(i-1,j,k,start_incomp+n)) / dx[0]; + }, + bx_y, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept + { + gphiy(i,j,k,start_outcomp+n) += (phi(i,j,k,start_incomp+n)-phi(i,j-1,k,start_incomp+n)) / dx[1]; + } + , bx_z, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept + { + gphiz(i,j,k,start_outcomp+n) += (phi(i,j,k,start_incomp+n)-phi(i,j,k-1,start_incomp+n)) / dx[2]; + } + ); + + } // end MFIter +} + +void FCMoments(const std::array& m1, + const amrex::Vector& comps, + std::array& mscr, + const int& power, + 
amrex::Vector& prod_val) +{ + + BL_PROFILE_VAR("FCMoments()",FCMoments); + + for (int d=0; d<3; ++d) { + MultiFab::Copy(mscr[d],m1[d],comps[d],0,1,0); + for(int i=1; i& m1, + amrex::Vector& sum) +{ + BL_PROFILE_VAR("SumStag()",SumStag); + + // Initialize to zero + std::fill(sum.begin(), sum.end(), 0.); + + ReduceOps reduce_op; + + //////// x-faces + + ReduceData reduce_datax(reduce_op); + using ReduceTuple = typename decltype(reduce_datax)::Type; + + for (MFIter mfi(m1[0],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + const Box& bx_grid = mfi.validbox(); + + auto const& fab = m1[0].array(mfi); + + int xlo = bx_grid.smallEnd(0); + int xhi = bx_grid.bigEnd(0); + + reduce_op.eval(bx, reduce_datax, + [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple + { + Real weight = (i>xlo && i(reduce_datax.value()); + ParallelDescriptor::ReduceRealSum(sum[0]); + + //////// y-faces + + ReduceData reduce_datay(reduce_op); + + for (MFIter mfi(m1[1],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + const Box& bx_grid = mfi.validbox(); + + auto const& fab = m1[1].array(mfi); + + int ylo = bx_grid.smallEnd(1); + int yhi = bx_grid.bigEnd(1); + + reduce_op.eval(bx, reduce_datay, + [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple + { + Real weight = (j>ylo && j(reduce_datay.value()); + ParallelDescriptor::ReduceRealSum(sum[1]); + + //////// z-faces + + ReduceData reduce_dataz(reduce_op); + + for (MFIter mfi(m1[2],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + const Box& bx_grid = mfi.validbox(); + + auto const& fab = m1[2].array(mfi); + + int zlo = bx_grid.smallEnd(2); + int zhi = bx_grid.bigEnd(2); + + reduce_op.eval(bx, reduce_dataz, + [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple + { + Real weight = (k>zlo && k(reduce_dataz.value()); + ParallelDescriptor::ReduceRealSum(sum[2]); +} + diff --git a/exec/compressible_stag/build_perlmutter.sh 
b/exec/compressible_stag/build_perlmutter.sh new file mode 100755 index 00000000..2118c705 --- /dev/null +++ b/exec/compressible_stag/build_perlmutter.sh @@ -0,0 +1,30 @@ +#!/usr/bin/bash + +# required dependencies +module load gpu +module load PrgEnv-gnu +module load craype +module load craype-x86-milan +module load craype-accel-nvidia80 +module load cudatoolkit +module load cmake/3.24.3 + +# necessary to use CUDA-Aware MPI and run a job +export CRAY_ACCEL_TARGET=nvidia80 + +# optimize CUDA compilation for A100 +export AMREX_CUDA_ARCH=8.0 + +# optimize CPU microarchitecture for AMD EPYC 3rd Gen (Milan/Zen3) +# note: the cc/CC/ftn wrappers below add those +export CXXFLAGS="-march=znver3" +export CFLAGS="-march=znver3" + +# compiler environment hints +export CC=cc +export CXX=CC +export FC=ftn +export CUDACXX=$(which nvcc) +export CUDAHOSTCXX=CC + +make -j10 USE_CUDA=TRUE DO_TURB=TRUE MAX_SPEC=2 USE_HEFFTE_CUFFT=TRUE USE_ASSERTION=TRUE