From f67154937f457369a5ce9006f72a5ad9ef7e1b89 Mon Sep 17 00:00:00 2001 From: Ishan Srivastava Date: Fri, 26 Jan 2024 14:19:42 -0800 Subject: [PATCH] spectral calculations for turbulence output --- .../SPECTRAL_FILTER/GNUmakefile | 65 ++ .../SPECTRAL_FILTER/Make.package | 5 + .../SPECTRAL_FILTER/build_perlmutter.sh | 30 + .../SPECTRAL_FILTER/main.cpp | 28 + .../SPECTRAL_FILTER/main_driver.cpp | 358 ++++ .../SPECTRAL_FILTER/spectral_functions.H | 100 ++ .../SPECTRAL_FILTER/spectral_functions.cpp | 1005 +++++++++++++++++ exec/compressible_stag/build_perlmutter.sh | 30 + 8 files changed, 1621 insertions(+) create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/GNUmakefile create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/Make.package create mode 100755 exec/compressible_stag/SPECTRAL_FILTER/build_perlmutter.sh create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/main.cpp create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/main_driver.cpp create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.H create mode 100644 exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.cpp create mode 100755 exec/compressible_stag/build_perlmutter.sh diff --git a/exec/compressible_stag/SPECTRAL_FILTER/GNUmakefile b/exec/compressible_stag/SPECTRAL_FILTER/GNUmakefile new file mode 100644 index 00000000..141c3a62 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/GNUmakefile @@ -0,0 +1,65 @@ +AMREX_HOME ?= ../../../../amrex/ + +DEBUG = FALSE +USE_MPI = TRUE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +COMP = gnu +DIM = 3 +TINY_PROFILE = FALSE + +USE_HEFFTE_FFTW = FALSE +USE_HEFFTE_CUFFT = FALSE +USE_HEFFTE_ROCFFT = FALSE + +ifeq ($(USE_HEFFTE_FFTW),TRUE) + HEFFTE_HOME ?= ../../../../heffte/ +else ifeq ($(USE_HEFFTE_CUFFT),TRUE) + HEFFTE_HOME ?= ../../../../heffte/ +else ifeq ($(USE_HEFFTE_ROCFFT),TRUE) + HEFFTE_HOME ?= ../../../../heffte/ +endif + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +VPATH_LOCATIONS += . 
+INCLUDE_LOCATIONS += . + +ifeq ($(USE_HEFFTE_FFTW),TRUE) + include $(HEFFTE_HOME)/src/Make.package +else ifeq ($(USE_HEFFTE_CUFFT),TRUE) + include $(HEFFTE_HOME)/src/Make.package +else ifeq ($(USE_HEFFTE_ROCFFT),TRUE) + include $(HEFFTE_HOME)/src/Make.package +endif + +include ./Make.package + +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules + +ifeq ($(USE_CUDA),TRUE) + LIBRARIES += -lcufft +else ifeq ($(USE_HIP),TRUE) + # Use rocFFT. ROC_PATH is defined in amrex + INCLUDE_LOCATIONS += $(ROC_PATH)/rocfft/include + LIBRARY_LOCATIONS += $(ROC_PATH)/rocfft/lib + LIBRARIES += -L$(ROC_PATH)/rocfft/lib -lrocfft +else + LIBRARIES += -L$(FFTW_DIR) -lfftw3_mpi -lfftw3 -lfftw3f +endif + +ifeq ($(DO_TURB), TRUE) + DEFINES += -DTURB +endif + +ifeq ($(USE_HEFFTE_FFTW),TRUE) + DEFINES += -DHEFFTE_FFTW + LIBRARIES += -L$(FFTW_DIR) -lfftw3_mpi -lfftw3 -lfftw3f +else ifeq ($(USE_HEFFTE_CUFFT),TRUE) + DEFINES += -DHEFFTE_CUFFT +else ifeq ($(USE_HEFFTE_ROCFFT),TRUE) + DEFINES += -DHEFFTE_ROCFFT +endif diff --git a/exec/compressible_stag/SPECTRAL_FILTER/Make.package b/exec/compressible_stag/SPECTRAL_FILTER/Make.package new file mode 100644 index 00000000..e0391922 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/Make.package @@ -0,0 +1,5 @@ +CEXE_sources += main.cpp +CEXE_sources += main_driver.cpp +CEXE_sources += spectral_functions.cpp + +CEXE_headers += spectral_functions.H diff --git a/exec/compressible_stag/SPECTRAL_FILTER/build_perlmutter.sh b/exec/compressible_stag/SPECTRAL_FILTER/build_perlmutter.sh new file mode 100755 index 00000000..f6becf08 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/build_perlmutter.sh @@ -0,0 +1,30 @@ +#!/usr/bin/bash + +# required dependencies +module load gpu +module load PrgEnv-gnu +module load craype +module load craype-x86-milan +module load craype-accel-nvidia80 +module load cudatoolkit +module load cmake/3.24.3 + +# necessary to use CUDA-Aware MPI and run a job +export 
CRAY_ACCEL_TARGET=nvidia80 + +# optimize CUDA compilation for A100 +export AMREX_CUDA_ARCH=8.0 + +# optimize CPU microarchitecture for AMD EPYC 3rd Gen (Milan/Zen3) +# note: the cc/CC/ftn wrappers below add those +export CXXFLAGS="-march=znver3" +export CFLAGS="-march=znver3" + +# compiler environment hints +export CC=cc +export CXX=CC +export FC=ftn +export CUDACXX=$(which nvcc) +export CUDAHOSTCXX=CC + +make -j10 USE_CUDA=TRUE USE_HEFFTE_CUFFT=TRUE USE_ASSERTION=TRUE diff --git a/exec/compressible_stag/SPECTRAL_FILTER/main.cpp b/exec/compressible_stag/SPECTRAL_FILTER/main.cpp new file mode 100644 index 00000000..95c149e5 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/main.cpp @@ -0,0 +1,28 @@ +#include +//#include + +// function declaration +void main_driver (const char* argv); + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc,argv); + +// this specific part has been moved to Flagellum/main_driver.cpp +// { +// amrex::ParmParse pp("particles"); +//#ifdef AMREX_USE_GPU +// bool particles_do_tiling = true; +//#else +// bool particles_do_tiling = false; +//#endif +// pp.queryAdd("do_tiling", particles_do_tiling); +// } + + // argv[1] contains the name of the inputs file entered at the command line + main_driver(argv[1]); + + amrex::Finalize(); + + return 0; +} diff --git a/exec/compressible_stag/SPECTRAL_FILTER/main_driver.cpp b/exec/compressible_stag/SPECTRAL_FILTER/main_driver.cpp new file mode 100644 index 00000000..2c8c7fab --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/main_driver.cpp @@ -0,0 +1,358 @@ +#include "spectral_functions.H" +#include +#include +#include "AMReX_ParmParse.H" + +#include "chrono" + +using namespace std::chrono; +using namespace amrex; + +// argv contains the name of the inputs file entered at the command line +void main_driver(const char* argv) +{ + BL_PROFILE_VAR("main_driver()",main_driver); + + amrex::Vector nodal_flag_dir; + amrex::IntVect nodal_flag_x; + amrex::IntVect nodal_flag_y; + 
amrex::IntVect nodal_flag_z; + nodal_flag_dir.resize(3); + + for (int i=0; i<3; ++i) { + nodal_flag_x[i] = int(i==0); + nodal_flag_y[i] = int(i==1); + nodal_flag_z[i] = int(i==2); + AMREX_D_TERM(nodal_flag_dir[0][i] = nodal_flag_x[i];, + nodal_flag_dir[1][i] = nodal_flag_y[i];, + nodal_flag_dir[2][i] = nodal_flag_z[i];); + } + + // timer + Real ts1 = ParallelDescriptor::second(); + + std::string inputs_file = argv; + + ParmParse pp; + amrex::Vector temp_real(3,0.); + amrex::Vector temp_int (3,0 ); + + amrex::Vector max_grid_size(3,1 ); + amrex::Vector n_cells(3,0 ); + amrex::Vector prob_lo(3,0 ); + amrex::Vector prob_hi(3,0 ); + + if (pp.queryarr("n_cells",temp_int,0,3)) { + for (int i=0; i<3; ++i) { + n_cells[i] = temp_int[i]; + } + } + int npts = n_cells[0]*n_cells[1]*n_cells[2]; + if (pp.queryarr("prob_lo",temp_real,0,3)) { + for (int i=0; i<3; ++i) { + prob_lo[i] = temp_real[i]; + } + } + if (pp.queryarr("prob_hi",temp_real,0,3)) { + for (int i=0; i<3; ++i) { + prob_hi[i] = temp_real[i]; + } + } + pp.queryarr("max_grid_size",max_grid_size,0,3); + + int restart; + pp.query("restart",restart); + + int nprimvars; + pp.query("nprimvars",nprimvars); + + amrex::IntVect ngc; + for (int i=0; i<3; ++i) { + ngc[i] = 1; // number of ghost cells + } + if (pp.queryarr("ngc",temp_int,0,3)) { + for (int i=0; i<3; ++i) { + ngc[i] = temp_int[i]; + } + } + + amrex::Real kmin; + pp.query("kmin",kmin); + + amrex::Real kmax; + pp.query("kmax",kmax); + + std::array< MultiFab, 3 > vel; + MultiFab prim; + + // make BoxArray and Geometry + BoxArray ba; + Geometry geom; + DistributionMapping dmap; + + IntVect dom_lo(AMREX_D_DECL( 0, 0, 0)); + IntVect dom_hi(AMREX_D_DECL(n_cells[0]-1, n_cells[1]-1, n_cells[2]-1)); + Box domain(dom_lo, dom_hi); + + // This defines the physical box, [-1,1] in each direction. 
+ RealBox real_box({AMREX_D_DECL(prob_lo[0],prob_lo[1],prob_lo[2])}, + {AMREX_D_DECL(prob_hi[0],prob_hi[1],prob_hi[2])}); + + // This defines a Geometry object + Vector is_periodic(3,1); // force to be periodic -- can change later + geom.define(domain,&real_box,CoordSys::cartesian,is_periodic.data()); + + const Real* dx = geom.CellSize(); + const RealBox& realDomain = geom.ProbDomain(); + + SpectralReadCheckPoint(geom, domain, prim, vel, ba, dmap, n_cells, nprimvars, max_grid_size, ngc, restart); + + MultiFab MFTurbScalar; + MultiFab MFTurbVel; + MultiFab vel_decomp_filter; + MultiFab scalar_filter; + MFTurbVel.define(ba, dmap, 3, 0); + MFTurbScalar.define(ba, dmap, 1, 0); + vel_decomp_filter.define(ba, dmap, 9, 0); + scalar_filter.define(ba, dmap, 1, 0); + vel_decomp_filter.setVal(0.0); + scalar_filter.setVal(0.0); + + // Set BC: 1) fill boundary 2) physical + for (int d=0; d<3; d++) { + vel[d].FillBoundary(geom.periodicity()); + } + prim.FillBoundary(geom.periodicity()); + + for(int d=0; d<3; d++) { + ShiftFaceToCC(vel[d], 0, MFTurbVel, d, 1); + } + MultiFab::Copy(MFTurbScalar, prim, 0, 0, 1, 0); + + SpectralVelDecomp(MFTurbVel, vel_decomp_filter, kmin, kmax, geom, n_cells); + SpectralScalarDecomp(MFTurbScalar, scalar_filter, kmin, kmax, geom, n_cells); + + SpectralWritePlotFile(restart, kmin, kmax, geom, vel_decomp_filter, scalar_filter); + + // Turbulence Diagnostics + Real u_rms, u_rms_s, u_rms_d, delta_u_rms; + Real taylor_len, taylor_Re_eta; + Real skew, skew_s, skew_d, kurt, kurt_s, kurt_d; + { + vel_decomp_filter.FillBoundary(geom.periodicity()); + + Vector dProb(3); + dProb[0] = 1.0/((n_cells[0]+1)*n_cells[1]*n_cells[2]); + dProb[1] = 1.0/((n_cells[1]+1)*n_cells[2]*n_cells[0]); + dProb[2] = 1.0/((n_cells[2]+1)*n_cells[0]*n_cells[1]); + + // Setup temp MultiFabs + std::array< MultiFab, AMREX_SPACEDIM > gradU; + std::array< MultiFab, AMREX_SPACEDIM > faceTemp; + MultiFab sound_speed; + MultiFab ccTemp; + MultiFab ccTempA; + 
AMREX_D_TERM(gradU[0].define(convert(prim.boxArray(),nodal_flag_x), prim.DistributionMap(), 6, 0);, + gradU[1].define(convert(prim.boxArray(),nodal_flag_y), prim.DistributionMap(), 6, 0);, + gradU[2].define(convert(prim.boxArray(),nodal_flag_z), prim.DistributionMap(), 6, 0);); + AMREX_D_TERM(faceTemp[0].define(convert(prim.boxArray(),nodal_flag_x), prim.DistributionMap(), 1, 0);, + faceTemp[1].define(convert(prim.boxArray(),nodal_flag_y), prim.DistributionMap(), 1, 0);, + faceTemp[2].define(convert(prim.boxArray(),nodal_flag_z), prim.DistributionMap(), 1, 0);); + sound_speed.define(prim.boxArray(),prim.DistributionMap(),1,0); + ccTemp.define(prim.boxArray(),prim.DistributionMap(),1,0); + ccTempA.define(prim.boxArray(),prim.DistributionMap(),1,0); + + // Setup temp variables + Vector gradU2_temp(3); + Vector gradU2(3); + Vector gradU3(3); + Vector gradU4(3); + Vector gradU2_s(3); + Vector gradU3_s(3); + Vector gradU4_s(3); + Vector gradU2_d(3); + Vector gradU3_d(3); + Vector gradU4_d(3); + + Vector comps {0,1,2}; + Vector comps_s{3,4,5}; + Vector comps_d{6,7,8}; + + // turbulent kinetic energy (total) + ccTemp.setVal(0.0); + MultiFab::AddProduct(ccTemp,vel_decomp_filter,0,vel_decomp_filter,0,0,1,0); //uu + MultiFab::AddProduct(ccTemp,vel_decomp_filter,1,vel_decomp_filter,1,0,1,0); //vv + MultiFab::AddProduct(ccTemp,vel_decomp_filter,2,vel_decomp_filter,2,0,1,0); //ww + u_rms = ccTemp.sum(0)/npts; + u_rms = sqrt(u_rms/3.0); + MultiFab::Multiply(ccTemp,prim,0,0,1,0); // rho*(uu+vv+ww) + + // turbulent kinetic energy (solenoidal) + ccTemp.setVal(0.0); + MultiFab::AddProduct(ccTemp,vel_decomp_filter,3,vel_decomp_filter,0,0,1,0); //uu + MultiFab::AddProduct(ccTemp,vel_decomp_filter,4,vel_decomp_filter,1,0,1,0); //vv + MultiFab::AddProduct(ccTemp,vel_decomp_filter,5,vel_decomp_filter,2,0,1,0); //ww + u_rms_s = ccTemp.sum(0)/npts; + u_rms_s = sqrt(u_rms_s/3.0); + MultiFab::Multiply(ccTemp,prim,0,0,1,0); // rho*(uu+vv+ww) + + // turbulent kinetic energy (dilatational) + 
ccTemp.setVal(0.0); + MultiFab::AddProduct(ccTemp,vel_decomp_filter,6,vel_decomp_filter,0,0,1,0); //uu + MultiFab::AddProduct(ccTemp,vel_decomp_filter,7,vel_decomp_filter,1,0,1,0); //vv + MultiFab::AddProduct(ccTemp,vel_decomp_filter,8,vel_decomp_filter,2,0,1,0); //ww + u_rms_d = ccTemp.sum(0)/npts; + u_rms_d = sqrt(u_rms_d/3.0); + MultiFab::Multiply(ccTemp,prim,0,0,1,0); // rho*(uu+vv+ww) + + // ratio of turbulent kinetic energies + delta_u_rms = u_rms_d/u_rms_s; + + // compute gradU = [du/dx dv/dy dw/dz] at cell-centers + ComputeGrad(vel_decomp_filter,gradU,0,0,9,-1,geom,0); + + // Compute Velocity gradient moment sum + // 2nd moment (total) + FCMoments(gradU,comps,faceTemp,2,gradU2_temp); + gradU2[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU2[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU2[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + ccTemp.setVal(0.0); + ccTempA.setVal(0.0); + ShiftFaceToCC(faceTemp[0],0,ccTempA,0,1); + MultiFab::Add(ccTemp,ccTempA,0,0,1,0); + ShiftFaceToCC(faceTemp[1],0,ccTempA,0,1); + MultiFab::Add(ccTemp,ccTempA,0,0,1,0); + ShiftFaceToCC(faceTemp[2],0,ccTempA,0,1); + MultiFab::Add(ccTemp,ccTempA,0,0,1,0); + Real avg_mom2 = ccTemp.sum(0)/npts; + // 2nd moment (solenoidal) + FCMoments(gradU,comps_s,faceTemp,2,gradU2_temp); + gradU2_s[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU2_s[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU2_s[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + // 2nd moment (dilatational) + FCMoments(gradU,comps_d,faceTemp,2,gradU2_temp); + gradU2_d[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU2_d[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU2_d[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + + // Taylor Mach + //ComputeSoundSpeed(sound_speed,prim,2); + //Real c_speed = 
sound_speed.sum(0)/npts; + Real rho_avg = prim.sum(0)/npts; + // Taylor Ma + //taylor_Ma = sqrt(3.0)*u_rms/c_speed; + // Taylor Microscale + taylor_len = sqrt(3.0)*u_rms/sqrt(avg_mom2); // from Wang et al., JFM, 2012 + taylor_Re_eta = rho_avg*taylor_len*u_rms; // from from John, Donzis, Sreenivasan, PRL 2019 + + // Compute Velocity gradient moment sum + // 3rd moment (total) + FCMoments(gradU,comps,faceTemp,3,gradU2_temp); + gradU3[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU3[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU3[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + // 3rd moment (solenoidal) + FCMoments(gradU,comps_s,faceTemp,3,gradU2_temp); + gradU3_s[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU3_s[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU3_s[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + // 3rd moment (dilatational) + FCMoments(gradU,comps_d,faceTemp,3,gradU2_temp); + gradU3_d[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU3_d[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU3_d[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + + // Compute Velocity gradient moment sum + // 4th moment (total) + FCMoments(gradU,comps,faceTemp,4,gradU2_temp); + gradU4[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU4[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU4[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + // 4th moment (solenoidal) + FCMoments(gradU,comps_s,faceTemp,4,gradU2_temp); + gradU4_s[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU4_s[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU4_s[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + // 4th moment (dilatational) + 
FCMoments(gradU,comps_d,faceTemp,4,gradU2_temp); + gradU4_d[0] = dProb[0]*(faceTemp[0].sum_unique(0,false,geom.periodicity())); + gradU4_d[1] = dProb[1]*(faceTemp[1].sum_unique(0,false,geom.periodicity())); + gradU4_d[2] = dProb[2]*(faceTemp[2].sum_unique(0,false,geom.periodicity())); + + // Skewness + // <\sum_i (du_i/dx_i)^3> / (\sum_i <(du_i/dx_i)^2>^1.5) + skew = (gradU3[0] + gradU3[1] + gradU3[2])/ + (pow(gradU2[0],1.5) + pow(gradU2[1],1.5) + pow(gradU2[2],1.5)); + skew_s = (gradU3_s[0] + gradU3_s[1] + gradU3_s[2])/ + (pow(gradU2_s[0],1.5) + pow(gradU2_s[1],1.5) + pow(gradU2_s[2],1.5)); + skew_d = (gradU3_d[0] + gradU3_d[1] + gradU3_d[2])/ + (pow(gradU2_d[0],1.5) + pow(gradU2_d[1],1.5) + pow(gradU2_d[2],1.5)); + + // Kurtosis + // <\sum_i (du_i/dx_i)^4> / (\sum_i <(du_i/dx_i)^2>^2) + kurt = (gradU4[0] + gradU4[1] + gradU4[2])/ + (pow(gradU2[0],2.0) + pow(gradU2[1],2.0) + pow(gradU2[2],2.0)); + kurt_s = (gradU4_s[0] + gradU4_s[1] + gradU4_s[2])/ + (pow(gradU2_s[0],2.0) + pow(gradU2_s[1],2.0) + pow(gradU2_s[2],2.0)); + kurt_d = (gradU4_d[0] + gradU4_d[1] + gradU4_d[2])/ + (pow(gradU2_d[0],2.0) + pow(gradU2_d[1],2.0) + pow(gradU2_d[2],2.0)); + } + std::string turbfilename = "turbstats_"; + std::ostringstream os; + os << std::setprecision(3) << kmin; + turbfilename += os.str();; + std::ostringstream oss; + oss << std::setprecision(3) << kmax; + turbfilename += oss.str(); + + std::ofstream turboutfile; + turboutfile.open(turbfilename, std::ios::app); + turboutfile << "u_rms " << "u_rms_s " << "u_rms_d " << "delta_u_rms " + << "TaylorLen " << "TaylorRe*Eta " + << "skew " << "skew_s " << "skew_d " + << "kurt " << "kurt_s " << "kurt_d " + << std::endl; + turboutfile << u_rms << " "; + turboutfile << u_rms_s << " "; + turboutfile << u_rms_d << " "; + turboutfile << delta_u_rms << " "; + turboutfile << taylor_len << " "; + turboutfile << taylor_Re_eta << " "; + turboutfile << skew << " "; + turboutfile << skew_s << " "; + turboutfile << skew_d << " "; + turboutfile << 
kurt << " "; + turboutfile << kurt_s << " "; + turboutfile << kurt_d << " "; + turboutfile << std::endl; + + // timer + Real ts2 = ParallelDescriptor::second() - ts1; + ParallelDescriptor::ReduceRealMax(ts2, ParallelDescriptor::IOProcessorNumber()); + amrex::Print() << "Time (spectral filtering) " << ts2 << " seconds\n"; + + // MultiFab memory usage + const int IOProc = ParallelDescriptor::IOProcessorNumber(); + + amrex::Long min_fab_megabytes = amrex::TotalBytesAllocatedInFabsHWM()/1048576; + amrex::Long max_fab_megabytes = min_fab_megabytes; + + ParallelDescriptor::ReduceLongMin(min_fab_megabytes, IOProc); + ParallelDescriptor::ReduceLongMax(max_fab_megabytes, IOProc); + + amrex::Print() << "High-water FAB megabyte spread across MPI nodes: [" + << min_fab_megabytes << " ... " << max_fab_megabytes << "]\n"; + + min_fab_megabytes = amrex::TotalBytesAllocatedInFabs()/1048576; + max_fab_megabytes = min_fab_megabytes; + + ParallelDescriptor::ReduceLongMin(min_fab_megabytes, IOProc); + ParallelDescriptor::ReduceLongMax(max_fab_megabytes, IOProc); + + amrex::Print() << "Curent FAB megabyte spread across MPI nodes: [" + << min_fab_megabytes << " ... 
" << max_fab_megabytes << "]\n"; + + turboutfile.close(); +} diff --git a/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.H b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.H new file mode 100644 index 00000000..5cd30f67 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.H @@ -0,0 +1,100 @@ +#ifndef _spectral_functions_stag_H_ +#define _spectral_functions_stag_H_ + +#include +#include +#include +#include + +#include +#include + + +#ifdef AMREX_USE_CUDA +#include +#elif AMREX_USE_HIP +# if __has_include() // ROCm 5.3+ +# include +# else +# include +# endif +#else +#include +#include +#endif + +#include + +#include + +#define ALIGN 16 + +using namespace amrex; + +#if !defined(HEFFTE_FFTW) && !defined(HEFFTE_CUFFT) && !defined(HEFFTE_ROCFFT) +#ifdef AMREX_USE_CUDA +std::string cufftError (const cufftResult& err); +#endif +#ifdef AMREX_USE_HIP +std::string rocfftError (const rocfft_status err); +void Assert_rocfft_status (std::string const& name, rocfft_status status); +#endif +#endif + +void SpectralReadCheckPoint(amrex::Geometry& geom, + const amrex::Box& domain, + amrex::MultiFab& prim, + std::array& vel, + BoxArray& ba, DistributionMapping& dmap, + const amrex::Vector n_cells, + const int nprimvars, + const amrex::Vector max_grid_size, + const amrex::IntVect ngc, + const int restart); + +void SpectralVelDecomp(const MultiFab& vel, + MultiFab& vel_decomp_filter, + const amrex::Real kmin, + const amrex::Real kmax, + const amrex::Geometry& geom, + const amrex::Vector n_cells); + +void SpectralScalarDecomp(const MultiFab& scalar, + MultiFab& scalar_filter, + const amrex::Real kmin, + const amrex::Real kmax, + const amrex::Geometry& geom, + const amrex::Vector n_cells); + +void SpectralWritePlotFile(const int step, + const amrex::Real& kmin, + const amrex::Real& kmax, + const amrex::Geometry& geom, + const amrex::MultiFab& vel_decomp_in, + const amrex::MultiFab& scalar_in); + +void Read_Copy_MF_Checkpoint(amrex::MultiFab& 
mf, std::string mf_name, + const std::string& checkpointname, + BoxArray& ba_old, DistributionMapping& dmap_old, + int NVARS, int NGC, const amrex::IntVect ngc, + int nodal_flag=-1); + +void ShiftFaceToCC(const MultiFab& face_in, int face_in_comp, + MultiFab& cc_in, int cc_in_comp, + int ncomp); + +void ComputeGrad(const MultiFab & phi_in, std::array & gphi, + int start_incomp, int start_outcomp, int ncomp, int bccomp, const Geometry & geom, + int increment); + +void SumStag(const std::array& m1, + amrex::Vector& sum); + +void FCMoments(const std::array& m1, + const amrex::Vector& comps, + std::array& mscr, + const int& power, + amrex::Vector& prod_val); + +#endif + diff --git a/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.cpp b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.cpp new file mode 100644 index 00000000..1569b4e4 --- /dev/null +++ b/exec/compressible_stag/SPECTRAL_FILTER/spectral_functions.cpp @@ -0,0 +1,1005 @@ +#include "spectral_functions.H" +#include "AMReX_PlotFileUtil.H" +#include "AMReX_PlotFileDataImpl.H" + +#include + +#include "chrono" +#include +#include "AMReX_PlotFileUtil.H" +#include "AMReX_BoxArray.H" + +using namespace std::chrono; + +namespace { + void GotoNextLine (std::istream& is) + { + constexpr std::streamsize bl_ignore_max { 100000 }; + is.ignore(bl_ignore_max, '\n'); + } +} + +void SpectralReadCheckPoint(amrex::Geometry& geom, + const amrex::Box& domain, + amrex::MultiFab& prim, + std::array& vel, + BoxArray& ba, DistributionMapping& dmap, + const amrex::Vector n_cells, + const int nprimvars, + const amrex::Vector max_grid_size, + const amrex::IntVect ngc, + const int restart) +{ + // timer for profiling + BL_PROFILE_VAR("SpectralReadCheckPoint()",SpectralReadCheckPoint); + + // checkpoint file name, e.g., chk0000010 + const std::string& checkpointname = amrex::Concatenate("chk",restart,9); + + amrex::Print() << "Restart from checkpoint " << checkpointname << "\n"; + + VisMF::IO_Buffer 
io_buffer(VisMF::GetIOBufferSize()); + + std::string line, word; + + // read in old boxarray, and create old distribution map (this is to read in MFabs) + BoxArray ba_old; + DistributionMapping dmap_old; + + // initialize new boxarray + ba.define(domain); + ba.maxSize(IntVect(max_grid_size)); + dmap.define(ba, ParallelDescriptor::NProcs()); + + amrex::Vector nodal_flag_dir; + amrex::IntVect nodal_flag_x; + amrex::IntVect nodal_flag_y; + amrex::IntVect nodal_flag_z; + nodal_flag_dir.resize(3); + + for (int i=0; i<3; ++i) { + nodal_flag_x[i] = int(i==0); + nodal_flag_y[i] = int(i==1); + nodal_flag_z[i] = int(i==2); + AMREX_D_TERM(nodal_flag_dir[0][i] = nodal_flag_x[i];, + nodal_flag_dir[1][i] = nodal_flag_y[i];, + nodal_flag_dir[2][i] = nodal_flag_z[i];); + } + + // Header + { + std::string File(checkpointname + "/Header"); + Vector fileCharPtr; + ParallelDescriptor::ReadAndBcastFile(File, fileCharPtr); + std::string fileCharPtrString(fileCharPtr.dataPtr()); + std::istringstream is(fileCharPtrString, std::istringstream::in); + + // read in title line + std::getline(is, line); + + // read in time step number + int step; + is >> step; + GotoNextLine(is); + + // read in time + Real time; + is >> time; + GotoNextLine(is); + + // read in statsCount + int statsCount; + is >> statsCount; + GotoNextLine(is); + + // read in BoxArray (fluid) from Header + ba_old.readFrom(is); + GotoNextLine(is); + + // create old distribution mapping + dmap_old.define(ba_old, ParallelDescriptor::NProcs()); + + prim.define(ba,dmap,nprimvars,ngc); + // velocity and momentum (instantaneous, means, variances) + for (int d=0; d n_cells) +{ + BL_PROFILE_VAR("SpectralVelDecomp()",SpectralVelDecomp); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(vel.nComp() == 3, + "SpectralVelDecomp: must have 3 components of input vel MultiFab"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(vel.local_size() == 1, + "SpectralVelDecomp: Must have one Box per MPI process when using heFFTe"); + + const GpuArray dx = geom.CellSizeArray(); + 
+ long npts; + Box domain = geom.Domain(); + npts = (domain.length(0)*domain.length(1)*domain.length(2)); + Real sqrtnpts = std::sqrt(npts); + + // get box array and distribution map of vel + DistributionMapping dm = vel.DistributionMap(); + BoxArray ba = vel.boxArray(); + + // since there is 1 MPI rank per box, each MPI rank obtains its local box and the associated boxid + Box local_box; + int local_boxid; + { + for (int i = 0; i < ba.size(); ++i) { + Box b = ba[i]; + // each MPI rank has its own local_box Box and local_boxid ID + if (ParallelDescriptor::MyProc() == dm[i]) { + local_box = b; + local_boxid = i; + } + } + } + + // now each MPI rank works on its own box + // for real->complex fft's, the fft is stored in an (nx/2+1) x ny x nz dataset + + // start by coarsening each box by 2 in the x-direction + Box c_local_box = amrex::coarsen(local_box, IntVect(AMREX_D_DECL(2,1,1))); + + // if the coarsened box's high-x index is even, we shrink the size in 1 in x + // this avoids overlap between coarsened boxes + if (c_local_box.bigEnd(0) * 2 == local_box.bigEnd(0)) { + c_local_box.setBig(0,c_local_box.bigEnd(0)-1); + } + // for any boxes that touch the hi-x domain we + // increase the size of boxes by 1 in x + // this makes the overall fft dataset have size (Nx/2+1 x Ny x Nz) + if (local_box.bigEnd(0) == geom.Domain().bigEnd(0)) { + c_local_box.growHi(0,1); + } + + // each MPI rank gets storage for its piece of the fft + BaseFab > spectral_field_Tx(c_local_box, 1, The_Device_Arena()); // totalx + BaseFab > spectral_field_Ty(c_local_box, 1, The_Device_Arena()); // totaly + BaseFab > spectral_field_Tz(c_local_box, 1, The_Device_Arena()); // totalz + BaseFab > spectral_field_Sx(c_local_box, 1, The_Device_Arena()); // solenoidalx + BaseFab > spectral_field_Sy(c_local_box, 1, The_Device_Arena()); // solenoidaly + BaseFab > spectral_field_Sz(c_local_box, 1, The_Device_Arena()); // solenoidalz + BaseFab > spectral_field_Dx(c_local_box, 1, The_Device_Arena()); // 
dilatationalx + BaseFab > spectral_field_Dy(c_local_box, 1, The_Device_Arena()); // dilatationaly + BaseFab > spectral_field_Dz(c_local_box, 1, The_Device_Arena()); // dilatationalz + MultiFab vel_single(ba, dm, 1, 0); + + int r2c_direction = 0; + + // ForwardTransform + // X + using heffte_complex = typename heffte::fft_output::type; + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + vel_single.ParallelCopy(vel, 0, 0, 1); + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tx.dataPtr(); + fft.forward(vel_single[local_boxid].dataPtr(),spectral_data); + } + // Y + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + vel_single.ParallelCopy(vel, 1, 0, 1); + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Ty.dataPtr(); + fft.forward(vel_single[local_boxid].dataPtr(),spectral_data); + } + // Z + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + 
({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + vel_single.ParallelCopy(vel, 2, 0, 1); + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tz.dataPtr(); + fft.forward(vel_single[local_boxid].dataPtr(),spectral_data); + } + + Gpu::streamSynchronize(); + + int nx = n_cells[0]; + int ny = n_cells[1]; + int nz = n_cells[2]; + + // Decompose velocity field into solenoidal and dilatational + Array4< GpuComplex > spectral_tx = spectral_field_Tx.array(); + Array4< GpuComplex > spectral_ty = spectral_field_Ty.array(); + Array4< GpuComplex > spectral_tz = spectral_field_Tz.array(); + Array4< GpuComplex > spectral_sx = spectral_field_Sx.array(); + Array4< GpuComplex > spectral_sy = spectral_field_Sy.array(); + Array4< GpuComplex > spectral_sz = spectral_field_Sz.array(); + Array4< GpuComplex > spectral_dx = spectral_field_Dx.array(); + Array4< GpuComplex > spectral_dy = spectral_field_Dy.array(); + Array4< GpuComplex > spectral_dz = spectral_field_Dz.array(); + ParallelFor(c_local_box, [=] AMREX_GPU_DEVICE(int i, int j, int k) + { + + Real GxR = 0.0, GxC = 0.0, GyR = 0.0, GyC = 0.0, GzR = 0.0, GzC = 0.0; + + if (i <= nx/2) { + // Gradient Operators + GxR = (cos(2.0*M_PI*i/nx)-1.0)/dx[0]; + GxC = (sin(2.0*M_PI*i/nx)-0.0)/dx[0]; + GyR = (cos(2.0*M_PI*j/ny)-1.0)/dx[1]; + GyC = (sin(2.0*M_PI*j/ny)-0.0)/dx[1]; + GzR = (cos(2.0*M_PI*k/nz)-1.0)/dx[2]; + GzC = (sin(2.0*M_PI*k/nz)-0.0)/dx[2]; + } + else { // conjugate + amrex::Abort("check the code; i should not go beyond bx.length(0)/2"); + } + + // Get the wavenumber + int ki = i; + int kj = j; + int kk = k; + Real knum = (ki*ki + kj*kj + kk*kk); + knum = std::sqrt(knum); + + // Scale Total velocity FFT components with 
Filtering + if ((knum >= kmin) and (knum <= kmax)) { + spectral_tx(i,j,k) *= (1.0/sqrtnpts); + spectral_ty(i,j,k) *= (1.0/sqrtnpts); + spectral_tz(i,j,k) *= (1.0/sqrtnpts); + } + else { + spectral_tx(i,j,k) *= 0.0; + spectral_ty(i,j,k) *= 0.0; + spectral_tz(i,j,k) *= 0.0; + } + + // Inverse Laplacian + Real Lap = GxR*GxR + GxC*GxC + GyR*GyR + GyC*GyC + GzR*GzR + GzC*GzC; + + // Divergence of vel + Real divR = spectral_tx(i,j,k).real()*GxR - spectral_tx(i,j,k).imag()*GxC + + spectral_ty(i,j,k).real()*GyR - spectral_ty(i,j,k).imag()*GyC + + spectral_tz(i,j,k).real()*GzR - spectral_tz(i,j,k).imag()*GzC ; + Real divC = spectral_tx(i,j,k).real()*GxC + spectral_tx(i,j,k).imag()*GxR + + spectral_ty(i,j,k).real()*GyC + spectral_ty(i,j,k).imag()*GyR + + spectral_tz(i,j,k).real()*GzC + spectral_tz(i,j,k).imag()*GzR ; + + if (Lap < 1.0e-12) { // zero mode for no bulk motion + spectral_dx(i,j,k) *= 0.0; + spectral_dy(i,j,k) *= 0.0; + spectral_dz(i,j,k) *= 0.0; + } + else { + + // Dilatational velocity + GpuComplex copy_dx((divR*GxR + divC*GxC) / Lap, + (divC*GxR - divR*GxC) / Lap); + spectral_dx(i,j,k) = copy_dx; + + GpuComplex copy_dy((divR*GyR + divC*GyC) / Lap, + (divC*GyR - divR*GyC) / Lap); + spectral_dy(i,j,k) = copy_dy; + + GpuComplex copy_dz((divR*GzR + divC*GzC) / Lap, + (divC*GzR - divR*GzC) / Lap); + spectral_dz(i,j,k) = copy_dz; + } + + // Solenoidal velocity + spectral_sx(i,j,k) = spectral_tx(i,j,k) - spectral_dx(i,j,k); + spectral_sy(i,j,k) = spectral_ty(i,j,k) - spectral_dy(i,j,k); + spectral_sz(i,j,k) = spectral_tz(i,j,k) - spectral_dz(i,j,k); + + }); + + Gpu::streamSynchronize(); + + MultiFab vel_decomp_filter_single(ba, dm, 1, 0); + // inverse Fourier transform filtered total velocity + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) 
,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tx.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 0, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Ty.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 1, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Tz.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + 
Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 2, 1); + } + // inverse Fourier transform filtered solenoidal and dilatational components + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sx.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 3, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sy.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 4, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + 
{local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Sz.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 5, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Dx.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 6, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Dy.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); 
+ + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 7, 1); + } + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field_Dz.dataPtr(); + fft.backward(spectral_data, vel_decomp_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + vel_decomp_filter.ParallelCopy(vel_decomp_filter_single, 0, 8, 1); + } + + + vel_decomp_filter.mult(1.0/sqrtnpts); + +} + + +void SpectralScalarDecomp(const MultiFab& scalar, + MultiFab& scalar_filter, + const amrex::Real kmin, + const amrex::Real kmax, + const amrex::Geometry& geom, + const amrex::Vector n_cells) +{ + BL_PROFILE_VAR("SpectralScalarDecomp()",SpectralScalarDecomp); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(scalar.nComp() == 1, + "SpectralScalarDecomp: must have 1 components of input scalar MultiFab"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(scalar.local_size() == 1, + "SpectralScalarDecomp: Must have one Box per MPI process when using heFFTe"); + + const GpuArray dx = geom.CellSizeArray(); + + long npts; + Box domain = geom.Domain(); + npts = (domain.length(0)*domain.length(1)*domain.length(2)); + Real sqrtnpts = std::sqrt(npts); + + // get box array and distribution map of vel + DistributionMapping dm = scalar.DistributionMap(); + BoxArray ba = scalar.boxArray(); + + // since there is 1 MPI rank per box, each MPI rank obtains its local box and the associated boxid + Box local_box; + int local_boxid; + { + for (int i = 0; i < ba.size(); ++i) { + Box b = ba[i]; + 
// each MPI rank has its own local_box Box and local_boxid ID + if (ParallelDescriptor::MyProc() == dm[i]) { + local_box = b; + local_boxid = i; + } + } + } + + // now each MPI rank works on its own box + // for real->complex fft's, the fft is stored in an (nx/2+1) x ny x nz dataset + + // start by coarsening each box by 2 in the x-direction + Box c_local_box = amrex::coarsen(local_box, IntVect(AMREX_D_DECL(2,1,1))); + + // if the coarsened box's high-x index is even, we shrink the size in 1 in x + // this avoids overlap between coarsened boxes + if (c_local_box.bigEnd(0) * 2 == local_box.bigEnd(0)) { + c_local_box.setBig(0,c_local_box.bigEnd(0)-1); + } + // for any boxes that touch the hi-x domain we + // increase the size of boxes by 1 in x + // this makes the overall fft dataset have size (Nx/2+1 x Ny x Nz) + if (local_box.bigEnd(0) == geom.Domain().bigEnd(0)) { + c_local_box.growHi(0,1); + } + + // each MPI rank gets storage for its piece of the fft + BaseFab > spectral_field(c_local_box, 1, The_Device_Arena()); + MultiFab scalar_single(ba, dm, 1, 0); + + int r2c_direction = 0; + + // ForwardTransform + using heffte_complex = typename heffte::fft_output::type; + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + scalar_single.ParallelCopy(scalar, 0, 0, 1); + heffte_complex* spectral_data = (heffte_complex*) spectral_field.dataPtr(); + fft.forward(scalar_single[local_boxid].dataPtr(),spectral_data); + } + + Gpu::streamSynchronize(); + + // filtering + Array4< GpuComplex > spectral = 
spectral_field.array(); + int nx = n_cells[0]; + int ny = n_cells[1]; + int nz = n_cells[2]; + ParallelFor(c_local_box, [=] AMREX_GPU_DEVICE(int i, int j, int k) + { + + if (i <= nx/2) { + } + else { // conjugate + amrex::Abort("check the code; i should not go beyond bx.length(0)/2"); + } + + // Get the wavenumber + int ki = i; + int kj = j; + int kk = k; + Real knum = (ki*ki + kj*kj + kk*kk); + knum = std::sqrt(knum); + + // Scale Scalar FFT components with Filtering + if ((knum >= kmin) and (knum <= kmax)) { + spectral(i,j,k) *= (1.0/sqrtnpts); + spectral(i,j,k) *= (1.0/sqrtnpts); + spectral(i,j,k) *= (1.0/sqrtnpts); + } + else { + spectral(i,j,k) *= 0.0; + spectral(i,j,k) *= 0.0; + spectral(i,j,k) *= 0.0; + } + }); + + Gpu::streamSynchronize(); + + MultiFab scalar_filter_single(ba, dm, 1, 0); + // inverse Fourier transform filtered scalar + { +#if defined(HEFFTE_CUFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_ROCFFT) + heffte::fft3d_r2c fft +#elif defined(HEFFTE_FFTW) + heffte::fft3d_r2c fft +#endif + ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, + {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, + {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, + {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, + r2c_direction, ParallelDescriptor::Communicator()); + + heffte_complex* spectral_data = (heffte_complex*) spectral_field.dataPtr(); + fft.backward(spectral_data, scalar_filter_single[local_boxid].dataPtr()); + + Gpu::streamSynchronize(); + scalar_filter.ParallelCopy(scalar_filter_single, 0, 0, 1); + } + + scalar_filter.mult(1.0/sqrtnpts); + +} + +void SpectralWritePlotFile(const int step, + const amrex::Real& kmin, + const amrex::Real& kmax, + const amrex::Geometry& geom, + const amrex::MultiFab& vel_decomp_in, + const amrex::MultiFab& scalar_in) +{ + + MultiFab output; + + // Cell-Centered Velocity Gradient Stats (1,2,3 are directions) + // 0: ux + // 1: uy + // 2: uz + 
// 3: ux_s + // 4: uy_s + // 5: uz_s + // 6: ux_d + // 7: uy_d + // 8: uz_d + // 9: umag + // 10: umag_s + // 11: umag_d + // 12: scalar + // 13: divergence = u_1,1 + u_2,2 + u_3,3 + // 14: vorticity w1 + // 15: vorticity w2 + // 16: vorticity w3 + // 17: vorticity mag: sqrt(w1**2 + w2**2 + w3**2) + output.define(vel_decomp_in.boxArray(), vel_decomp_in.DistributionMap(), 18, 0); + output.setVal(0.0); + + const GpuArray dx = geom.CellSizeArray(); + + for ( MFIter mfi(output,TilingIfNotGPU()); mfi.isValid(); ++mfi ) { + + const Box& bx = mfi.tilebox(); + + const Array4< Real>& out = output.array(mfi); + + const Array4& v_decomp = vel_decomp_in.array(mfi); + + const Array4& sca = scalar_in.array(mfi); + + amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + + out(i,j,k,0) = v_decomp(i,j,k,0); + out(i,j,k,1) = v_decomp(i,j,k,1); + out(i,j,k,2) = v_decomp(i,j,k,2); + out(i,j,k,3) = v_decomp(i,j,k,3); + out(i,j,k,4) = v_decomp(i,j,k,4); + out(i,j,k,5) = v_decomp(i,j,k,5); + out(i,j,k,6) = v_decomp(i,j,k,6); + out(i,j,k,7) = v_decomp(i,j,k,7); + out(i,j,k,8) = v_decomp(i,j,k,8); + + out(i,j,k,9) = std::sqrt(out(i,j,k,0)*out(i,j,k,0) + out(i,j,k,1)*out(i,j,k,1) + out(i,j,k,2)*out(i,j,k,2)); // mag + out(i,j,k,10) = std::sqrt(out(i,j,k,3)*out(i,j,k,3) + out(i,j,k,4)*out(i,j,k,4) + out(i,j,k,5)*out(i,j,k,5)); // mag solednoidal + out(i,j,k,11) = std::sqrt(out(i,j,k,6)*out(i,j,k,6) + out(i,j,k,7)*out(i,j,k,7) + out(i,j,k,8)*out(i,j,k,8)); // mag solednoidal + + out(i,j,k,12) = sca(i,j,k,0); + + // divergence + out(i,j,k,13) = 0.5*( (v_decomp(i+1,j,k,0) - v_decomp(i-1,j,k,0))/dx[0] + + (v_decomp(i,j+1,k,1) - v_decomp(i,j-1,k,1))/dx[1] + + (v_decomp(i,j,k+1,2) - v_decomp(i,j,k-1,2))/dx[2] ); + + // curl w1 = u_2,1 - u_1,2 + out(i,j,k,14) = 0.5*( (v_decomp(i+1,j,k,1) - v_decomp(i-1,j,k,1))/dx[0] - + (v_decomp(i,j+1,k,0) - v_decomp(i,j-1,k,0))/dx[1] ); + + // curl w2 = u_1,3 - u_3,1 + out(i,j,k,15) = 0.5*( (v_decomp(i,j,k+1,0) - 
v_decomp(i,j,k-1,0))/dx[2] - + (v_decomp(i+1,j,k,2) - v_decomp(i-1,j,k,2))/dx[0] ); + + // curl w2 = u_3,2 - u_2,3 + out(i,j,k,16) = 0.5*( (v_decomp(i,j+1,k,2) - v_decomp(i,j-1,k,2))/dx[1] - + (v_decomp(i,j,k+1,1) - v_decomp(i,j,k-1,1))/dx[2] ); + + // vorticity magnitude: sqrt(w1*w1 + w2*w2 + w3*w3) + out(i,j,k,17) = std::sqrt( out(i,j,k,14)*out(i,j,k,14) + out(i,j,k,15)*out(i,j,k,15) + out(i,j,k,16)*out(i,j,k,16) ); + }); + } + + // Write on a plotfile + std::string plotfilename = amrex::Concatenate("filtered_",step,9); + std::ostringstream os; + os << std::setprecision(3) << kmin; + plotfilename += os.str();; + std::ostringstream oss; + oss << std::setprecision(3) << kmax; + plotfilename += oss.str(); + + amrex::Vector varNames(18); + varNames[0] = "ux"; + varNames[1] = "uy"; + varNames[2] = "uz"; + varNames[3] = "ux_s"; + varNames[4] = "uy_s"; + varNames[5] = "uz_s"; + varNames[6] = "ux_d"; + varNames[7] = "uy_d"; + varNames[8] = "uz_d"; + varNames[9] = "umag"; + varNames[10] = "umag_s"; + varNames[11] = "umag_d"; + varNames[12] = "rho"; + varNames[13] = "div"; + varNames[14] = "w1"; + varNames[15] = "w2"; + varNames[16] = "w3"; + varNames[17] = "vort"; + WriteSingleLevelPlotfile(plotfilename,output,varNames,geom,0.0,step); +} + +void Read_Copy_MF_Checkpoint(amrex::MultiFab& mf, std::string mf_name, const std::string& checkpointname, + BoxArray& ba_old, DistributionMapping& dmap_old, + int NVARS, int ghost, const amrex::IntVect ngc, + int nodal_flag) +{ + // Read into temporary MF from file + MultiFab mf_temp; + VisMF::Read(mf_temp,amrex::MultiFabFileFullPrefix(0, checkpointname, "Level_", mf_name)); + + // Copy temporary MF into the new MF + if (ghost) { + mf.ParallelCopy(mf_temp, 0, 0, NVARS, ngc, ngc); + } + else { + mf.ParallelCopy(mf_temp, 0, 0, NVARS, 0, 0); + } +} + +void ShiftFaceToCC(const MultiFab& face_in, int face_comp, + MultiFab& cc_in, int cc_comp, int ncomp) +{ + + BL_PROFILE_VAR("ShiftFaceToCC()",ShiftFaceToCC); + + if (!face_in.is_nodal(0) && 
!face_in.is_nodal(1) && !face_in.is_nodal(2)) { + Abort("ShiftFaceToCC requires a face-centered MultiFab"); + } + + // Loop over boxes (note that mfi takes a cell-centered multifab as an argument) + for (MFIter mfi(cc_in,TilingIfNotGPU()); mfi.isValid(); ++mfi) { + + const Box& bx = mfi.tilebox(); + + Array4 const& face = face_in.array(mfi); + + Array4 const& cc = cc_in.array(mfi); + + amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept + { + cc(i,j,k,cc_comp+n) = face(i,j,k,face_comp+n); + }); + } +} + +void ComputeGrad(const MultiFab & phi_in, std::array & gphi, + int start_incomp, int start_outcomp, int ncomp, int bccomp, const Geometry & geom, + int increment) +{ + BL_PROFILE_VAR("ComputeGrad()",ComputeGrad); + + // Physical Domain + Box dom(geom.Domain()); + + const GpuArray dx = geom.CellSizeArray(); + + // if not incrementing, initialize data to zero + if (increment == 0) { + for (int dir=0; dir & phi = phi_in.array(mfi); + + AMREX_D_TERM(const Array4 & gphix = gphi[0].array(mfi);, + const Array4 & gphiy = gphi[1].array(mfi);, + const Array4 & gphiz = gphi[2].array(mfi);); + + AMREX_D_TERM(const Box & bx_x = mfi.nodaltilebox(0);, + const Box & bx_y = mfi.nodaltilebox(1);, + const Box & bx_z = mfi.nodaltilebox(2);); + + amrex::ParallelFor(bx_x, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept + { + gphix(i,j,k,start_outcomp+n) += (phi(i,j,k,start_incomp+n)-phi(i-1,j,k,start_incomp+n)) / dx[0]; + }, + bx_y, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept + { + gphiy(i,j,k,start_outcomp+n) += (phi(i,j,k,start_incomp+n)-phi(i,j-1,k,start_incomp+n)) / dx[1]; + } + , bx_z, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept + { + gphiz(i,j,k,start_outcomp+n) += (phi(i,j,k,start_incomp+n)-phi(i,j,k-1,start_incomp+n)) / dx[2]; + } + ); + + } // end MFIter +} + +void FCMoments(const std::array& m1, + const amrex::Vector& comps, + std::array& mscr, + const int& power, + 
amrex::Vector& prod_val) +{ + + BL_PROFILE_VAR("FCMoments()",FCMoments); + + for (int d=0; d<3; ++d) { + MultiFab::Copy(mscr[d],m1[d],comps[d],0,1,0); + for(int i=1; i& m1, + amrex::Vector& sum) +{ + BL_PROFILE_VAR("SumStag()",SumStag); + + // Initialize to zero + std::fill(sum.begin(), sum.end(), 0.); + + ReduceOps reduce_op; + + //////// x-faces + + ReduceData reduce_datax(reduce_op); + using ReduceTuple = typename decltype(reduce_datax)::Type; + + for (MFIter mfi(m1[0],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + const Box& bx_grid = mfi.validbox(); + + auto const& fab = m1[0].array(mfi); + + int xlo = bx_grid.smallEnd(0); + int xhi = bx_grid.bigEnd(0); + + reduce_op.eval(bx, reduce_datax, + [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple + { + Real weight = (i>xlo && i(reduce_datax.value()); + ParallelDescriptor::ReduceRealSum(sum[0]); + + //////// y-faces + + ReduceData reduce_datay(reduce_op); + + for (MFIter mfi(m1[1],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + const Box& bx_grid = mfi.validbox(); + + auto const& fab = m1[1].array(mfi); + + int ylo = bx_grid.smallEnd(1); + int yhi = bx_grid.bigEnd(1); + + reduce_op.eval(bx, reduce_datay, + [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple + { + Real weight = (j>ylo && j(reduce_datay.value()); + ParallelDescriptor::ReduceRealSum(sum[1]); + + //////// z-faces + + ReduceData reduce_dataz(reduce_op); + + for (MFIter mfi(m1[2],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + const Box& bx_grid = mfi.validbox(); + + auto const& fab = m1[2].array(mfi); + + int zlo = bx_grid.smallEnd(2); + int zhi = bx_grid.bigEnd(2); + + reduce_op.eval(bx, reduce_dataz, + [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple + { + Real weight = (k>zlo && k(reduce_dataz.value()); + ParallelDescriptor::ReduceRealSum(sum[2]); +} + diff --git a/exec/compressible_stag/build_perlmutter.sh 
b/exec/compressible_stag/build_perlmutter.sh new file mode 100755 index 00000000..2118c705 --- /dev/null +++ b/exec/compressible_stag/build_perlmutter.sh @@ -0,0 +1,30 @@ +#!/usr/bin/bash + +# required dependencies +module load gpu +module load PrgEnv-gnu +module load craype +module load craype-x86-milan +module load craype-accel-nvidia80 +module load cudatoolkit +module load cmake/3.24.3 + +# necessary to use CUDA-Aware MPI and run a job +export CRAY_ACCEL_TARGET=nvidia80 + +# optimize CUDA compilation for A100 +export AMREX_CUDA_ARCH=8.0 + +# optimize CPU microarchitecture for AMD EPYC 3rd Gen (Milan/Zen3) +# note: the cc/CC/ftn wrappers below add those +export CXXFLAGS="-march=znver3" +export CFLAGS="-march=znver3" + +# compiler environment hints +export CC=cc +export CXX=CC +export FC=ftn +export CUDACXX=$(which nvcc) +export CUDAHOSTCXX=CC + +make -j10 USE_CUDA=TRUE DO_TURB=TRUE MAX_SPEC=2 USE_HEFFTE_CUFFT=TRUE USE_ASSERTION=TRUE