diff --git a/makefiles/make.daint b/makefiles/make.daint
index 980889c..1145da3 100644
--- a/makefiles/make.daint
+++ b/makefiles/make.daint
@@ -2,7 +2,7 @@ CXX=CC
 LD=CC
 
 ACCFFT_ROOT=../accfft/build/
-accfft?=true
+accfft?=false
 
 ifeq "$(compiler)" "intel"
 #CPPFLAGS += -dynamic
@@ -19,4 +19,4 @@ ifeq "$(accfft)" "true"
 LIBS += $(CRAY_CUDATOOLKIT_POST_LINK_OPTS)
 endif
 
-#CPPFLAGS += -DCUP_ASYNC_DUMP
+CPPFLAGS += -DCUP_ASYNC_DUMP
diff --git a/source/Definitions.h b/source/Definitions.h
index cf409b0..99a6d46 100644
--- a/source/Definitions.h
+++ b/source/Definitions.h
@@ -36,6 +36,8 @@
 
 CubismUP_3D_NAMESPACE_BEGIN
 
+//#define ENERGY_FLUX_SPECTRUM 1
+
 enum { FE_CHI = 0, FE_U, FE_V, FE_W, FE_P, FE_TMPU, FE_TMPV, FE_TMPW };
 struct FluidElement
 {
diff --git a/source/Simulation.cpp b/source/Simulation.cpp
index e906e05..ca45e54 100644
--- a/source/Simulation.cpp
+++ b/source/Simulation.cpp
@@ -164,7 +164,7 @@ void Simulation::_init(const bool restart)
   else
     _ic();
   MPI_Barrier(sim.app_comm);
-  _serialize("init");
+  //_serialize("init");
 
   assert(sim.obstacle_vector != nullptr);
   if (sim.rank == 0)
@@ -397,6 +397,7 @@ void Simulation::setupOperators()
       checkpointPostVelocity = new Checkpoint(sim, "PostVelocity"));
 
   //sim.pipeline.push_back(new HITfiltering(sim));
+  sim.pipeline.push_back(new StructureFunctions(sim));
 
   sim.pipeline.push_back(new Analysis(sim));
 
diff --git a/source/SimulationData.cpp b/source/SimulationData.cpp
index 1324f66..f77a525 100644
--- a/source/SimulationData.cpp
+++ b/source/SimulationData.cpp
@@ -75,7 +75,8 @@ SimulationData::SimulationData(MPI_Comm mpicomm, ArgumentParser &parser)
   bChannelFixedMassFlux = parser("-channelFixedMassFlux").asBool(false);
 
   bRungeKutta23 = parser("-RungeKutta23").asBool(false);
-  bAdvection3rdOrder = parser("-Advection3rdOrder").asBool(true);
+  //bAdvection3rdOrder = parser("-Advection3rdOrder").asBool(true);
+  bAdvection3rdOrder = parser("-Advection3rdOrder").asBool(false);
 
   uMax_forced = parser("-uMax_forced").asDouble(0.0);
 
diff --git a/source/main_RL_HIT.cpp b/source/main_RL_HIT.cpp
index b8bc0a8..69e66d2 100644
--- a/source/main_RL_HIT.cpp
+++ b/source/main_RL_HIT.cpp
@@ -23,7 +23,6 @@
 #include <sstream>
 
 using Real = cubismup3d::Real;
-static constexpr Real rew_baseline = - 1000.0;
 #define GIVE_LOCL_STATEVARS
 #define GIVE_GRID_STATEVARS
 
@@ -108,7 +107,7 @@ inline void app_main(
   // If every grid point is an agent: probably will allocate too much memory
   // and crash because smarties allocates a trajectory for each point
   // If only one agent: sequences will be garbled together and cannot
-  // send clean Sequences.
+  // send clean Episodes to smarties' learning algorithms.
   // Also, rememebr that separate agents are thread safe!
   // let's say that each fluid block has one agent
   const int nAgentPerBlock = 1;
@@ -172,10 +171,11 @@ inline void app_main(
     const Real timeUpdateLES = tau_eta / LES_RL_FREQ_A;
     const Real timeSimulationMax = LES_RL_N_TSIM * tau_integral;
     const int maxNumUpdatesPerSim = timeSimulationMax / timeUpdateLES;
-    if (0) { // enable all dumping. //  && wrank != 1
+    if (bEvaluating) { // enable all dumping. //  && wrank != 1
       sim.sim.b3Ddump = true;  sim.sim.muteAll  = false;
       sim.sim.b2Ddump = false; sim.sim.saveFreq = 0;
-      sim.sim.verbose = true;  sim.sim.saveTime = timeUpdateLES;
+      //sim.sim.verbose = true;  sim.sim.saveTime = timeUpdateLES;
+      sim.sim.verbose = true;  sim.sim.saveTime = tau_integral;
     }
     printf("Reset simulation up to time=0 with SGS for eps:%f nu:%f Re:%f. "
            "Max %d action turns per simulation.\n", target.eps,
diff --git a/source/operators/AdvectionDiffusion.cpp b/source/operators/AdvectionDiffusion.cpp
index 9f279f1..461d3a1 100644
--- a/source/operators/AdvectionDiffusion.cpp
+++ b/source/operators/AdvectionDiffusion.cpp
@@ -314,7 +314,8 @@ struct Upwind3rd
   const Real invU = 1 / std::max(EPS, _sim.uMax_measured);
   Upwind3rd(const SimulationData& s) : _sim(s) {}
 
-  template<StepType step, int dir> Real diffx(LabMPI& L, const FluidBlock& o, const Real uAbs[3], const int ix, const int iy, const int iz) const {
+  template<StepType step, int dir>
+  inline Real diffx(LabMPI& L, const FluidBlock& o, const Real uAbs[3], const int ix, const int iy, const int iz) const {
     const Real ucc = inp<step,dir>(L,ix,iy,iz);
     const Real um1 = inp<step,dir>(L,ix-1,iy,iz), um2 = inp<step,dir>(L,ix-2,iy,iz);
     const Real up1 = inp<step,dir>(L,ix+1,iy,iz), up2 = inp<step,dir>(L,ix+2,iy,iz);
@@ -329,7 +330,8 @@ struct Upwind3rd
       return uAbs[0]>0? 2*up1 +3*ucc -6*um1 +um2 : -up2 +6*up1 -3*ucc -2*um1;
     #endif
   }
-  template<StepType step, int dir> Real diffy(LabMPI& L, const FluidBlock& o, const Real uAbs[3], const int ix, const int iy, const int iz) const {
+  template<StepType step, int dir>
+  inline Real diffy(LabMPI& L, const FluidBlock& o, const Real uAbs[3], const int ix, const int iy, const int iz) const {
     const Real ucc = inp<step,dir>(L,ix,iy,iz);
     const Real um1 = inp<step,dir>(L,ix,iy-1,iz), um2 = inp<step,dir>(L,ix,iy-2,iz);
     const Real up1 = inp<step,dir>(L,ix,iy+1,iz), up2 = inp<step,dir>(L,ix,iy+2,iz);
@@ -344,7 +346,8 @@ struct Upwind3rd
       return uAbs[1]>0? 2*up1 +3*ucc -6*um1 +um2 : -up2 +6*up1 -3*ucc -2*um1;
     #endif
   }
-  template<StepType step, int dir> Real diffz(LabMPI& L, const FluidBlock& o, const Real uAbs[3], const int ix, const int iy, const int iz) const {
+  template<StepType step, int dir>
+  inline Real diffz(LabMPI& L, const FluidBlock& o, const Real uAbs[3], const int ix, const int iy, const int iz) const {
     const Real ucc = inp<step,dir>(L,ix,iy,iz);
     const Real um1 = inp<step,dir>(L,ix,iy,iz-1), um2 = inp<step,dir>(L,ix,iy,iz-2);
     const Real up1 = inp<step,dir>(L,ix,iy,iz+1), up2 = inp<step,dir>(L,ix,iy,iz+2);
@@ -359,7 +362,8 @@ struct Upwind3rd
       return uAbs[2]>0? 2*up1 +3*ucc -6*um1 +um2 : -up2 +6*up1 -3*ucc -2*um1;
     #endif
   }
-  template<StepType step, int dir> Real   lap(LabMPI& L, const FluidBlock& o, const int ix, const int iy, const int iz) const {
+  template<StepType step, int dir>
+  inline Real   lap(LabMPI& L, const FluidBlock& o, const int ix, const int iy, const int iz) const {
     return  inp<step,dir>(L,ix+1,iy,iz) + inp<step,dir>(L,ix-1,iy,iz)
           + inp<step,dir>(L,ix,iy+1,iz) + inp<step,dir>(L,ix,iy-1,iz)
           + inp<step,dir>(L,ix,iy,iz+1) + inp<step,dir>(L,ix,iy,iz-1)
@@ -532,7 +536,7 @@ void AdvectionDiffusion::operator()(const double dt)
   else
   {
     if(sim.bRungeKutta23) {
-      sim.startProfiler("AdvDiff Kernel");
+      sim.startProfiler("AdvDiff23 Kernel");
       if(sim.bAdvection3rdOrder) {
         const KernelAdvectDiffuse<RK1, Upwind3rd> K1(sim);
         compute(K1);
@@ -545,10 +549,12 @@ void AdvectionDiffusion::operator()(const double dt)
         compute(K2);
       }
       sim.stopProfiler();
-      sim.startProfiler("AdvDiff copy");
-      const UpdateAndCorrectInflow U(sim);
-      U.operate<false>();
-      sim.stopProfiler();
+      if(not sim.bUseFourierBC) {
+        sim.startProfiler("AdvDiff copy");
+        const UpdateAndCorrectInflow U(sim);
+        U.operate<false>();
+        sim.stopProfiler();
+      }
     } else {
       sim.startProfiler("AdvDiff Kernel");
       if(sim.bAdvection3rdOrder) {
diff --git a/source/operators/HITfiltering.cpp b/source/operators/HITfiltering.cpp
index 2a237ed..4d6e4b9 100644
--- a/source/operators/HITfiltering.cpp
+++ b/source/operators/HITfiltering.cpp
@@ -143,8 +143,9 @@ void HITfiltering::operator()(const double dt)
 {
   if ( not (sim.timeAnalysis>0 && (sim.time+dt) >= sim.nextAnalysisTime) )
     return;
+  if (sim.muteAll) return;
 
-  sim.startProfiler("SGS Kernel");
+  sim.startProfiler("HITfiltering Kernel");
   const int BPDX = sim.grid->getBlocksPerDimension(0);
   const int BPDY = sim.grid->getBlocksPerDimension(1);
   const int BPDZ = sim.grid->getBlocksPerDimension(2);
@@ -246,4 +247,164 @@ void HITfiltering::operator()(const double dt)
   check("SGS");
 }
 
+StructureFunctions::StructureFunctions(SimulationData& s) : Operator(s)
+{
+  std::random_device rd;
+  gen.seed(rd());
+}
+
+inline Real periodic_distance(const Real x1, const Real x0, const Real extent)
+{
+  const Real dx = x1 - x0;
+  if      (dx >   extent/2) return dx - extent;
+  else if (dx <= -extent/2) return dx + extent;
+  else return dx;
+}
+inline Real periodic_distance(const std::array<Real,3> & p1,
+                     const std::array<Real,3> & p0,
+                     const std::array<Real,3> & extent)
+{
+  const Real dx = periodic_distance(p1[0], p0[0], extent[0]);
+  const Real dy = periodic_distance(p1[1], p0[1], extent[1]);
+  const Real dz = periodic_distance(p1[2], p0[2], extent[2]);
+  return std::sqrt(dx*dx + dy*dy + dz*dz);
+}
+
+std::array<double, 6> StructureFunctions::pick_ref_point()
+{
+  std::uniform_int_distribution<int> distrib_ranks(0, sim.nprocs-1);
+  std::uniform_int_distribution<size_t> distrib_block(0, vInfo.size()-1);
+  std::uniform_int_distribution<int> distrib_elem(0, CUP_BLOCK_SIZE-1);
+  int ref_rank = distrib_ranks(gen);
+  MPI_Bcast(&ref_rank, 1, MPI_INT, 0, sim.app_comm);
+  const size_t ref_bid  = distrib_block(gen);
+  const int ref_iz = distrib_elem(gen);
+  const int ref_iy = distrib_elem(gen);
+  const int ref_ix = distrib_elem(gen);
+  const BlockInfo & ref_info = vInfo[ref_bid];
+  FluidBlock & ref_block = * (FluidBlock*) ref_info.ptrBlock;
+  const FluidElement & ref_elem = ref_block(ref_ix, ref_iy, ref_iz);
+  const std::array<Real,3> ref_pos = ref_info.pos<Real>(ref_ix, ref_iy, ref_iz);
+  const std::array<Real,3> ref_vel = {ref_elem.u, ref_elem.v, ref_elem.w};
+  std::array<double, 6> ref = {0};
+  if(sim.rank == ref_rank) {
+    ref[0] = ref_pos[0];
+    ref[1] = ref_pos[1];
+    ref[2] = ref_pos[2];
+    ref[3] = ref_vel[0];
+    ref[4] = ref_vel[1];
+    ref[5] = ref_vel[2];
+  }
+  MPI_Allreduce(MPI_IN_PLACE, ref.data(), 6, MPI_DOUBLE, MPI_SUM, sim.app_comm);
+  if(sim.rank == ref_rank) {
+    assert(std::fabs(ref[0] - ref_pos[0]) < 1e-8);
+    assert(std::fabs(ref[1] - ref_pos[1]) < 1e-8);
+    assert(std::fabs(ref[2] - ref_pos[2]) < 1e-8);
+    assert(std::fabs(ref[3] - ref_vel[0]) < 1e-8);
+    assert(std::fabs(ref[4] - ref_vel[1]) < 1e-8);
+    assert(std::fabs(ref[5] - ref_vel[2]) < 1e-8);
+  }
+  return ref;
+}
+
+
+void StructureFunctions::operator()(const double dt)
+{
+  if (sim.muteAll) return;
+  if (computeInterval <= 0 or (sim.time+dt) < nextComputeTime)
+    return;
+  nextComputeTime += computeInterval;
+
+  sim.startProfiler("StructureFunctions Kernel");
+
+  auto ref = pick_ref_point();
+  const std::array<Real,3> ref_pos = {ref[0], ref[1], ref[2]};
+  const std::array<Real,3> ref_vel = {ref[3], ref[4], ref[5]};
+
+  static constexpr size_t oneD_ref_gridN = 32; // LES resolution
+  const Real delta_increments = sim.maxextent / oneD_ref_gridN;
+  static constexpr size_t n_shells = oneD_ref_gridN / 2;
+
+  unsigned long counts[n_shells] = {0};
+  double sum_S2[n_shells] = {0.0}, sum_S3[n_shells] = {0.0};
+  double sum_S4[n_shells] = {0.0}, sum_S6[n_shells] = {0.0};
+  double sum_A3[n_shells] = {0.0};
+
+  const auto get_shell_id = [=](const Real delta) {
+    // first shell goes from 0 to 1.5 delta_increments
+    // then from 1.5 to 2.5 and so on, so we can compute with 0:N / eta
+    // NOTE: 'average' radius is = 3/4 * (B^4 - A^4) / (B^3 - A^3)
+    // where B and A are external and internal radius of shell respectively
+    if (delta <= delta_increments * 1.5) return (size_t) 0;
+    const Real delta_nondim = delta / delta_increments; //should be at least 1.5
+    assert(delta_nondim >= 1.5);
+    // in [1.5, 2.5) return 1, in [2.5, 3.5) return 2 and so on:
+    const size_t shell_id = std::max(delta_nondim - 0.5, (Real) 1);
+    return shell_id;
+  };
+  #pragma omp parallel for schedule(static) reduction(+ : counts[:n_shells], \
+                                                          sum_S2[:n_shells], \
+                                                          sum_S3[:n_shells], \
+                                                          sum_S4[:n_shells], \
+                                                          sum_S6[:n_shells], \
+                                                          sum_A3[:n_shells])
+  for (size_t i = 0; i < vInfo.size(); ++i)
+  {
+    const BlockInfo & info = vInfo[i];
+    FluidBlock& block = * (FluidBlock*) info.ptrBlock;
+
+    for (int iz = 0; iz < CUP_BLOCK_SIZE; ++iz)
+    for (int iy = 0; iy < CUP_BLOCK_SIZE; ++iy)
+    for (int ix = 0; ix < CUP_BLOCK_SIZE; ++ix)
+    {
+      const std::array<Real,3> pos = info.pos<Real>(ix, iy, iz);
+      const Real delta = periodic_distance(pos, ref_pos, sim.extent);
+      const size_t shell_id = get_shell_id(delta);
+
+      if (shell_id >= n_shells) continue;
+      const Real rx = pos[0] - ref_pos[0];
+      const Real ry = pos[1] - ref_pos[1];
+      const Real rz = pos[2] - ref_pos[2];
+      const Real rnorm = std::max(std::sqrt(rx*rx + ry*ry + rz*rz),
+                                  std::numeric_limits<Real>::epsilon());
+      const Real ex = rx/rnorm, ey = ry/rnorm, ez = rz/rnorm;
+      const Real du = block(ix,iy,iz).u - ref_vel[0];
+      const Real dv = block(ix,iy,iz).v - ref_vel[1];
+      const Real dw = block(ix,iy,iz).w - ref_vel[2];
+      const Real deltaU = du*ex + dv*ey + dw*ez;
+      counts[shell_id] += 1;
+      sum_S2[shell_id] += std::pow(deltaU, 2);
+      sum_S3[shell_id] += std::pow(deltaU, 3);
+      sum_S4[shell_id] += std::pow(deltaU, 4);
+      sum_S6[shell_id] += std::pow(deltaU, 6);
+      sum_A3[shell_id] += std::pow(std::fabs(deltaU), 3);
+    }
+  }
+
+  MPI_Allreduce(MPI_IN_PLACE, sum_S2, n_shells, MPI_DOUBLE, MPI_SUM, sim.app_comm);
+  MPI_Allreduce(MPI_IN_PLACE, sum_S3, n_shells, MPI_DOUBLE, MPI_SUM, sim.app_comm);
+  MPI_Allreduce(MPI_IN_PLACE, sum_S4, n_shells, MPI_DOUBLE, MPI_SUM, sim.app_comm);
+  MPI_Allreduce(MPI_IN_PLACE, sum_S6, n_shells, MPI_DOUBLE, MPI_SUM, sim.app_comm);
+  MPI_Allreduce(MPI_IN_PLACE, sum_A3, n_shells, MPI_DOUBLE, MPI_SUM, sim.app_comm);
+  MPI_Allreduce(MPI_IN_PLACE, counts, n_shells, MPI_UNSIGNED_LONG, MPI_SUM, sim.app_comm);
+
+  if(sim.rank==0 and not sim.muteAll)
+  {
+    std::vector<double> buffer;
+    buffer.insert(buffer.end(), sum_S2, sum_S2 + n_shells);
+    buffer.insert(buffer.end(), sum_S3, sum_S3 + n_shells);
+    buffer.insert(buffer.end(), sum_S4, sum_S4 + n_shells);
+    buffer.insert(buffer.end(), sum_S6, sum_S6 + n_shells);
+    buffer.insert(buffer.end(), sum_A3, sum_A3 + n_shells);
+    buffer.insert(buffer.end(), counts, counts + n_shells); // to double
+    FILE * pFile = fopen ("structureFunctionsAnalysis.raw", "ab");
+    fwrite (buffer.data(), sizeof(double), buffer.size(), pFile);
+    fflush(pFile); fclose(pFile);
+  }
+
+  sim.stopProfiler();
+
+  check("SGS");
+}
+
 CubismUP_3D_NAMESPACE_END
diff --git a/source/operators/HITfiltering.h b/source/operators/HITfiltering.h
index 47683b0..3e17ac1 100644
--- a/source/operators/HITfiltering.h
+++ b/source/operators/HITfiltering.h
@@ -24,5 +24,21 @@ class HITfiltering : public Operator
   std::string getName() { return "HITfiltering"; }
 };
 
+class StructureFunctions : public Operator
+{
+  std::mt19937 gen;
+  const Real computeInterval = sim.timeAnalysis / 10;
+  Real nextComputeTime = 0;
+
+  std::array<double, 6> pick_ref_point();
+
+public:
+  StructureFunctions(SimulationData& s);
+
+  void operator()(const double dt);
+
+  std::string getName() { return "StructureFunctions"; }
+};
+
 CubismUP_3D_NAMESPACE_END
 #endif
diff --git a/source/operators/SGS.cpp b/source/operators/SGS.cpp
index 81dc073..1398e42 100644
--- a/source/operators/SGS.cpp
+++ b/source/operators/SGS.cpp
@@ -9,6 +9,10 @@
 #include "SGS.h"
 
 CubismUP_3D_NAMESPACE_BEGIN using namespace cubism;
+//#define DSM_LILLY
+//#define DSM_LOCAL
+
+static constexpr Real EPS = std::numeric_limits<Real>::epsilon();
 
 struct SGSHelperElement
 {
@@ -74,6 +78,7 @@ class KernelSGS_SSM
       sgs.dwD = (LN.w+LS.w + LE.w+LW.w + LF.w+LB.w - L.w*6)/(h*h);
 
       o(ix, iy, iz).tmpU = sgs.nu;
+      if(!readFromChi) o(ix, iy, iz).chi = Cs2;
     }
   }
 };
@@ -159,7 +164,7 @@ inline  Real facFilter(const int i, const int j, const int k)
     return 4.0/64;
   else if (abs(i)+abs(j)+abs(k) == 0)    // Center cells
     return 8.0/64;
-  else // assert(false); // TODO: huguesl, is 0 a valid outcome?
+  else // assert(false);
   return 0;
 }
 
@@ -234,6 +239,10 @@ class KernelSGS_DSM
   const std::array<int, 3> stencil_start = {-3, -3, -3};
   const std::array<int, 3> stencil_end = {4, 4, 4};
   const StencilInfo stencil{-3,-3,-3, 4,4,4, true, {FE_U,FE_V,FE_W}};
+  #ifdef DSM_LILLY
+  mutable Real mean_l_dot_m = 0;
+  mutable Real mean_m_dot_m = 0;
+  #endif
 
   KernelSGS_DSM(SGSGridMPI * const _sgsGrid)
       : sgsGrid(_sgsGrid) {}
@@ -242,19 +251,19 @@ class KernelSGS_DSM
   template <typename Lab, typename BlockType>
   void operator()(Lab& lab, const BlockInfo& info, BlockType& o) const
   {
-    const Real h = info.h_gridpoint;
+    const Real h = info.h_gridpoint, mFac = 2 * h * h;
     for (int iz = 0; iz < FluidBlock::sizeZ; ++iz)
     for (int iy = 0; iy < FluidBlock::sizeY; ++iy)
     for (int ix = 0; ix < FluidBlock::sizeX; ++ix) {
 
       const filterFluidElement L_f(lab,ix,iy,iz, h);
 
-      const Real m_xx = L_f.shear_S_xx - 4 * L_f.shear * L_f.S_xx;
-      const Real m_xy = L_f.shear_S_xy - 4 * L_f.shear * L_f.S_xy;
-      const Real m_xz = L_f.shear_S_xz - 4 * L_f.shear * L_f.S_xz;
-      const Real m_yy = L_f.shear_S_yy - 4 * L_f.shear * L_f.S_yy;
-      const Real m_yz = L_f.shear_S_yz - 4 * L_f.shear * L_f.S_yz;
-      const Real m_zz = L_f.shear_S_zz - 4 * L_f.shear * L_f.S_zz;
+      const Real m_xx = mFac * (L_f.shear_S_xx - 4 * L_f.shear * L_f.S_xx);
+      const Real m_xy = mFac * (L_f.shear_S_xy - 4 * L_f.shear * L_f.S_xy);
+      const Real m_xz = mFac * (L_f.shear_S_xz - 4 * L_f.shear * L_f.S_xz);
+      const Real m_yy = mFac * (L_f.shear_S_yy - 4 * L_f.shear * L_f.S_yy);
+      const Real m_yz = mFac * (L_f.shear_S_yz - 4 * L_f.shear * L_f.S_yz);
+      const Real m_zz = mFac * (L_f.shear_S_zz - 4 * L_f.shear * L_f.S_zz);
 
       const Real traceTerm = 1.0/3 * (L_f.uu + L_f.vv + L_f.ww
                               - L_f.u * L_f.u - L_f.v * L_f.v - L_f.w * L_f.w);
@@ -273,6 +282,10 @@ class KernelSGS_DSM
 
       o(ix,iy,iz).tmpV = l_dot_m;
       o(ix,iy,iz).tmpW = m_dot_m;
+      #ifdef DSM_LILLY
+        mean_l_dot_m += l_dot_m;
+        mean_m_dot_m += m_dot_m;
+      #endif
     }
   }
 };
@@ -285,7 +298,8 @@ class KernelSGS_DSM_avg
  public:
   const std::array<int, 3> stencil_start = {-1, -1, -1};
   const std::array<int, 3> stencil_end = {2, 2, 2};
-  const StencilInfo stencil{-1,-1,-1, 2,2,2, true, {FE_U,FE_V,FE_W}};
+  const StencilInfo stencil{-1,-1,-1, 2,2,2, true, {FE_U,FE_V,FE_W,
+                                                    FE_TMPV,FE_TMPW}};
 
   KernelSGS_DSM_avg(SGSGridMPI * const _sgsGrid)
       : sgsGrid(_sgsGrid) {}
@@ -317,20 +331,21 @@ class KernelSGS_DSM_avg
                                    +(d1udz1+d1wdx1)*(d1udz1+d1wdx1)
                                    +(d1wdy1+d1vdz1)*(d1wdy1+d1vdz1))/(2*h);
 
-      Real l_dot_m = 0.0;
-      Real m_dot_m = 0.0;
+      #ifndef DSM_LOCAL
+      Real l_dot_m = 0.0, m_dot_m = 0.0;
       for (int i=-1; i<2; ++i)
       for (int j=-1; j<2; ++j)
       for (int k=-1; k<2; ++k) {
-        const Real f = facFilter(i,j,k);
-        l_dot_m += f * lab(ix+i, iy+j, iz+k).tmpV;
-        m_dot_m += f * lab(ix+i, iy+j, iz+k).tmpW;
+        l_dot_m += facFilter(i,j,k) * lab(ix+i, iy+j, iz+k).tmpV;
+        m_dot_m += facFilter(i,j,k) * lab(ix+i, iy+j, iz+k).tmpW;
       }
-
-      Real Cs2 = (m_dot_m<=0) ? 0.0 : l_dot_m/2 / (h*h * m_dot_m);
-
-      if (Cs2 < 0) Cs2 = 0;
-      //if (Cs2 >= 0.25*0.25) Cs2 = 0.25*0.25;
+      const Real hat = l_dot_m            / std::max(m_dot_m,            EPS);
+      const Real loc = lab(ix,iy,iz).tmpV / std::max(lab(ix,iy,iz).tmpW, EPS);
+      const Real Cs2 = std::max({hat, loc, (Real) 0});
+      #else
+      const Real Cs2 = std::max(lab(ix,iy,iz).tmpV, EPS)
+                     / std::max(lab(ix,iy,iz).tmpW, EPS);
+      #endif
 
       sgs.nu = Cs2 * h*h * shear;
       sgs.duD = (LN.u+LS.u + LE.u+LW.u + LF.u+LB.u - L.u*6)/(h*h);
@@ -466,11 +481,29 @@ void SGS::operator()(const double dt)
     //compute<KernelSGS_nonUniform>(sgs);
   } else {
     if (sim.sgs=="DSM" or sim.cs < 0) { // Dynamic Smagorinsky Model
-      const KernelSGS_DSM computeCs(sgsGrid);
-      compute(computeCs);
-
-      const KernelSGS_DSM_avg averageCs(sgsGrid);
-      compute(averageCs);
+      #ifndef DSM_LILLY
+        const KernelSGS_DSM computeCs(sgsGrid);
+        compute(computeCs);
+
+        const KernelSGS_DSM_avg averageCs(sgsGrid);
+        compute(averageCs);
+      #else
+        const int nthreads = omp_get_max_threads();
+        std::vector<KernelSGS_DSM*> K(nthreads, nullptr);
+        for(int i=0; i<nthreads; ++i) K[i] = new KernelSGS_DSM(sgsGrid);
+        compute(K);
+        double mean[2];
+        for(int i=0; i<nthreads; ++i) {
+          mean[0] += K[i]->mean_l_dot_m;
+          mean[1] += K[i]->mean_m_dot_m;
+          delete K[i];
+        }
+        MPI_Allreduce(MPI_IN_PLACE, mean, 2, MPI_DOUBLE, MPI_SUM, sim.app_comm);
+        const Real CS2 = mean[0] / std::max(mean[1], EPS); // prevent nan
+        const Real Cs = std::sqrt(std::max(CS2, EPS));     // prevent nan
+        const KernelSGS_SSM<false> applyCs(sgsGrid, Cs);
+        compute(applyCs);
+      #endif
     }
     else if (sim.sgs=="SSM") { // Standard Smagorinsky Model
       const KernelSGS_SSM<false> K(sgsGrid, sim.cs);
diff --git a/source/operators/SGS_RL.cpp b/source/operators/SGS_RL.cpp
index 7044544..e98739d 100644
--- a/source/operators/SGS_RL.cpp
+++ b/source/operators/SGS_RL.cpp
@@ -508,9 +508,8 @@ void SGS_RL::run(const double dt, const bool RLinit, const bool RLover,
         // locS[ 0],
         locS[ 2], locS[ 3], locS[ 4], locS[ 5], locS[ 6],
         locS[ 7], locS[ 8], locS[ 9], locS[10], locS[11], locS[12],
-        // tke_nonDim,
+        // lenInt_nonDim, tke_nonDim,
         visc_nonDim, dissi_nonDim,
-        // lenInt_nonDim,
         En01_nonDim, En02_nonDim, En03_nonDim, En04_nonDim, En05_nonDim,
         En06_nonDim, En07_nonDim, En08_nonDim, En09_nonDim, En10_nonDim,
         En11_nonDim, En12_nonDim, En13_nonDim, En14_nonDim, En15_nonDim
diff --git a/source/spectralOperators/HITstatistics.h b/source/spectralOperators/HITstatistics.h
index eb6fd40..07d6640 100644
--- a/source/spectralOperators/HITstatistics.h
+++ b/source/spectralOperators/HITstatistics.h
@@ -20,7 +20,7 @@ struct HITstatistics
 {
   HITstatistics(const int maxGridSize, const Real maxBoxLength):
     N(maxGridSize), L(maxBoxLength),
-    k_msr(new Real[nBin]), E_msr(new Real[nBin]), cs2_msr(new Real[nBin])
+    k_msr(new Real[nBin]), E_msr(new Real[nBin]), Eflux(new Real[nBin])
   {
     //printf("maxGridSize %d %d %d\n", maxGridSize, N, nyquist);
     reset();
@@ -28,7 +28,7 @@ struct HITstatistics
   }
 
   HITstatistics(const HITstatistics&c) : N(c.N), L(c.L),
-    k_msr(new Real[nBin]), E_msr(new Real[nBin]), cs2_msr(new Real[nBin])
+    k_msr(new Real[nBin]), E_msr(new Real[nBin]), Eflux(new Real[nBin])
   {
     //printf("maxGridSize %d %d %d\n", c.N, N, nyquist);
     reset();
@@ -39,7 +39,7 @@ struct HITstatistics
   {
     delete [] k_msr;
     delete [] E_msr;
-    delete [] cs2_msr;
+    delete [] Eflux;
   }
 
   void reset()
@@ -49,7 +49,7 @@ struct HITstatistics
     tau_integral = 0; l_integral = 0;
     lambda = 0; uprime = 0; Re_lambda = 0;
     memset(E_msr, 0, nBin * sizeof(Real));
-    memset(cs2_msr, 0, nBin * sizeof(Real));
+    memset(Eflux, 0, nBin * sizeof(Real));
   }
 
   void updateDerivedQuantities(const Real _nu, const Real _dt,
@@ -147,7 +147,7 @@ struct HITstatistics
   Real dt = 0, nu = 0;
   Real * const k_msr;
   Real * const E_msr;
-  Real * const cs2_msr;
+  Real * const Eflux;
 };
 
 
diff --git a/source/spectralOperators/HITtargetData.h b/source/spectralOperators/HITtargetData.h
index 0aebb7d..978144e 100644
--- a/source/spectralOperators/HITtargetData.h
+++ b/source/spectralOperators/HITtargetData.h
@@ -62,7 +62,7 @@ struct HITtargetData
     std::string line; char arg[32]; double stdev, _dt;
     std::string fpath = smartiesFolderStructure? "../../" : "./";
     std::ifstream file(fpath + "scalars_" + paramspec);
-    if (!file.is_open()) {
+    if (! file.good()) {
       printf("scalars FILE NOT FOUND\n");
       holdsTargetData = false;
       return;
@@ -128,7 +128,7 @@ struct HITtargetData
     logE_mean.clear(); mode.clear();
     std::string fpath = smartiesFolderStructure? "../../" : "./";
     std::ifstream file(fpath + "spectrumLogE_" + paramspec);
-    if (!file.is_open()) {
+    if (!file.good()) {
       printf("spectrumLogE FILE NOT FOUND\n");
       holdsTargetData = false;
       return;
@@ -153,22 +153,23 @@ struct HITtargetData
         nModes, std::vector<double>(nModes,0) );
     std::string fpath = smartiesFolderStructure? "../../" : "./";
     std::ifstream file(fpath + "invCovLogE_" + paramspec);
-    if (!file.is_open()) {
+    if (!file.good()) {
       printf("invCovLogE FILE NOT FOUND\n");
       holdsTargetData = false;
       return;
     }
 
-    int j = 0;
+    size_t j = 0;
     while (std::getline(file, line)) {
-        int i = 0;
+        size_t i = 0;
         std::istringstream linestream(line);
         while (std::getline(linestream, line, ','))
           logE_invCov[j][i++] = std::stof(line);
-        assert(i >= nBin);
+        assert(i >= (size_t) nBin);
         j++;
     }
-    assert(j >= nBin);
+    assert(j >= (size_t) nBin);
+    nModes = std::min(j, nModes);
     file.close();
     file.open(fpath + "stdevLogE_" + paramspec);
     if (!file.is_open()) {
@@ -182,6 +183,8 @@ struct HITtargetData
         sscanf(line.c_str(), "%le", & logE_stdDev.back() );
     }
     assert((int) logE_stdDev.size() >= nBin);
+    nModes = std::min(logE_stdDev.size(), nModes);
+    printf("found %lu out of %d modes\n", nModes, nBin);
     //for (int i=0; i<nBin; ++i)  printf("%f %f %f\n", mode[i], logE_mean[i], logE_stdDev[i]);
     fflush(0);
   }
@@ -193,8 +196,9 @@ struct HITtargetData
 
   static std::vector<std::string> readTargetSpecs(std::string paramsList)
   {
-    std::stringstream ss(paramsList);
     std::vector<std::string> tokens;
+    if (paramsList == "") return tokens;
+    std::stringstream ss(paramsList);
     std::string item;
     while (getline(ss, item, ',')) tokens.push_back(item);
     return tokens;
@@ -206,8 +210,9 @@ struct HITtargetData
     for (int i=0; i<stats.nBin; ++i) logE[i] = std::log(stats.E_msr[i]);
     const long double fac = 0.5 / stats.nBin;
     long double dev = 0;
-    for (int j=0; j<stats.nBin; ++j)
-      for (int i=0; i<stats.nBin; ++i) {
+    const size_t N = std::min((size_t) stats.nBin, nModes);
+    for (size_t j=0; j<N; ++j)
+      for (size_t i=0; i<N; ++i) {
         const long double dLogEi = logE[i] - logE_mean[i];
         const long double dLogEj = logE[j] - logE_mean[j];
         dev += fac * dLogEj * logE_invCov[j][i] * dLogEi;
@@ -223,7 +228,8 @@ struct HITtargetData
   {
     long double ret = 0;
     const long double fac = 0.5 / stats.nBin;
-    for (int i=0; i<stats.nBin; ++i) {
+    const size_t N = std::min((size_t) stats.nBin, nModes);
+    for (size_t i=0; i<N; ++i) {
       const long double dLogEi = std::log(stats.E_msr[i]) - logE_mean[i];
       ret += fac * std::pow(dLogEi / logE_stdDev[i], 2);
     }
@@ -233,7 +239,8 @@ struct HITtargetData
   {
     long double ret = 0;
     const long double fac = 0.5 / stats.nBin;
-    for (int i=0; i<stats.nBin; ++i) {
+    const size_t N = std::min((size_t) stats.nBin, nModes);
+    for (size_t i=0; i<N; ++i) {
       const long double dLogEi = std::log(stats.E_msr[i]) - logE_mean[i];
       ret += fac * std::pow(dLogEi / logE_stdDev[i], 2);
       //ret += arg>4 ? std::exp(-2.0)/(arg-1) : std::exp(-arg);
@@ -288,7 +295,7 @@ struct HITtargetData
     size_t & pSamplesCount, long double & avgP, long double & m2P,
     const Real CS, const bool bPrint = true)
   {
-    const double newP = computeLogP(stats);
+    const double newP = - computeDiagLogArg(stats);
     pSamplesCount ++;
     assert(pSamplesCount > 0);
     const auto alpha = 1.0 / (long double) pSamplesCount;
@@ -307,6 +314,7 @@ struct HITtargetData
 };
 
 CubismUP_3D_NAMESPACE_END
+
 #endif
 
 #if 0
@@ -337,3 +345,4 @@ inline void updateGradScaling(const cubismup3d::SimulationData& sim,
   else scaling_factor = (1-beta) * scaling_factor + beta * newScaling;
 }
 #endif
+
diff --git a/source/spectralOperators/SpectralAnalysis.cpp b/source/spectralOperators/SpectralAnalysis.cpp
index fc64382..2859bb1 100644
--- a/source/spectralOperators/SpectralAnalysis.cpp
+++ b/source/spectralOperators/SpectralAnalysis.cpp
@@ -32,9 +32,13 @@ SpectralAnalysis::SpectralAnalysis(SimulationData & s)
   s.spectralManip->prepareBwd();
   sM = s.spectralManip;
 
-  //target = new HITtargetData(sM->maxGridN, "");
-  //target->smartiesFolderStructure = false;
-  //target->readAll("target");
+  target = new HITtargetData(sM->maxGridN, "");
+  target->smartiesFolderStructure = false;
+  target->readAll("target");
+  if (not target->holdsTargetData) {
+    delete target;
+    target = nullptr;
+  } else s.saveTime = target->tInteg;
 }
 
 void SpectralAnalysis::_cub2fftw()
@@ -131,10 +135,18 @@ void SpectralAnalysis::dump2File() const
 
   if(sM->sim.rank==0 and not sM->sim.muteAll) {
     buf.reserve(buf.size() + sM->stats.nBin);
-    for (int i = 0; i < sM->stats.nBin; ++i) buf.push_back(sM->stats.E_msr[i]);
+    for (int i=0; i<sM->stats.nBin; ++i) buf.push_back(sM->stats.E_msr[i]);
     FILE * pFile = fopen ("spectralAnalysis.raw", "ab");
     fwrite (buf.data(), sizeof(double), buf.size(), pFile);
     fflush(pFile); fclose(pFile);
+    #ifdef ENERGY_FLUX_SPECTRUM
+    buf.clear();
+    for (int i=0; i<sM->stats.nBin; ++i) buf.push_back(sM->stats.Eflux[i]);
+    //buf = std::vector<double>(sM->stats.Eflux, sM->stats.Eflux +sM->stats.nBin);
+    pFile = fopen ("fluxAnalysis.raw", "ab");
+    fwrite (buf.data(), sizeof(double), buf.size(), pFile);
+    fflush(pFile); fclose(pFile);
+    #endif
   }
 
   #if 0
diff --git a/source/spectralOperators/SpectralForcing.cpp b/source/spectralOperators/SpectralForcing.cpp
index bee9403..bab0fa6 100644
--- a/source/spectralOperators/SpectralForcing.cpp
+++ b/source/spectralOperators/SpectralForcing.cpp
@@ -13,6 +13,34 @@
 CubismUP_3D_NAMESPACE_BEGIN
 using namespace cubism;
 
+#if defined(ENERGY_FLUX_SPECTRUM) && ENERGY_FLUX_SPECTRUM == 1
+struct KernelComputeEnergyFlux
+{
+  const std::array<int, 3> stencil_start = {-1,-1,-1}, stencil_end = {2, 2, 2};
+  const StencilInfo stencil{-1,-1,-1, 2,2,2, false, {FE_U,FE_V,FE_W}};
+
+  template <typename Lab, typename BlockType>
+  void operator()(Lab& lab, const BlockInfo& info, BlockType& o) const
+  {
+    const Real h = info.h_gridpoint, fac = 1.0/(4*h*h);
+    for (int iz = 0; iz < FluidBlock::sizeZ; ++iz)
+    for (int iy = 0; iy < FluidBlock::sizeY; ++iy)
+    for (int ix = 0; ix < FluidBlock::sizeX; ++ix) {
+      const FluidElement &LW=lab(ix-1,iy,iz), &LE=lab(ix+1,iy,iz);
+      const FluidElement &LS=lab(ix,iy-1,iz), &LN=lab(ix,iy+1,iz);
+      const FluidElement &LF=lab(ix,iy,iz-1), &LB=lab(ix,iy,iz+1);
+      const Real dudx = LE.u-LW.u, dvdx = LE.v-LW.v, dwdx = LE.w-LW.w;
+      const Real dudy = LN.u-LS.u, dvdy = LN.v-LS.v, dwdy = LN.w-LS.w;
+      const Real dudz = LB.u-LF.u, dvdz = LB.v-LF.v, dwdz = LB.w-LF.w;
+      const Real SijSij2 = (2*dudx*dudx + (dudy+dvdx)*(dudy+dvdx)
+                          + 2*dvdy*dvdy + (dudz+dwdx)*(dudz+dwdx)
+                          + 2*dwdz*dwdz + (dwdy+dvdz)*(dwdy+dvdz)) * fac;
+      o(ix,iy,iz).tmpU = o(ix,iy,iz).chi * h*h * std::pow(SijSij2, 1.5);
+    }
+  }
+};
+#endif
+
 SpectralForcing::SpectralForcing(SimulationData & s) : Operator(s)
 {
   initSpectralAnalysisSolver(s);
@@ -26,6 +54,11 @@ void SpectralForcing::operator()(const double dt)
   SpectralManip & sM = * sim.spectralManip;
   HITstatistics & stats = sM.stats;
 
+  #if defined(ENERGY_FLUX_SPECTRUM) && ENERGY_FLUX_SPECTRUM == 1
+    const KernelComputeEnergyFlux K;
+    compute(K);
+  #endif
+
   _cub2fftw();
 
   sM.runFwd();
@@ -80,6 +113,9 @@ void SpectralForcing::_cub2fftw() const
   Real * const data_u = sM.data_u;
   Real * const data_v = sM.data_v;
   Real * const data_w = sM.data_w;
+  #ifdef ENERGY_FLUX_SPECTRUM
+    Real * const data_j = sM.data_j;
+  #endif
 
   #pragma omp parallel for schedule(static)
   for(size_t i=0; i<NlocBlocks; ++i) {
@@ -92,6 +128,13 @@ void SpectralForcing::_cub2fftw() const
       data_u[src_index] = b(ix,iy,iz).u;
       data_v[src_index] = b(ix,iy,iz).v;
       data_w[src_index] = b(ix,iy,iz).w;
+      #ifdef ENERGY_FLUX_SPECTRUM
+        #if ENERGY_FLUX_SPECTRUM == 1
+          data_j[src_index] = b(ix,iy,iz).tmpU;
+        #else
+          data_j[src_index] = b(ix,iy,iz).chi;
+        #endif
+      #endif
     }
   }
 }
diff --git a/source/spectralOperators/SpectralManip.h b/source/spectralOperators/SpectralManip.h
index fa26dbc..eb762df 100644
--- a/source/spectralOperators/SpectralManip.h
+++ b/source/spectralOperators/SpectralManip.h
@@ -16,6 +16,8 @@
 
 CubismUP_3D_NAMESPACE_BEGIN
 
+//#define ENERGY_FLUX_SPECTRUM
+
 template <class T>
 inline T pow2(const T val) {
     return val*val;
@@ -66,8 +68,12 @@ class SpectralManip
   bool bAllocFwd = false;
   bool bAllocBwd = false;
 
-  Real * data_u, * data_v, * data_w;
-  // * data_cs2;
+  Real * data_u = nullptr;
+  Real * data_v = nullptr;
+  Real * data_w = nullptr;
+  #ifdef ENERGY_FLUX_SPECTRUM
+  Real * data_j = nullptr;
+  #endif
 
 public:
   const MPI_Comm m_comm = grid.getCartComm();
diff --git a/source/spectralOperators/SpectralManipACC.cpp b/source/spectralOperators/SpectralManipACC.cpp
index c7fb4f8..99d7caf 100644
--- a/source/spectralOperators/SpectralManipACC.cpp
+++ b/source/spectralOperators/SpectralManipACC.cpp
@@ -128,16 +128,12 @@ streams(new myCUDAstreams())
       printf("PoissonSolverPeriodic: something wrong in isize\n");
       abort();
     }
-
     cudaMalloc((void**) & gpu_u, alloc_max);
     cudaMalloc((void**) & gpu_v, alloc_max);
     cudaMalloc((void**) & gpu_w, alloc_max);
     acc_plan* P = accfft_plan_dft(totN, gpu_u, gpu_u, acc_comm, ACCFFT_MEASURE);
     plan = (void*) P;
     data_size = (size_t) myN[0] * (size_t) myN[1] * (size_t) 2*nz_hat;
-    data_u = (Real*) malloc(data_size*sizeof(Real));
-    data_v = (Real*) malloc(data_size*sizeof(Real));
-    data_w = (Real*) malloc(data_size*sizeof(Real));
     stridez = 1; // fast
     stridey = 2*nz_hat;
     stridex = myN[1] * 2*nz_hat; // slow
@@ -157,13 +153,16 @@ streams(new myCUDAstreams())
     ostart[1] = 0; istart[1] = 0;
     ostart[2] = 0; istart[2] = 0;
     data_size = (size_t) myN[2] * (size_t) myN[1] * (size_t) 2*nx_hat;
-    data_u = (Real*) malloc(data_size*sizeof(Real));
-    data_v = (Real*) malloc(data_size*sizeof(Real));
-    data_w = (Real*) malloc(data_size*sizeof(Real));
     stridez = myN[1] * 2*nx_hat; // fast
     stridey = 2*nx_hat;
     stridex = 1; // slow
   }
+  data_u = (Real*) malloc(data_size * sizeof(Real));
+  data_v = (Real*) malloc(data_size * sizeof(Real));
+  data_w = (Real*) malloc(data_size * sizeof(Real));
+  #ifdef ENERGY_FLUX_SPECTRUM
+  data_j = (Real*) malloc(data_size * sizeof(Real));
+  #endif
 }
 
 void SpectralManipACC::prepareFwd()
diff --git a/source/spectralOperators/SpectralManipFFTW.cpp b/source/spectralOperators/SpectralManipFFTW.cpp
index d49a4c3..848b849 100644
--- a/source/spectralOperators/SpectralManipFFTW.cpp
+++ b/source/spectralOperators/SpectralManipFFTW.cpp
@@ -19,23 +19,17 @@
 CubismUP_3D_NAMESPACE_BEGIN
 using namespace cubism;
 
-//static inline Real pow2_cplx(const fft_c cplx_val) {
-//  return pow2(cplx_val[0]) + pow2(cplx_val[1]);
-//}
-
 void SpectralManipFFTW::_compute_forcing()
 {
   fft_c *const cplxData_u  = (fft_c *) data_u;
   fft_c *const cplxData_v  = (fft_c *) data_v;
   fft_c *const cplxData_w  = (fft_c *) data_w;
-  //fft_c *const cplxData_cs2  = (fft_c *) data_cs2;
   const long nKx = static_cast<long>(gsize[0]);
   const long nKy = static_cast<long>(gsize[1]);
   const long nKz = static_cast<long>(gsize[2]);
   const Real waveFactorX = 2.0 * M_PI / sim.extent[0];
   const Real waveFactorY = 2.0 * M_PI / sim.extent[1];
   const Real waveFactorZ = 2.0 * M_PI / sim.extent[2];
-  //const Real wFacX = 2*M_PI / nKx, wFacY = 2*M_PI / nKy, wFacZ = 2*M_PI / nKz;
   const Real h = sim.uniformH();
 
   const long loc_n1 = local_n1, shifty = local_1_start;
@@ -47,10 +41,18 @@ void SpectralManipFFTW::_compute_forcing()
 
   Real tke = 0, eps = 0, lIntegral = 0, tkeFiltered = 0;
   Real * E_msr = stats.E_msr;
-  memset(E_msr, 0, nBins * sizeof(Real));
-
-  // Let's only measure spectrum up to Nyquist.
-  #pragma omp parallel for reduction(+ : E_msr[:nBins], tke, eps, lIntegral, tkeFiltered) schedule(static)
+  memset(E_msr, 0, nBins * sizeof(Real)); //Only measure spectrum up to Nyquist
+  #ifdef ENERGY_FLUX_SPECTRUM
+    Real * Eflux = stats.Eflux;
+    memset(Eflux, 0, nBins * sizeof(Real));
+    fft_c *const cplxData_j  = (fft_c *) data_j;
+  #endif
+
+  #ifdef ENERGY_FLUX_SPECTRUM
+    #pragma omp parallel for reduction(+ : E_msr[:nBins], Eflux[:nBins], tke, eps, lIntegral, tkeFiltered) schedule(static)
+  #else
+    #pragma omp parallel for reduction(+ : E_msr[:nBins], tke, eps, lIntegral, tkeFiltered) schedule(static)
+  #endif
   for(long j = 0; j<loc_n1; ++j)
   for(long i = 0; i<sizeX;  ++i)
   for(long k = 0; k<sizeZ_hat; ++k)
@@ -93,10 +95,22 @@ void SpectralManipFFTW::_compute_forcing()
       const size_t binID = std::floor(std::sqrt((Real) kind) * nyquist_scaling);
       assert(binID < nBins);
       E_msr[binID] += E;
-      //if (bComputeCs2Spectrum){
-      //  const Real cs2 = std::sqrt(pow2_cplx(cplxData_cs2[linidx]));
-      //  cs2_msr[binID] += mult*cs2;
-      //}
+      #ifdef ENERGY_FLUX_SPECTRUM
+        const Real JR = cplxData_j[linidx][0], JI = cplxData_j[linidx][1];
+        #if ENERGY_FLUX_SPECTRUM == 1
+          Eflux[binID] += mult*std::sqrt(std::max(JR*JR + JI+JI, (Real)0));
+        #else
+          const Real S11sqR = pow2(- UI * dXfac),  S11sqI = pow2(UR * dXfac);
+          const Real S22sqR = pow2(- VI * dYfac),  S22sqI = pow2(VR * dYfac);
+          const Real S33sqR = pow2(- WI * dZfac),  S33sqI = pow2(WR * dZfac);
+          const Real S12sqR = pow2(dUdYR+dVdXR), S12sqI = pow2(dUdYI+dVdXI);
+          const Real S23sqR = pow2(dVdZR+dWdYR), S23sqI = pow2(dVdZI+dWdYI);
+          const Real S31sqR = pow2(dWdXR+dUdZR), S31sqI = pow2(dWdXI+dUdZI);
+          const Real SR2 = 2*S11sqR +2*S22sqR +2*S33sqR +S12sqR +S23sqR +S31sqR;
+          const Real SI2 = 2*S11sqI +2*S22sqI +2*S33sqI +S12sqI +S23sqI +S31sqI;
+          Eflux[binID] += mult*h*h*(JR*std::pow(SR2,1.5) +JI*std::pow(SI2,1.5));
+        #endif
+      #endif
     }
 
     if (k2 > 0 && k2 < 3.5) {
@@ -114,15 +128,17 @@ void SpectralManipFFTW::_compute_forcing()
   MPI_Allreduce(MPI_IN_PLACE, &eps, 1, MPIREAL, MPI_SUM, m_comm);
   MPI_Allreduce(MPI_IN_PLACE, &lIntegral, 1, MPIREAL, MPI_SUM, m_comm);
   MPI_Allreduce(MPI_IN_PLACE, &tkeFiltered, 1, MPIREAL, MPI_SUM, m_comm);
-  //if (bComputeCs2Spectrum){
-  //  assert(false);
-    //#pragma omp parallel reduction(+ : cs2_msr[:nBin])
-    //MPI_Allreduce(MPI_IN_PLACE, cs2_msr, nBin, MPIREAL, MPI_SUM, sM->m_comm);
-  //}
+  #ifdef ENERGY_FLUX_SPECTRUM
+  MPI_Allreduce(MPI_IN_PLACE, Eflux, nBins, MPIREAL, MPI_SUM, m_comm);
+  #endif
 
   const Real normalization = 1.0 / pow2(normalizeFFT);
-  for (size_t binID = 0; binID < nBins; binID++) E_msr[binID] *= normalization;
-  //if (bComputeCs2Spectrum) cs2_msr[binID] /= normalizeFFT;
+  for (size_t binID = 0; binID < nBins; binID++) {
+    E_msr[binID] *= normalization;
+    #ifdef ENERGY_FLUX_SPECTRUM
+    Eflux[binID] *= normalization;
+    #endif
+  }
 
   stats.tke = tke * normalization;
   stats.l_integral = lIntegral / tke;
@@ -319,7 +335,10 @@ SpectralManipFFTW::SpectralManipFFTW(SimulationData&s): SpectralManip(s)
   data_u = _FFTW_(alloc_real)(2*alloc_local);
   data_v = _FFTW_(alloc_real)(2*alloc_local);
   data_w = _FFTW_(alloc_real)(2*alloc_local);
-  // data_cs2 = _FFTW_(alloc_real)(2*alloc_local);
+
+  #ifdef ENERGY_FLUX_SPECTRUM
+    data_j = _FFTW_(alloc_real)(2*alloc_local);
+  #endif
 }
 
 void SpectralManipFFTW::prepareFwd()
@@ -327,14 +346,17 @@ void SpectralManipFFTW::prepareFwd()
   if (bAllocFwd) return;
 
   fwd_u = (void*) _FFTW_(mpi_plan_dft_r2c_3d)(gsize[0], gsize[1], gsize[2],
-    data_u, (fft_c*)data_u, m_comm, FFTW_MPI_TRANSPOSED_OUT  | FFTW_MEASURE);
+    data_u, (fft_c*)data_u, m_comm, FFTW_MPI_TRANSPOSED_OUT | FFTW_MEASURE);
   fwd_v = (void*) _FFTW_(mpi_plan_dft_r2c_3d)(gsize[0], gsize[1], gsize[2],
-    data_v, (fft_c*)data_v, m_comm, FFTW_MPI_TRANSPOSED_OUT  | FFTW_MEASURE);
+    data_v, (fft_c*)data_v, m_comm, FFTW_MPI_TRANSPOSED_OUT | FFTW_MEASURE);
   fwd_w = (void*) _FFTW_(mpi_plan_dft_r2c_3d)(gsize[0], gsize[1], gsize[2],
-    data_w, (fft_c*)data_w, m_comm, FFTW_MPI_TRANSPOSED_OUT  | FFTW_MEASURE);
+    data_w, (fft_c*)data_w, m_comm, FFTW_MPI_TRANSPOSED_OUT | FFTW_MEASURE);
+
+  #ifdef ENERGY_FLUX_SPECTRUM
+  fwd_j = (void*) _FFTW_(mpi_plan_dft_r2c_3d)(gsize[0], gsize[1], gsize[2],
+    data_j, (fft_c*)data_j, m_comm, FFTW_MPI_TRANSPOSED_OUT | FFTW_MEASURE);
+  #endif
 
-  //fwd_cs2 = (void*) _FFTW_(mpi_plan_dft_r2c_3d)(gsize[0], gsize[1], gsize[2],
-  //data_cs2, (fft_c*)data_cs2, m_comm, FFTW_MPI_TRANSPOSED_OUT | FFTW_MEASURE);
   bAllocFwd = true;
 }
 
@@ -363,7 +385,9 @@ void SpectralManipFFTW::runFwd() const
   _FFTW_(execute)((fft_plan) fwd_v);
   _FFTW_(execute)((fft_plan) fwd_w);
 
-  // _FFTW_(execute)((fft_plan) fwd_cs2);
+  #ifdef ENERGY_FLUX_SPECTRUM
+    _FFTW_(execute)((fft_plan) fwd_j);
+  #endif
 }
 
 void SpectralManipFFTW::runBwd() const
@@ -383,18 +407,23 @@ SpectralManipFFTW::~SpectralManipFFTW()
   _FFTW_(free)(data_u);
   _FFTW_(free)(data_v);
   _FFTW_(free)(data_w);
-  // _FFTW_(free)(data_cs2);
+  #ifdef ENERGY_FLUX_SPECTRUM
+    _FFTW_(free)(data_j);
+  #endif
   if (bAllocFwd) {
     _FFTW_(destroy_plan)((fft_plan) fwd_u);
     _FFTW_(destroy_plan)((fft_plan) fwd_v);
     _FFTW_(destroy_plan)((fft_plan) fwd_w);
-    // _FFTW_(destroy_plan)((fft_plan) fwd_cs2);
+    #ifdef ENERGY_FLUX_SPECTRUM
+      _FFTW_(destroy_plan)((fft_plan) fwd_j);
+    #endif
   }
   if (bAllocBwd) {
     _FFTW_(destroy_plan)((fft_plan) bwd_u);
     _FFTW_(destroy_plan)((fft_plan) bwd_v);
     _FFTW_(destroy_plan)((fft_plan) bwd_w);
   }
+
   _FFTW_(mpi_cleanup)();
 }
 
diff --git a/source/spectralOperators/SpectralManipFFTW.h b/source/spectralOperators/SpectralManipFFTW.h
index a5b64bb..e4694f8 100644
--- a/source/spectralOperators/SpectralManipFFTW.h
+++ b/source/spectralOperators/SpectralManipFFTW.h
@@ -20,8 +20,15 @@ class SpectralManipFFTW : public SpectralManip
   ptrdiff_t alloc_local=0;
   ptrdiff_t local_n0=0, local_0_start=0;
   ptrdiff_t local_n1=0, local_1_start=0;
-  void * fwd_u, * fwd_v, * fwd_w, * fwd_cs2;
-  void * bwd_u, * bwd_v, * bwd_w;
+  void * fwd_u = nullptr;
+  void * fwd_v = nullptr;
+  void * fwd_w = nullptr;
+  #ifdef ENERGY_FLUX_SPECTRUM
+  void * fwd_j = nullptr;
+  #endif
+  void * bwd_u = nullptr;
+  void * bwd_v = nullptr;
+  void * bwd_w = nullptr;
 
 public: