From bcc67e19b44b6a7b11d2302c9487fd19ec722662 Mon Sep 17 00:00:00 2001 From: Luca Fedeli Date: Thu, 1 Feb 2024 19:07:28 +0100 Subject: [PATCH 01/13] Docs: update instructions for Adastra supercomputer (CINES, France) (#4655) * update instructions for Adastra supercomputer * remove empty line * fix bug * fix bug * fix bug --- Docs/source/install/hpc/adastra.rst | 30 ++++++---- .../adastra_warpx.profile.example | 28 ++++----- .../adastra-cines/install_dependencies.sh | 60 +++++++++---------- Tools/machines/adastra-cines/submit.sh | 17 ++++-- 4 files changed, 74 insertions(+), 61 deletions(-) diff --git a/Docs/source/install/hpc/adastra.rst b/Docs/source/install/hpc/adastra.rst index 44b07985670..0b984d5e2be 100644 --- a/Docs/source/install/hpc/adastra.rst +++ b/Docs/source/install/hpc/adastra.rst @@ -31,18 +31,26 @@ If you are new to this system, **please see the following resources**: Preparation ----------- +The following instructions will install WarpX in the ``$SHAREDHOMEDIR`` directory, +which is shared among all the members of a given project. Due to the inode +quota enforced for this machine, a shared installation of WarpX is advised. + Use the following commands to download the WarpX source code: .. code-block:: bash - git clone https://github.com/ECP-WarpX/WarpX.git $HOME/src/warpx + # If you have multiple projects, activate the project that you want to use with: + # + # myproject -a YOUR_PROJECT_NAME + # + git clone https://github.com/ECP-WarpX/WarpX.git $SHAREDHOMEDIR/src/warpx -We use system software modules, add environment hints and further dependencies via the file ``$HOME/adastra_warpx.profile``. +We use system software modules, add environment hints and further dependencies via the file ``$SHAREDHOMEDIR/adastra_warpx.profile``. Create it now: .. code-block:: bash - cp $HOME/src/warpx/Tools/machines/adastra-cines/adastra_warpx.profile.example $HOME/adastra_warpx.profile + cp $SHAREDHOMEDIR/src/warpx/Tools/machines/adastra-cines/adastra_warpx.profile.example $SHAREDHOMEDIR/adastra_warpx.profile .. dropdown:: Script Details :color: light @@ -53,8 +61,8 @@ Create it now: :language: bash Edit the 2nd line of this script, which sets the ``export proj=""`` variable using a text editor -such as ``nano``, ``emacs``, or ``vim`` (all available by default on -Adastra login nodes). +such as ``nano``, ``emacs``, or ``vim`` (all available by default on Adastra login nodes) and +uncomment the 3rd line (which sets ``$proj`` as the active project). .. important:: @@ -62,14 +70,14 @@ Adastra login nodes). .. code-block:: bash - source $HOME/adastra_warpx.profile + source $SHAREDHOMEDIR/adastra_warpx.profile Finally, since Adastra does not yet provide software modules for some of our dependencies, install them once: .. code-block:: bash - bash $HOME/src/warpx/Tools/machines/adastra-cines/install_dependencies.sh - source $HOME/sw/adastra/gpu/venvs/warpx-adastra/bin/activate + bash $SHAREDHOMEDIR/src/warpx/Tools/machines/adastra-cines/install_dependencies.sh + source $SHAREDHOMEDIR/sw/adastra/gpu/venvs/warpx-adastra/bin/activate .. dropdown:: Script Details :color: light @@ -89,13 +97,13 @@ Use the following :ref:`cmake commands ` to compile the applicat .. code-block:: bash - cd $HOME/src/warpx + cd $SHAREDHOMEDIR/src/warpx rm -rf build_adastra cmake -S . -B build_adastra -DWarpX_COMPUTE=HIP -DWarpX_PSATD=ON -DWarpX_QED_TABLE_GEN=ON -DWarpX_DIMS="1;2;RZ;3" cmake --build build_adastra -j 16 -The WarpX application executables are now in ``$HOME/src/warpx/build_adastra/bin/``. 
+The WarpX application executables are now in ``$SHAREDHOMEDIR/src/warpx/build_adastra/bin/``. Additionally, the following commands will install WarpX as a Python module: .. code-block:: bash @@ -119,7 +127,7 @@ If you already installed WarpX in the past and want to update it, start by getti .. code-block:: bash - cd $HOME/src/warpx + cd $SHAREDHOMEDIR/src/warpx # read the output of this command - does it look ok? git status diff --git a/Tools/machines/adastra-cines/adastra_warpx.profile.example b/Tools/machines/adastra-cines/adastra_warpx.profile.example index 23441638893..0d55e869d6a 100644 --- a/Tools/machines/adastra-cines/adastra_warpx.profile.example +++ b/Tools/machines/adastra-cines/adastra_warpx.profile.example @@ -1,30 +1,33 @@ -# please set your project account +# please set your project account and uncomment the following two lines #export proj=your_project_id +#myproject -a $proj # required dependencies +module purge +module load cpe/23.12 module load craype-accel-amd-gfx90a craype-x86-trento module load PrgEnv-cray +module load CCE-GPU-3.0.0 module load amd-mixed/5.2.3 -module load CPE-23.02-cce-15.0.1-GPU-softs # optional: for PSATD in RZ geometry support -export CMAKE_PREFIX_PATH=${HOME}/sw/adastra/gpu/blaspp-master:$CMAKE_PREFIX_PATH -export CMAKE_PREFIX_PATH=${HOME}/sw/adastra/gpu/lapackpp-master:$CMAKE_PREFIX_PATH -export LD_LIBRARY_PATH=${HOME}/sw/adastra/gpu/blaspp-master/lib64:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=${HOME}/sw/adastra/gpu/lapackpp-master/lib64:$LD_LIBRARY_PATH +export CMAKE_PREFIX_PATH=${SHAREDHOMEDIR}/sw/adastra/gpu/blaspp-master:$CMAKE_PREFIX_PATH +export CMAKE_PREFIX_PATH=${SHAREDHOMEDIR}/sw/adastra/gpu/lapackpp-master:$CMAKE_PREFIX_PATH +export LD_LIBRARY_PATH=${SHAREDHOMEDIR}/sw/adastra/gpu/blaspp-master/lib64:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=${SHAREDHOMEDIR}/sw/adastra/gpu/lapackpp-master/lib64:$LD_LIBRARY_PATH # optional: for QED lookup table generation support -module load boost/1.81.0-mpi-python3 +module load boost/1.83.0-mpi-python3 # optional: for openPMD support module load cray-hdf5-parallel -export CMAKE_PREFIX_PATH=${HOME}/sw/adastra/gpu/c-blosc-1.21.1:$CMAKE_PREFIX_PATH -export CMAKE_PREFIX_PATH=${HOME}/sw/adastra/gpu/adios2-2.8.3:$CMAKE_PREFIX_PATH +export CMAKE_PREFIX_PATH=${SHAREDHOMEDIR}/sw/adastra/gpu/c-blosc-1.21.1:$CMAKE_PREFIX_PATH +export CMAKE_PREFIX_PATH=${SHAREDHOMEDIR}/sw/adastra/gpu/adios2-2.8.3:$CMAKE_PREFIX_PATH export PATH=${HOME}/sw/adastra/gpu/adios2-2.8.3/bin:${PATH} # optional: for Python bindings or libEnsemble -module load cray-python/3.9.13.1 +module load cray-python/3.11.5 # fix system defaults: do not escape $ with a \ on tab completion shopt -s direxpand @@ -49,7 +52,4 @@ export AMREX_AMD_ARCH=gfx90a # compiler environment hints export CC=$(which cc) export CXX=$(which CC) -export FC=$(which ftn) -export CFLAGS="-I${ROCM_PATH}/include" -export CXXFLAGS="-I${ROCM_PATH}/include -Wno-pass-failed" -export LDFLAGS="-L${ROCM_PATH}/lib -lamdhip64" +export FC=$(which amdflang) diff --git a/Tools/machines/adastra-cines/install_dependencies.sh b/Tools/machines/adastra-cines/install_dependencies.sh index 8a4cef4a2ec..b48bf144c2a 100755 --- a/Tools/machines/adastra-cines/install_dependencies.sh +++ b/Tools/machines/adastra-cines/install_dependencies.sh @@ -20,7 +20,7 @@ if [ -z ${proj-} ]; then echo "WARNING: The 'proj' variable is not yet set in yo # Remove old dependencies ##################################################### # -SW_DIR="${HOME}/sw/adastra/gpu" +SW_DIR="${SHAREDHOMEDIR}/sw/adastra/gpu" rm -rf 
${SW_DIR} mkdir -p ${SW_DIR} @@ -34,62 +34,62 @@ python3 -m pip uninstall -qqq -y mpi4py 2>/dev/null || true # # BLAS++ (for PSATD+RZ) -if [ -d $HOME/src/blaspp ] +if [ -d $SHAREDHOMEDIR/src/blaspp ] then - cd $HOME/src/blaspp + cd $SHAREDHOMEDIR/src/blaspp git fetch --prune git checkout master git pull cd - else - git clone https://github.com/icl-utk-edu/blaspp.git $HOME/src/blaspp + git clone https://github.com/icl-utk-edu/blaspp.git $SHAREDHOMEDIR/src/blaspp fi -rm -rf $HOME/src/blaspp-adastra-gpu-build -CXX=$(which CC) cmake -S $HOME/src/blaspp -B $HOME/src/blaspp-adastra-gpu-build -Duse_openmp=OFF -Dgpu_backend=hip -DCMAKE_CXX_STANDARD=17 -DCMAKE_INSTALL_PREFIX=${SW_DIR}/blaspp-master -cmake --build $HOME/src/blaspp-adastra-gpu-build --target install --parallel 16 -rm -rf $HOME/src/blaspp-adastra-gpu-build +rm -rf $SHAREDHOMEDIR/src/blaspp-adastra-gpu-build +CXX=$(which CC) cmake -S $SHAREDHOMEDIR/src/blaspp -B $SHAREDHOMEDIR/src/blaspp-adastra-gpu-build -Duse_openmp=OFF -Dgpu_backend=hip -DCMAKE_CXX_STANDARD=17 -DCMAKE_INSTALL_PREFIX=${SW_DIR}/blaspp-master +cmake --build $SHAREDHOMEDIR/src/blaspp-adastra-gpu-build --target install --parallel 16 +rm -rf $SHAREDHOMEDIR/src/blaspp-adastra-gpu-build # LAPACK++ (for PSATD+RZ) -if [ -d $HOME/src/lapackpp ] +if [ -d $SHAREDHOMEDIR/src/lapackpp ] then - cd $HOME/src/lapackpp + cd $SHAREDHOMEDIR/src/lapackpp git fetch --prune git checkout master git pull cd - else - git clone https://github.com/icl-utk-edu/lapackpp.git $HOME/src/lapackpp + git clone https://github.com/icl-utk-edu/lapackpp.git $SHAREDHOMEDIR/src/lapackpp fi -rm -rf $HOME/src/lapackpp-adastra-gpu-build -CXX=$(which CC) CXXFLAGS="-DLAPACK_FORTRAN_ADD_" cmake -S $HOME/src/lapackpp -B $HOME/src/lapackpp-adastra-gpu-build -DCMAKE_CXX_STANDARD=17 -Dbuild_tests=OFF -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON -DCMAKE_INSTALL_PREFIX=${SW_DIR}/lapackpp-master -cmake --build $HOME/src/lapackpp-adastra-gpu-build --target install --parallel 16 -rm -rf $HOME/src/lapackpp-adastra-gpu-build +rm -rf $SHAREDHOMEDIR/src/lapackpp-adastra-gpu-build +CXX=$(which CC) CXXFLAGS="-DLAPACK_FORTRAN_ADD_" cmake -S $SHAREDHOMEDIR/src/lapackpp -B $SHAREDHOMEDIR/src/lapackpp-adastra-gpu-build -DCMAKE_CXX_STANDARD=17 -Dbuild_tests=OFF -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON -DCMAKE_INSTALL_PREFIX=${SW_DIR}/lapackpp-master +cmake --build $SHAREDHOMEDIR/src/lapackpp-adastra-gpu-build --target install --parallel 16 +rm -rf $SHAREDHOMEDIR/src/lapackpp-adastra-gpu-build # c-blosc (I/O compression, for OpenPMD) -if [ -d $HOME/src/c-blosc ] +if [ -d $SHAREDHOMEDIR/src/c-blosc ] then # git repository is already there : else - git clone -b v1.21.1 https://github.com/Blosc/c-blosc.git $HOME/src/c-blosc + git clone -b v1.21.1 https://github.com/Blosc/c-blosc.git $SHAREDHOMEDIR/src/c-blosc fi -rm -rf $HOME/src/c-blosc-ad-build -cmake -S $HOME/src/c-blosc -B $HOME/src/c-blosc-ad-build -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DDEACTIVATE_AVX2=OFF -DCMAKE_INSTALL_PREFIX=${HOME}/sw/adastra/gpu/c-blosc-1.21.1 -cmake --build $HOME/src/c-blosc-ad-build --target install --parallel 16 -rm -rf $HOME/src/c-blosc-ad-build +rm -rf $SHAREDHOMEDIR/src/c-blosc-ad-build +cmake -S $SHAREDHOMEDIR/src/c-blosc -B $SHAREDHOMEDIR/src/c-blosc-ad-build -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DDEACTIVATE_AVX2=OFF -DCMAKE_INSTALL_PREFIX=${SW_DIR}/c-blosc-1.21.1 +cmake --build $SHAREDHOMEDIR/src/c-blosc-ad-build --target install --parallel 16 +rm -rf $SHAREDHOMEDIR/src/c-blosc-ad-build # ADIOS2 v. 
2.8.3 (for OpenPMD) -if [ -d $HOME/src/adios2 ] +if [ -d $SHAREDHOMEDIR/src/adios2 ] then # git repository is already there : else - git clone -b v2.8.3 https://github.com/ornladios/ADIOS2.git $HOME/src/adios2 + git clone -b v2.8.3 https://github.com/ornladios/ADIOS2.git $SHAREDHOMEDIR/src/adios2 fi -rm -rf $HOME/src/adios2-ad-build -cmake -S $HOME/src/adios2 -B $HOME/src/adios2-ad-build -DADIOS2_USE_Blosc=ON -DADIOS2_USE_Fortran=OFF -DADIOS2_USE_Python=OFF -DADIOS2_USE_ZeroMQ=OFF -DCMAKE_INSTALL_PREFIX=${HOME}/sw/adastra/gpu/adios2-2.8.3 -cmake --build $HOME/src/adios2-ad-build --target install -j 16 -rm -rf $HOME/src/adios2-ad-build +rm -rf $SHAREDHOMEDIR/src/adios2-ad-build +cmake -S $SHAREDHOMEDIR/src/adios2 -B $SHAREDHOMEDIR/src/adios2-ad-build -DADIOS2_USE_Blosc=ON -DADIOS2_USE_Fortran=OFF -DADIOS2_USE_Python=OFF -DADIOS2_USE_ZeroMQ=OFF -DCMAKE_INSTALL_PREFIX=${SW_DIR}/adios2-2.8.3 +cmake --build $SHAREDHOMEDIR/src/adios2-ad-build --target install -j 16 +rm -rf $SHAREDHOMEDIR/src/adios2-ad-build # Python ###################################################################### @@ -114,9 +114,9 @@ python3 -m pip install --upgrade openpmd-api python3 -m pip install --upgrade matplotlib python3 -m pip install --upgrade yt # install or update WarpX dependencies such as picmistandard -python3 -m pip install --upgrade -r $HOME/src/warpx/requirements.txt +python3 -m pip install --upgrade -r $SHAREDHOMEDIR/src/warpx/requirements.txt # optional: for libEnsemble -python3 -m pip install -r $HOME/src/warpx/Tools/LibEnsemble/requirements.txt +python3 -m pip install -r $SHAREDHOMEDIR/src/warpx/Tools/LibEnsemble/requirements.txt # optional: for optimas (based on libEnsemble & ax->botorch->gpytorch->pytorch) #python3 -m pip install --upgrade torch --index-url https://download.pytorch.org/whl/rocm5.4.2 -#python3 -m pip install -r $HOME/src/warpx/Tools/optimas/requirements.txt +#python3 -m pip install -r $SHAREDHOMEDIR/src/warpx/Tools/optimas/requirements.txt diff --git a/Tools/machines/adastra-cines/submit.sh b/Tools/machines/adastra-cines/submit.sh index 0cb75e86e69..15a2b292b58 100644 --- a/Tools/machines/adastra-cines/submit.sh +++ b/Tools/machines/adastra-cines/submit.sh @@ -1,22 +1,26 @@ #!/bin/bash -#SBATCH --job-name=warpx #SBATCH --account= +#SBATCH --job-name=warpx #SBATCH --constraint=MI250 -#SBATCH --ntasks-per-node=8 --cpus-per-task=8 --gpus-per-node=8 -#SBATCH --threads-per-core=1 # --hint=nomultithread +#SBATCH --nodes=2 #SBATCH --exclusive #SBATCH --output=%x-%j.out #SBATCH --time=00:10:00 -#SBATCH --nodes=2 module purge -# Architecture +# A CrayPE environment version +module load cpe/23.12 +# An architecture module load craype-accel-amd-gfx90a craype-x86-trento # A compiler to target the architecture module load PrgEnv-cray # Some architecture related libraries and tools -module load amd-mixed +module load CCE-GPU-3.0.0 +module load amd-mixed/5.2.3 + +date +module list export MPICH_GPU_SUPPORT_ENABLED=1 @@ -36,4 +40,5 @@ export OMP_NUM_THREADS=1 export WARPX_NMPI_PER_NODE=8 export TOTAL_NMPI=$(( ${SLURM_JOB_NUM_NODES} * ${WARPX_NMPI_PER_NODE} )) srun -N${SLURM_JOB_NUM_NODES} -n${TOTAL_NMPI} --ntasks-per-node=${WARPX_NMPI_PER_NODE} \ + --cpus-per-task=8 --threads-per-core=1 --gpu-bind=closest \ ./warpx inputs > output.txt From d8df8f60f0dc584202165d62ac37d0947cb8eb3d Mon Sep 17 00:00:00 2001 From: David Grote Date: Thu, 1 Feb 2024 11:37:59 -0800 Subject: [PATCH 02/13] Fix doc for doChargeConservingDepositionShapeNImplicit (#4658) --- Source/Particles/Deposition/CurrentDeposition.H | 4 
+++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Source/Particles/Deposition/CurrentDeposition.H b/Source/Particles/Deposition/CurrentDeposition.H
index efe6efcc788..5d1055278b2 100644
--- a/Source/Particles/Deposition/CurrentDeposition.H
+++ b/Source/Particles/Deposition/CurrentDeposition.H
@@ -1142,9 +1142,11 @@ void doEsirkepovDepositionShapeN (const GetParticlePosition& GetPosition,
  * particles positions are determined and in how the particle gamma is calculated.
  *
  * \tparam depos_order deposition order
+ * \param xp_n,yp_n,zp_n Pointer to arrays of particle position at time level n.
  * \param GetPosition A functor for returning the particle position.
  * \param wp Pointer to array of particle weights.
- * \param uxp,uyp,uzp Pointer to arrays of particle momentum.
+ * \param uxp_n,uyp_n,uzp_n Pointer to arrays of particle momentum at time level n.
+ * \param uxp_nph,uyp_nph,uzp_nph Pointer to arrays of particle momentum at time level n + 1/2.
  * \param ion_lev Pointer to array of particle ionization level. This is required to have
          the charge of each macroparticle since q is a scalar. For non-ionizable species,

From 282ae836de82ce38e31a79eda06564aaa92beb81 Mon Sep 17 00:00:00 2001
From: Harmen Stoppels
Date: Thu, 1 Feb 2024 21:44:20 +0100
Subject: [PATCH 03/13] Add WarpX_CCACHE Option (#4637)

WarpX autodetects `ccache` and uses it, and there's nothing you can do
about it. This PR turns that into an explicit option, letting developers
control it through `-DWarpX_CCACHE:BOOL=ON/OFF` (it defaults to off when
WarpX is built as a subproject in a superbuild).

The reason for this is mostly related to compiler wrappers, of which
there are many... If `g++` is a compiler wrapper, then `ccache g++`
cannot see the effective flags passed to the underlying real `g++`.
That eventually leads to false positive cache hits, which are a pain to
debug. To fix that, for compiler wrappers you want the `g++` wrapper
itself to invoke `ccache`, which is for example how Spack handles it.

Further, if you use Spack with ccache enabled, the defaults of WarpX
cause `ccache` to be invoked twice (inner & outer), doubling the cache
requirements.

Finally, compiler wrappers that handle ccache themselves may set further
ccache options/flags that WarpX does not set, such as disabling hashing
of the build dir -- without that option the cache may be useless.

* Update Order and Docs

---
 CMakeLists.txt                | 11 ++++++++++-
 Docs/source/install/cmake.rst |  2 +-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a0b28c9f86..76a5ecdd3f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,7 +43,15 @@ set_cxx17_superbuild()
 # this is an optional tool that stores compiled object files; allows fast
 # re-builds even with "make clean" in between. Mainly used to store AMReX
 # objects
-set_ccache()
+if(CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR)
+    set(WarpX_CCACHE_DEFAULT ON)
+else()
+    set(WarpX_CCACHE_DEFAULT OFF)  # we are a subproject in a superbuild
+endif()
+option(WarpX_CCACHE "Enable ccache for faster rebuilds" ${WarpX_CCACHE_DEFAULT})
+if(WarpX_CCACHE)
+    set_ccache()
+endif()

 # Output Directories ##########################################################
@@ -149,6 +157,7 @@ endif()
 # this defines the variable BUILD_TESTING which is ON by default
 #include(CTest)
+
 # Dependencies ################################################################
 #
diff --git a/Docs/source/install/cmake.rst b/Docs/source/install/cmake.rst
index d1dc25cf095..0882efd7fe2 100644
--- a/Docs/source/install/cmake.rst
+++ b/Docs/source/install/cmake.rst
@@ -116,7 +116,7 @@ By default, the most important dependencies of WarpX are automatically downloade
 CMake Option                  Default & Values                               Description
 ============================= ============================================== ===========================================================
 ``BUILD_SHARED_LIBS``         ON/**OFF**                                     `Build shared libraries for dependencies `__
-``CCACHE_PROGRAM``            First found ``ccache`` executable.             Set to ``-DCCACHE_PROGRAM=NO`` to disable CCache.
+``WarpX_CCACHE``              **ON**/OFF                                     Search and use CCache to speed up rebuilds.
 ``AMReX_CUDA_PTX_VERBOSE``    ON/**OFF**                                     Print CUDA code generation statistics from ``ptxas``.
 ``WarpX_amrex_src``           *None*                                         Path to AMReX source directory (preferred if set)
 ``WarpX_amrex_repo``          ``https://github.com/AMReX-Codes/amrex.git``   Repository URI to pull and build AMReX from

From 7a7c704ec0d34c46634452a49c07dcd34c1e3c13 Mon Sep 17 00:00:00 2001
From: Luca Fedeli
Date: Fri, 2 Feb 2024 00:17:36 +0100
Subject: [PATCH 04/13] Update profile and job script for LUMI supercomputer
 (#4634)

* update LUMI profile and LUMI job script
* add advice to run on dev-g
* update job script and profile

---
 .../lumi-csc/lumi_warpx.profile.example | 20 +++++++++-------
 Tools/machines/lumi-csc/submit.sh       | 24 +++++++++++++++----
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/Tools/machines/lumi-csc/lumi_warpx.profile.example b/Tools/machines/lumi-csc/lumi_warpx.profile.example
index 74b8aa8df17..2cb44035ce4 100644
--- a/Tools/machines/lumi-csc/lumi_warpx.profile.example
+++ b/Tools/machines/lumi-csc/lumi_warpx.profile.example
@@ -2,9 +2,9 @@
 #export proj=

 # required dependencies
-module load LUMI/23.03 partition/G
+module load LUMI/23.09 partition/G
 module load rocm/5.2.3 # waiting for 5.5 for next bump
-module load buildtools/23.03
+module load buildtools/23.09

 # optional: just an additional text editor
 module load nano
@@ -16,16 +16,16 @@ export LD_LIBRARY_PATH=${HOME}/sw/lumi/gpu/blaspp-master/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${HOME}/sw/lumi/gpu/lapackpp-master/lib64:$LD_LIBRARY_PATH

 # optional: for QED lookup table generation support
-module load Boost/1.81.0-cpeCray-23.03
+module load Boost/1.82.0-cpeCray-23.09

 # optional: for openPMD support
-module load cray-hdf5/1.12.2.3
+module load cray-hdf5/1.12.2.7
 export CMAKE_PREFIX_PATH=${HOME}/sw/lumi/gpu/c-blosc-1.21.1:$CMAKE_PREFIX_PATH
 export CMAKE_PREFIX_PATH=${HOME}/sw/lumi/gpu/adios2-2.8.3:$CMAKE_PREFIX_PATH
 export PATH=${HOME}/sw/lumi/gpu/adios2-2.8.3/bin:${PATH}

 # optional: for Python bindings or libEnsemble
-module load cray-python/3.9.13.1
+module load cray-python/3.10.10

 # an alias to request an interactive batch node for one hour
 # for parallel execution, start on the batch node: srun
@@ -41,9 +41,13 @@ export MPICH_GPU_SUPPORT_ENABLED=1
 export AMREX_AMD_ARCH=gfx90a

 # compiler environment hints
-export CC=$(which cc)
-export CXX=$(which CC)
-export FC=$(which ftn)
+# Warning: using the compiler wrappers cc and CC
+#          instead of amdclang and amdclang++
+#          currently results in a significant
+#          loss of performance
+export CC=$(which amdclang)
+export CXX=$(which amdclang++)
+export FC=$(which amdflang)
 export CFLAGS="-I${ROCM_PATH}/include"
 export CXXFLAGS="-I${ROCM_PATH}/include -Wno-pass-failed"
 export LDFLAGS="-L${ROCM_PATH}/lib -lamdhip64"
diff --git a/Tools/machines/lumi-csc/submit.sh b/Tools/machines/lumi-csc/submit.sh
index f6be702300a..d784471acd5 100644
--- a/Tools/machines/lumi-csc/submit.sh
+++ b/Tools/machines/lumi-csc/submit.sh
@@ -10,7 +10,7 @@
 #SBATCH --gpus-per-node=8
 #SBATCH --time=00:10:00

-export MPICH_GPU_SUPPORT_ENABLED=1
+date

 # note (12-12-22)
 # this environment setting is currently needed on LUMI to work-around a
@@ -30,7 +30,11 @@ export FI_MR_CACHE_MONITOR=memhooks  # alternative cache monitor
 # the home directory, which does not scale.
 export ROCFFT_RTC_CACHE_PATH=/dev/null

-export OMP_NUM_THREADS=1
+# Seen since August 2023
+# OLCFDEV-1597: OFI Poll Failed UNDELIVERABLE Errors
+# https://docs.olcf.ornl.gov/systems/frontier_user_guide.html#olcfdev-1597-ofi-poll-failed-undeliverable-errors
+export MPICH_SMP_SINGLE_COPY_MODE=NONE
+export FI_CXI_RX_MATCH_MODE=software

 # LUMI documentation suggests using the following wrapper script
 # to set the ROCR_VISIBLE_DEVICES to the value of SLURM_LOCALID
@@ -47,9 +51,21 @@ chmod +x ./select_gpu
 sleep 1

 # LUMI documentation suggests using the following CPU bind
-# so that the node local rank and GPU ID match
+# in order to have 6 threads per GPU (blosc compression in adios2 uses threads)
 # see https://docs.lumi-supercomputer.eu/runjobs/scheduled-jobs/lumig-job/
-CPU_BIND="map_cpu:48,56,16,24,1,8,32,40"
+#
+# WARNING: the following CPU_BIND options don't work on the dev-g partition.
+#          If you want to run your simulation on dev-g, please comment them
+#          out and replace them with CPU_BIND="map_cpu:49,57,17,25,1,9,33,41"
+#
+CPU_BIND="mask_cpu:7e000000000000,7e00000000000000"
+CPU_BIND="${CPU_BIND},7e0000,7e000000"
+CPU_BIND="${CPU_BIND},7e,7e00"
+CPU_BIND="${CPU_BIND},7e00000000,7e0000000000"
+
+export OMP_NUM_THREADS=6
+
+export MPICH_GPU_SUPPORT_ENABLED=1

 srun --cpu-bind=${CPU_BIND} ./select_gpu ./warpx inputs | tee outputs.txt
 rm -rf ./select_gpu

From 1093f618d9efb5d0971fa609ed24758364a17caa Mon Sep 17 00:00:00 2001
From: Axel Huebl
Date: Thu, 1 Feb 2024 16:58:27 -0800
Subject: [PATCH 05/13] Fix BTD/Scrape Flush Count with Filters (#4657)

* Fix BTD/Scrape Flush Count with Filters

Move the counting of already-flushed particles for writers that call the
I/O backends multiple times per data set, e.g., BTD and boundary
scraping, into the I/O backend. Filtering is done as the first step in
the I/O backends, and thus the previous count, taken outside of the I/O
backends, was over-counting particles that might still get filtered out.

The offset should be a `long`: the overflow risk is very high for a pure
`int`. Also, the counter is `unsigned`, so `unsigned long` for now.

* Simplify: Remove `m_totalParticles_flushed_already`

Less state we can forget in checkpoint-restart and that we have to
transfer across API boundaries.
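
In short, the backends now recover the offset from the file itself
instead of carrying it as diagnostics state. A condensed sketch of the
logic this patch adds (the full code is in `OpenPMDHelpFunction.cpp`
and `WarpXOpenPMD.cpp` in the diff below):

```cpp
#include <openPMD/openPMD.hpp>

// The number of particles already flushed for a species is recovered from
// the declared extent of its "id" record, so no running counter needs to
// be kept (or checkpointed) in the diagnostics classes.
unsigned long
num_already_flushed (openPMD::ParticleSpecies & currSpecies)
{
    const auto *const scalar = openPMD::RecordComponent::SCALAR;
    unsigned long ParticleFlushOffset = 0;
    if (currSpecies.contains("id") &&
        currSpecies["id"].contains(scalar) &&
        !currSpecies["id"][scalar].empty())
    {
        ParticleFlushOffset = currSpecies["id"][scalar].getExtent().at(0);
    }
    return ParticleFlushOffset;
}

// At the write site: only BTD flushes the same iteration multiple times,
// so every other writer starts at offset zero.
//   unsigned long ParticleFlushOffset =
//       isBTD ? num_already_flushed(currSpecies) : 0;
```
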
--- Source/Diagnostics/BTDiagnostics.H | 2 - Source/Diagnostics/BTDiagnostics.cpp | 31 ++++-------- .../BoundaryScrapingDiagnostics.cpp | 17 ++----- Source/Diagnostics/Diagnostics.H | 5 -- Source/Diagnostics/FlushFormats/FlushFormat.H | 3 +- .../FlushFormats/FlushFormatAscent.H | 3 +- .../FlushFormats/FlushFormatAscent.cpp | 2 +- .../FlushFormats/FlushFormatCheckpoint.H | 3 +- .../FlushFormats/FlushFormatCheckpoint.cpp | 2 +- .../FlushFormats/FlushFormatOpenPMD.H | 3 +- .../FlushFormats/FlushFormatOpenPMD.cpp | 4 +- .../FlushFormats/FlushFormatPlotfile.H | 3 +- .../FlushFormats/FlushFormatPlotfile.cpp | 6 +-- .../FlushFormats/FlushFormatSensei.H | 3 +- .../FlushFormats/FlushFormatSensei.cpp | 7 ++- Source/Diagnostics/FullDiagnostics.cpp | 5 +- Source/Diagnostics/OpenPMDHelpFunction.H | 18 +++++++ Source/Diagnostics/OpenPMDHelpFunction.cpp | 20 ++++++++ Source/Diagnostics/WarpXOpenPMD.H | 8 ++- Source/Diagnostics/WarpXOpenPMD.cpp | 50 ++++++++----------- 20 files changed, 94 insertions(+), 101 deletions(-) diff --git a/Source/Diagnostics/BTDiagnostics.H b/Source/Diagnostics/BTDiagnostics.H index ab894da69c2..f6c44c777ea 100644 --- a/Source/Diagnostics/BTDiagnostics.H +++ b/Source/Diagnostics/BTDiagnostics.H @@ -399,8 +399,6 @@ private: lab-frame data. */ void InitializeParticleFunctors () override; - /** Update total number of particles flushed for all species for ith snapshot */ - void UpdateTotalParticlesFlushed(int i_buffer); /** Reset total number of particles in the particle buffer to 0 for ith snapshot */ void ResetTotalParticlesInBuffer(int i_buffer); /** Clear particle data stored in the particle buffer */ diff --git a/Source/Diagnostics/BTDiagnostics.cpp b/Source/Diagnostics/BTDiagnostics.cpp index 0e517e8190c..f7965cd2688 100644 --- a/Source/Diagnostics/BTDiagnostics.cpp +++ b/Source/Diagnostics/BTDiagnostics.cpp @@ -129,7 +129,6 @@ void BTDiagnostics::DerivedInitData () } } m_particles_buffer.resize(m_num_buffers); - m_totalParticles_flushed_already.resize(m_num_buffers); m_totalParticles_in_buffer.resize(m_num_buffers); // check that simulation can fill all BTD snapshots @@ -1065,12 +1064,12 @@ BTDiagnostics::Flush (int i_buffer, bool force_flush) } } m_flush_format->WriteToFile( - m_varnames, m_mf_output[i_buffer], m_geom_output[i_buffer], warpx.getistep(), - labtime, m_output_species[i_buffer], nlev_output, file_name, m_file_min_digits, + m_varnames, m_mf_output.at(i_buffer), m_geom_output.at(i_buffer), warpx.getistep(), + labtime, + m_output_species.at(i_buffer), nlev_output, file_name, m_file_min_digits, m_plot_raw_fields, m_plot_raw_fields_guards, - use_pinned_pc, isBTD, i_buffer, m_buffer_flush_counter[i_buffer], - m_max_buffer_multifabs[i_buffer], m_geom_snapshot[i_buffer][0], isLastBTDFlush, - m_totalParticles_flushed_already[i_buffer]); + use_pinned_pc, isBTD, i_buffer, m_buffer_flush_counter.at(i_buffer), + m_max_buffer_multifabs.at(i_buffer), m_geom_snapshot.at(i_buffer).at(0), isLastBTDFlush); // Rescaling the box for plotfile after WriteToFile. This is because, for plotfiles, when writing particles, amrex checks if the particles are within the bounds defined by the box. However, in BTD, particles can be (at max) 1 cell outside the bounds of the geometry. So we keep a one-cell bigger box for plotfile when writing out the particle data and rescale after. 
if (m_format == "plotfile") { @@ -1104,7 +1103,6 @@ BTDiagnostics::Flush (int i_buffer, bool force_flush) NullifyFirstFlush(i_buffer); // if particles are selected for output then update and reset counters if (!m_output_species_names.empty()) { - UpdateTotalParticlesFlushed(i_buffer); ResetTotalParticlesInBuffer(i_buffer); ClearParticleBuffer(i_buffer); } @@ -1271,10 +1269,10 @@ void BTDiagnostics::MergeBuffersForPlotfile (int i_snapshot) InterleaveSpeciesHeader(recent_species_Header,snapshot_species_Header, m_output_species_names[i], m_buffer_flush_counter[i_snapshot]); if (BufferSpeciesHeader.m_total_particles == 0) { continue; } - if (m_totalParticles_flushed_already[i_snapshot][i]==0) { - WARPX_ALWAYS_ASSERT_WITH_MESSAGE( - std::rename(recent_ParticleHdrFilename.c_str(), snapshot_ParticleHdrFilename.c_str()) == 0, - std::string("Renaming ").append(recent_ParticleHdrFilename).append(" to ").append(snapshot_ParticleHdrFilename).append(" has failed")); + if (!amrex::FileExists(snapshot_ParticleHdrFilename)) { + WARPX_ALWAYS_ASSERT_WITH_MESSAGE( + std::rename(recent_ParticleHdrFilename.c_str(), snapshot_ParticleHdrFilename.c_str()) == 0, + std::string("Renaming ").append(recent_ParticleHdrFilename).append(" to ").append(snapshot_ParticleHdrFilename).append(" has failed")); } else { InterleaveParticleDataHeader(recent_ParticleHdrFilename, snapshot_ParticleHdrFilename); @@ -1435,10 +1433,8 @@ BTDiagnostics::InitializeParticleBuffer () const MultiParticleContainer& mpc = warpx.GetPartContainer(); for (int i = 0; i < m_num_buffers; ++i) { m_particles_buffer[i].resize(m_output_species_names.size()); - m_totalParticles_flushed_already[i].resize(m_output_species_names.size()); m_totalParticles_in_buffer[i].resize(m_output_species_names.size()); for (int isp = 0; isp < m_particles_buffer[i].size(); ++isp) { - m_totalParticles_flushed_already[i][isp] = 0; m_totalParticles_in_buffer[i][isp] = 0; m_particles_buffer[i][isp] = std::make_unique(WarpX::GetInstance().GetParGDB()); const int idx = mpc.getSpeciesID(m_output_species_names[isp]); @@ -1489,15 +1485,6 @@ BTDiagnostics::PrepareParticleDataForOutput() } } -void -BTDiagnostics::UpdateTotalParticlesFlushed(int i_buffer) -{ - for (int isp = 0; isp < m_totalParticles_flushed_already[i_buffer].size(); ++isp) { - m_totalParticles_flushed_already[i_buffer][isp] += static_cast( - m_particles_buffer[i_buffer][isp]->TotalNumberOfParticles()); - } -} - void BTDiagnostics::ResetTotalParticlesInBuffer(int i_buffer) { diff --git a/Source/Diagnostics/BoundaryScrapingDiagnostics.cpp b/Source/Diagnostics/BoundaryScrapingDiagnostics.cpp index c85dbd6b226..11ffce02f09 100644 --- a/Source/Diagnostics/BoundaryScrapingDiagnostics.cpp +++ b/Source/Diagnostics/BoundaryScrapingDiagnostics.cpp @@ -102,15 +102,6 @@ BoundaryScrapingDiagnostics::InitializeParticleBuffer () m_output_species[i_buffer].push_back(ParticleDiag(m_diag_name, species_name, pc, bnd_buffer)); } } - // Initialize total number of particles flushed - m_totalParticles_flushed_already.resize(m_num_buffers); - for (int i_buffer = 0; i_buffer < m_num_buffers; ++i_buffer) { - int const n_species = static_cast(m_output_species_names.size()); - m_totalParticles_flushed_already[i_buffer].resize(n_species); - for (int i_species=0; i_speciesWriteToFile( - m_varnames, m_mf_output[i_buffer], m_geom_output[i_buffer], warpx.getistep(), - warpx.gett_new(0), m_output_species[i_buffer], nlev_output, file_prefix, + m_varnames, m_mf_output.at(i_buffer), m_geom_output.at(i_buffer), warpx.getistep(), + warpx.gett_new(0), + 
m_output_species.at(i_buffer), + nlev_output, file_prefix, m_file_min_digits, false, false, use_pinned_pc, isBTD, warpx.getistep(0), bufferID, numBTDBuffers, geom, - isLastBTD, m_totalParticles_flushed_already[i_buffer]); + isLastBTD); // Now that the data has been written out, clear out the buffer particle_buffer.clearParticles(i_buffer); diff --git a/Source/Diagnostics/Diagnostics.H b/Source/Diagnostics/Diagnostics.H index 53ce319d747..c0d2a9f0d53 100644 --- a/Source/Diagnostics/Diagnostics.H +++ b/Source/Diagnostics/Diagnostics.H @@ -309,11 +309,6 @@ protected: /** Vector of pointers to functors to compute particle output per species*/ amrex::Vector< std::unique_ptr > m_all_particle_functors; - /** Vector of total number of particles previously flushed, per species, per snapshot. - * The first vector is for total number of snapshots and second vector loops - * over the total number of species selected for diagnostics. - */ - amrex::Vector< amrex::Vector > m_totalParticles_flushed_already; /** Vector of total number of particles in the buffer, per species, per snapshot. * The first vector is for total number of snapshots and second vector loops * over the total number of species selected for diagnostics. diff --git a/Source/Diagnostics/FlushFormats/FlushFormat.H b/Source/Diagnostics/FlushFormats/FlushFormat.H index 403e9df7857..65741e4ff20 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormat.H +++ b/Source/Diagnostics/FlushFormats/FlushFormat.H @@ -24,8 +24,7 @@ public: bool isBTD = false, int snapshotID = -1, int bufferID = 1, int numBuffers = 1, const amrex::Geometry& full_BTD_snapshot = amrex::Geometry(), - bool isLastBTDFlush = false, - const amrex::Vector& totalParticlesFlushedAlready = amrex::Vector() ) const = 0; + bool isLastBTDFlush = false) const = 0; FlushFormat () = default; virtual ~FlushFormat() = default; diff --git a/Source/Diagnostics/FlushFormats/FlushFormatAscent.H b/Source/Diagnostics/FlushFormats/FlushFormatAscent.H index 228e4bc5cf6..9d8d3fcd7d2 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormatAscent.H +++ b/Source/Diagnostics/FlushFormats/FlushFormatAscent.H @@ -41,8 +41,7 @@ public: bool isBTD = false, int snapshotID = -1, int bufferID = 1, int numBuffers = 1, const amrex::Geometry& full_BTD_snapshot = amrex::Geometry(), - bool isLastBTDFlush = false, - const amrex::Vector& totalParticlesFlushedAlready = amrex::Vector() ) const override; + bool isLastBTDFlush = false ) const override; #ifdef AMREX_USE_ASCENT /** \brief Do in-situ visualization for particle data. 
diff --git a/Source/Diagnostics/FlushFormats/FlushFormatAscent.cpp b/Source/Diagnostics/FlushFormats/FlushFormatAscent.cpp index 980047e3b46..abfba37cd15 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormatAscent.cpp +++ b/Source/Diagnostics/FlushFormats/FlushFormatAscent.cpp @@ -21,7 +21,7 @@ FlushFormatAscent::WriteToFile ( const bool /*use_pinned_pc*/, bool isBTD, int /*snapshotID*/, int /*bufferID*/, int /*numBuffers*/, const amrex::Geometry& /*full_BTD_snapshot*/, - bool /*isLastBTDFlush*/, const amrex::Vector& /* totalParticlesFlushedAlready*/) const + bool /*isLastBTDFlush*/) const { #ifdef AMREX_USE_ASCENT WARPX_PROFILE("FlushFormatAscent::WriteToFile()"); diff --git a/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.H b/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.H index f6aad226d75..5c26ac97f61 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.H +++ b/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.H @@ -28,8 +28,7 @@ class FlushFormatCheckpoint final : public FlushFormatPlotfile bool isBTD = false, int snapshotID = -1, int bufferID = 1, int numBuffers = 1, const amrex::Geometry& full_BTD_snapshot = amrex::Geometry(), - bool isLastBTDFlush = false, - const amrex::Vector& totalParticlesFlushedAlready = amrex::Vector() ) const final; + bool isLastBTDFlush = false) const final; void CheckpointParticles (const std::string& dir, const amrex::Vector& particle_diags) const; diff --git a/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.cpp b/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.cpp index 5f59cd723da..d77437fb931 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.cpp +++ b/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.cpp @@ -39,7 +39,7 @@ FlushFormatCheckpoint::WriteToFile ( bool /*isBTD*/, int /*snapshotID*/, int /*bufferID*/, int /*numBuffers*/, const amrex::Geometry& /*full_BTD_snapshot*/, - bool /*isLastBTDFlush*/, const amrex::Vector& /* totalParticlesFlushedAlready*/) const + bool /*isLastBTDFlush*/) const { WARPX_PROFILE("FlushFormatCheckpoint::WriteToFile()"); diff --git a/Source/Diagnostics/FlushFormats/FlushFormatOpenPMD.H b/Source/Diagnostics/FlushFormats/FlushFormatOpenPMD.H index 88380407f5e..141760ac2a3 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormatOpenPMD.H +++ b/Source/Diagnostics/FlushFormats/FlushFormatOpenPMD.H @@ -40,8 +40,7 @@ public: bool isBTD = false, int snapshotID = -1, int bufferID = 1, int numBuffers = 1, const amrex::Geometry& full_BTD_snapshot = amrex::Geometry(), - bool isLastBTDFlush = false, - const amrex::Vector& totalParticlesFlushedAlready = amrex::Vector() ) const override; + bool isLastBTDFlush = false ) const override; ~FlushFormatOpenPMD () override = default; diff --git a/Source/Diagnostics/FlushFormats/FlushFormatOpenPMD.cpp b/Source/Diagnostics/FlushFormats/FlushFormatOpenPMD.cpp index 3b7006243e7..e0c8c4ef2d6 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormatOpenPMD.cpp +++ b/Source/Diagnostics/FlushFormats/FlushFormatOpenPMD.cpp @@ -126,7 +126,7 @@ FlushFormatOpenPMD::WriteToFile ( const bool use_pinned_pc, bool isBTD, int snapshotID, int bufferID, int numBuffers, const amrex::Geometry& full_BTD_snapshot, - bool isLastBTDFlush, const amrex::Vector& totalParticlesFlushedAlready) const + bool isLastBTDFlush) const { WARPX_PROFILE("FlushFormatOpenPMD::WriteToFile()"); const std::string& filename = amrex::Concatenate(prefix, iteration[0], file_min_digits); @@ -164,7 +164,7 @@ FlushFormatOpenPMD::WriteToFile ( // particles: all (reside only on 
locally finest level) m_OpenPMDPlotWriter->WriteOpenPMDParticles( - particle_diags, static_cast(time), use_pinned_pc, isBTD, isLastBTDFlush, totalParticlesFlushedAlready); + particle_diags, static_cast(time), use_pinned_pc, isBTD, isLastBTDFlush); // signal that no further updates will be written to this iteration m_OpenPMDPlotWriter->CloseStep(isBTD, isLastBTDFlush); diff --git a/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.H b/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.H index 486dcc3b5ee..c62056b8907 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.H +++ b/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.H @@ -35,8 +35,7 @@ public: bool isBTD = false, int snapshotID = -1, int bufferID = 1, int numBuffers = 1, const amrex::Geometry& full_BTD_snapshot = amrex::Geometry(), - bool isLastBTDFlush = false, - const amrex::Vector& totalParticlesFlushedAlready = amrex::Vector() ) const override; + bool isLastBTDFlush = false) const override; /** Write general info of the run into the plotfile */ void WriteJobInfo(const std::string& dir) const; diff --git a/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.cpp b/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.cpp index df73ed34c94..970d9a504d2 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.cpp +++ b/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.cpp @@ -65,7 +65,7 @@ FlushFormatPlotfile::WriteToFile ( const bool /*use_pinned_pc*/, bool isBTD, int snapshotID, int bufferID, int numBuffers, const amrex::Geometry& /*full_BTD_snapshot*/, - bool isLastBTDFlush, const amrex::Vector& /* totalParticlesFlushedAlready*/) const + bool isLastBTDFlush) const { WARPX_PROFILE("FlushFormatPlotfile::WriteToFile()"); auto & warpx = WarpX::GetInstance(); @@ -340,9 +340,9 @@ FlushFormatPlotfile::WriteWarpXHeader( void FlushFormatPlotfile::WriteParticles(const std::string& dir, const amrex::Vector& particle_diags, - const amrex::Real time, bool isBTD) const + const amrex::Real time, + bool isBTD) const { - for (const auto& part_diag : particle_diags) { WarpXParticleContainer* pc = part_diag.getParticleContainer(); PinnedMemoryParticleContainer* pinned_pc = part_diag.getPinnedParticleContainer(); diff --git a/Source/Diagnostics/FlushFormats/FlushFormatSensei.H b/Source/Diagnostics/FlushFormats/FlushFormatSensei.H index 54eb7099ba4..d2ec9a5a4e0 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormatSensei.H +++ b/Source/Diagnostics/FlushFormats/FlushFormatSensei.H @@ -61,8 +61,7 @@ public: bool isBTD = false, int snapshotID = -1, int bufferID = 1, int numBuffers = 1, const amrex::Geometry& full_BTD_snapshot = amrex::Geometry(), - bool isLastBTDFlush = false, - const amrex::Vector& totalParticlesFlushedAlready = amrex::Vector() ) const override; + bool isLastBTDFlush = false) const override; /** \brief Do in-situ visualization for particle data. * \param[in] particle_diags Each element of this vector handles output of 1 species. 
diff --git a/Source/Diagnostics/FlushFormats/FlushFormatSensei.cpp b/Source/Diagnostics/FlushFormats/FlushFormatSensei.cpp
index e162b8b3121..348e1da4a00 100644
--- a/Source/Diagnostics/FlushFormats/FlushFormatSensei.cpp
+++ b/Source/Diagnostics/FlushFormats/FlushFormatSensei.cpp
@@ -53,14 +53,13 @@ FlushFormatSensei::WriteToFile (
     bool plot_raw_fields,
     bool plot_raw_fields_guards, const bool use_pinned_pc,
     bool isBTD, int /*snapshotID*/, int /*bufferID*/, int /*numBuffers*/,
-    const amrex::Geometry& /*full_BTD_snapshot*/, bool /*isLastBTDFlush*/,
-    const amrex::Vector<int>& totalParticlesFlushedAlready) const
+    const amrex::Geometry& /*full_BTD_snapshot*/, bool /*isLastBTDFlush*/) const
 {
     amrex::ignore_unused(
         geom, nlev, prefix, file_min_digits,
         plot_raw_fields, plot_raw_fields_guards,
-        use_pinned_pc,
-        totalParticlesFlushedAlready);
+        use_pinned_pc
+    );

 #ifndef AMREX_USE_SENSEI_INSITU
     amrex::ignore_unused(varnames, mf, iteration, time, particle_diags,
diff --git a/Source/Diagnostics/FullDiagnostics.cpp b/Source/Diagnostics/FullDiagnostics.cpp
index 4f1e47a2a52..fd329a38220 100644
--- a/Source/Diagnostics/FullDiagnostics.cpp
+++ b/Source/Diagnostics/FullDiagnostics.cpp
@@ -133,8 +133,9 @@ FullDiagnostics::Flush ( int i_buffer, bool /* force_flush */ )
     auto & warpx = WarpX::GetInstance();

     m_flush_format->WriteToFile(
-        m_varnames, m_mf_output[i_buffer], m_geom_output[i_buffer], warpx.getistep(),
-        warpx.gett_new(0), m_output_species[i_buffer], nlev_output, m_file_prefix,
+        m_varnames, m_mf_output.at(i_buffer), m_geom_output.at(i_buffer), warpx.getistep(),
+        warpx.gett_new(0),
+        m_output_species.at(i_buffer), nlev_output, m_file_prefix,
         m_file_min_digits, m_plot_raw_fields, m_plot_raw_fields_guards);

     FlushRaw();
diff --git a/Source/Diagnostics/OpenPMDHelpFunction.H b/Source/Diagnostics/OpenPMDHelpFunction.H
index 9db4b9fb194..d2f2c4f9f9d 100644
--- a/Source/Diagnostics/OpenPMDHelpFunction.H
+++ b/Source/Diagnostics/OpenPMDHelpFunction.H
@@ -14,7 +14,25 @@

 #include <string>

+/** Determine the preferred file ending if unspecified
+ *
+ * @return file ending without the "."
+ */
 std::string
 WarpXOpenPMDFileType ();

+#ifdef WARPX_USE_OPENPMD
+/** Determine how many particles were already written in this species and step
+ *
+ * For a particle species, this checks the current size of the id attribute, if it
+ * exists; if it does, its extent is taken as the number of particles already on disk.
+ *
+ * Note that this checks the declared size, not necessarily the written size.
+ *
+ * @return existing extent of the "id" attribute or zero.
+ */ +unsigned long +num_already_flushed (openPMD::ParticleSpecies & currSpecies); +#endif + #endif // WARPX_OPENPMDHELPFUNCTION_H_ diff --git a/Source/Diagnostics/OpenPMDHelpFunction.cpp b/Source/Diagnostics/OpenPMDHelpFunction.cpp index a898c97b6b4..6170249b52b 100644 --- a/Source/Diagnostics/OpenPMDHelpFunction.cpp +++ b/Source/Diagnostics/OpenPMDHelpFunction.cpp @@ -27,3 +27,23 @@ WarpXOpenPMDFileType () #endif // WARPX_USE_OPENPMD return openPMDFileType; } + +#ifdef WARPX_USE_OPENPMD +unsigned long +num_already_flushed (openPMD::ParticleSpecies & currSpecies) +{ + const auto *const scalar = openPMD::RecordComponent::SCALAR; + + unsigned long ParticleFlushOffset = 0; + + if (currSpecies.contains("id")) { + if (currSpecies["id"].contains(scalar)) { + if (!currSpecies["id"][scalar].empty()) { + ParticleFlushOffset = currSpecies["id"][scalar].getExtent().at(0); + } + } + } + + return ParticleFlushOffset; +} +#endif diff --git a/Source/Diagnostics/WarpXOpenPMD.H b/Source/Diagnostics/WarpXOpenPMD.H index e3b7b893d0a..4597dacd9ae 100644 --- a/Source/Diagnostics/WarpXOpenPMD.H +++ b/Source/Diagnostics/WarpXOpenPMD.H @@ -125,8 +125,7 @@ public: amrex::Real time, bool use_pinned_pc = false, bool isBTD = false, - bool isLastBTDFlush = false, - const amrex::Vector& totalParticlesFlushedAlready = amrex::Vector()); + bool isLastBTDFlush = false); /** Write out all openPMD fields for all active MR levels * @@ -290,9 +289,9 @@ private: * @param[in] int_comp_names The int attribute names, from WarpX * @param[in] charge Charge of the particles (note: fix for ions) * @param[in] mass Mass of the particles + * @param[inout] ParticleFlushOffset previously flushed number of particles in BTD * @param[in] isBTD is this a backtransformed diagnostics (BTD) write? * @param[in] isLastBTDFlush is this the last time we will flush this BTD station? - * @param[in] ParticleFlushOffset previously flushed number of particles in BTD */ void DumpToFile (ParticleContainer* pc, const std::string& name, @@ -304,8 +303,7 @@ private: amrex::ParticleReal charge, amrex::ParticleReal mass, bool isBTD = false, - bool isLastBTDFlush = false, - int ParticleFlushOffset = 0); + bool isLastBTDFlush = false); /** Get the openPMD-api filename for openPMD::Series * diff --git a/Source/Diagnostics/WarpXOpenPMD.cpp b/Source/Diagnostics/WarpXOpenPMD.cpp index 71d96a47927..64411ecf6e4 100644 --- a/Source/Diagnostics/WarpXOpenPMD.cpp +++ b/Source/Diagnostics/WarpXOpenPMD.cpp @@ -519,9 +519,11 @@ WarpXOpenPMDPlot::Init (openPMD::Access access, bool isBTD) void WarpXOpenPMDPlot::WriteOpenPMDParticles (const amrex::Vector& particle_diags, - const amrex::Real time, const bool use_pinned_pc, - const bool isBTD, const bool isLastBTDFlush, - const amrex::Vector& totalParticlesFlushedAlready) + const amrex::Real time, + const bool use_pinned_pc, + const bool isBTD, + const bool isLastBTDFlush +) { WARPX_PROFILE("WarpXOpenPMDPlot::WriteOpenPMDParticles()"); @@ -618,31 +620,15 @@ for (unsigned i = 0, n = particle_diags.size(); i < n; ++i) { // real_names contains a list of all real particle attributes. // real_flags is 1 or 0, whether quantity is dumped or not. 
- { - if (isBTD) { - DumpToFile(&tmp, - particle_diags[i].getSpeciesName(), - m_CurrentStep, - real_flags, - int_flags, - real_names, int_names, - pc->getCharge(), pc->getMass(), - isBTD, isLastBTDFlush, - totalParticlesFlushedAlready[i] - ); - } else { - DumpToFile(&tmp, - particle_diags[i].getSpeciesName(), - m_CurrentStep, - real_flags, - int_flags, - real_names, int_names, - pc->getCharge(), pc->getMass(), - isBTD, isLastBTDFlush, - 0 - ); - } - } + DumpToFile(&tmp, + particle_diags.at(i).getSpeciesName(), + m_CurrentStep, + real_flags, + int_flags, + real_names, int_names, + pc->getCharge(), pc->getMass(), + isBTD, isLastBTDFlush + ); } } @@ -657,8 +643,9 @@ WarpXOpenPMDPlot::DumpToFile (ParticleContainer* pc, amrex::ParticleReal const charge, amrex::ParticleReal const mass, const bool isBTD, - const bool isLastBTDFlush, - int ParticleFlushOffset) { + const bool isLastBTDFlush +) +{ WARPX_ALWAYS_ASSERT_WITH_MESSAGE(m_Series != nullptr, "openPMD: series must be initialized"); AMREX_ALWAYS_ASSERT(write_real_comp.size() == pc->NumRealComps()); @@ -672,6 +659,9 @@ WarpXOpenPMDPlot::DumpToFile (ParticleContainer* pc, openPMD::Iteration currIteration = GetIteration(iteration, isBTD); openPMD::ParticleSpecies currSpecies = currIteration.particles[name]; + // only BTD writes multiple times into the same step, zero for other methods + unsigned long ParticleFlushOffset = isBTD ? num_already_flushed(currSpecies) : 0; + // prepare data structures the first time BTD has non-zero particles // we set some of them to zero extent, so we need to time that well bool const is_first_flush_with_particles = num_dump_particles > 0 && ParticleFlushOffset == 0; From 206b0815a060aa9ccbf0f71a46f1142137f5b8a8 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Thu, 1 Feb 2024 20:06:59 -0800 Subject: [PATCH 06/13] Release 24.02 (#4660) * AMReX: 24.02 * pyAMReX: 24.02 * WarpX: 24.02 --- .github/workflows/cuda.yml | 2 +- CMakeLists.txt | 2 +- Docs/source/conf.py | 4 ++-- Python/setup.py | 2 +- Regression/WarpX-GPU-tests.ini | 2 +- Regression/WarpX-tests.ini | 2 +- cmake/dependencies/AMReX.cmake | 4 ++-- cmake/dependencies/pyAMReX.cmake | 4 ++-- run_test.sh | 2 +- setup.py | 2 +- 10 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 9960cdfbb29..79916c455d1 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -115,7 +115,7 @@ jobs: which nvcc || echo "nvcc not in PATH!" git clone https://github.com/AMReX-Codes/amrex.git ../amrex - cd ../amrex && git checkout --detach 689144d157a0106faf3d0ae89f8d90b0250cf975 && cd - + cd ../amrex && git checkout --detach 24.02 && cd - make COMP=gcc QED=FALSE USE_MPI=TRUE USE_GPU=TRUE USE_OMP=FALSE USE_PSATD=TRUE USE_CCACHE=TRUE -j 2 ccache -s diff --git a/CMakeLists.txt b/CMakeLists.txt index 76a5ecdd3f3..3a947b01dcd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ # Preamble #################################################################### # cmake_minimum_required(VERSION 3.20.0) -project(WarpX VERSION 24.01) +project(WarpX VERSION 24.02) include(${WarpX_SOURCE_DIR}/cmake/WarpXFunctions.cmake) diff --git a/Docs/source/conf.py b/Docs/source/conf.py index 48a02c5d216..b34c437b829 100644 --- a/Docs/source/conf.py +++ b/Docs/source/conf.py @@ -103,9 +103,9 @@ def __init__(self, *args, **kwargs): # built documents. # # The short X.Y version. -version = u'24.01' +version = u'24.02' # The full version, including alpha/beta/rc tags. 
-release = u'24.01' +release = u'24.02' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/Python/setup.py b/Python/setup.py index f82e6563a31..3c6c0f605c0 100644 --- a/Python/setup.py +++ b/Python/setup.py @@ -54,7 +54,7 @@ package_data = {} setup(name = 'pywarpx', - version = '24.01', + version = '24.02', packages = ['pywarpx'], package_dir = {'pywarpx': 'pywarpx'}, description = """Wrapper of WarpX""", diff --git a/Regression/WarpX-GPU-tests.ini b/Regression/WarpX-GPU-tests.ini index 70c62b190fb..0659d530cf5 100644 --- a/Regression/WarpX-GPU-tests.ini +++ b/Regression/WarpX-GPU-tests.ini @@ -60,7 +60,7 @@ emailBody = Check https://ccse.lbl.gov/pub/GpuRegressionTesting/WarpX/ for more [AMReX] dir = /home/regtester/git/amrex/ -branch = 689144d157a0106faf3d0ae89f8d90b0250cf975 +branch = 24.02 [source] dir = /home/regtester/git/WarpX diff --git a/Regression/WarpX-tests.ini b/Regression/WarpX-tests.ini index ab11d70dfcc..ae22eba499d 100644 --- a/Regression/WarpX-tests.ini +++ b/Regression/WarpX-tests.ini @@ -59,7 +59,7 @@ emailBody = Check https://ccse.lbl.gov/pub/RegressionTesting/WarpX/ for more det [AMReX] dir = /home/regtester/AMReX_RegTesting/amrex/ -branch = 689144d157a0106faf3d0ae89f8d90b0250cf975 +branch = 24.02 [source] dir = /home/regtester/AMReX_RegTesting/warpx diff --git a/cmake/dependencies/AMReX.cmake b/cmake/dependencies/AMReX.cmake index 9b74c8db7fd..0f6a15a5ff4 100644 --- a/cmake/dependencies/AMReX.cmake +++ b/cmake/dependencies/AMReX.cmake @@ -250,7 +250,7 @@ macro(find_amrex) endif() set(COMPONENT_PRECISION ${WarpX_PRECISION} P${WarpX_PARTICLE_PRECISION}) - find_package(AMReX 24.01 CONFIG REQUIRED COMPONENTS ${COMPONENT_ASCENT} ${COMPONENT_DIMS} ${COMPONENT_EB} PARTICLES ${COMPONENT_PIC} ${COMPONENT_PRECISION} ${COMPONENT_SENSEI} LSOLVERS) + find_package(AMReX 24.02 CONFIG REQUIRED COMPONENTS ${COMPONENT_ASCENT} ${COMPONENT_DIMS} ${COMPONENT_EB} PARTICLES ${COMPONENT_PIC} ${COMPONENT_PRECISION} ${COMPONENT_SENSEI} LSOLVERS) # note: TINYP skipped because user-configured and optional # AMReX CMake helper scripts @@ -269,7 +269,7 @@ set(WarpX_amrex_src "" set(WarpX_amrex_repo "https://github.com/AMReX-Codes/amrex.git" CACHE STRING "Repository URI to pull and build AMReX from if(WarpX_amrex_internal)") -set(WarpX_amrex_branch "689144d157a0106faf3d0ae89f8d90b0250cf975" +set(WarpX_amrex_branch "24.02" CACHE STRING "Repository branch for WarpX_amrex_repo if(WarpX_amrex_internal)") diff --git a/cmake/dependencies/pyAMReX.cmake b/cmake/dependencies/pyAMReX.cmake index 3c043f60226..b4cf9f3f9c1 100644 --- a/cmake/dependencies/pyAMReX.cmake +++ b/cmake/dependencies/pyAMReX.cmake @@ -64,7 +64,7 @@ function(find_pyamrex) endif() elseif(NOT WarpX_pyamrex_internal) # TODO: MPI control - find_package(pyAMReX 24.01 CONFIG REQUIRED) + find_package(pyAMReX 24.02 CONFIG REQUIRED) message(STATUS "pyAMReX: Found version '${pyAMReX_VERSION}'") endif() endfunction() @@ -79,7 +79,7 @@ option(WarpX_pyamrex_internal "Download & build pyAMReX" ON) set(WarpX_pyamrex_repo "https://github.com/AMReX-Codes/pyamrex.git" CACHE STRING "Repository URI to pull and build pyamrex from if(WarpX_pyamrex_internal)") -set(WarpX_pyamrex_branch "cdf03496f6809527b97950e077508ca4b201fa9b" +set(WarpX_pyamrex_branch "24.02" CACHE STRING "Repository branch for WarpX_pyamrex_repo if(WarpX_pyamrex_internal)") diff --git a/run_test.sh b/run_test.sh index 48857d264cb..e1b45ab7c28 100755 --- a/run_test.sh +++ b/run_test.sh @@ -68,7 +68,7 @@ python3 -m pip 
install --upgrade -r warpx/Regression/requirements.txt # Clone AMReX and warpx-data git clone https://github.com/AMReX-Codes/amrex.git -cd amrex && git checkout --detach 689144d157a0106faf3d0ae89f8d90b0250cf975 && cd - +cd amrex && git checkout --detach 24.02 && cd - # warpx-data contains various required data sets git clone --depth 1 https://github.com/ECP-WarpX/warpx-data.git # openPMD-example-datasets contains various required data sets diff --git a/setup.py b/setup.py index d3efeaaacd5..197a39ce23f 100644 --- a/setup.py +++ b/setup.py @@ -278,7 +278,7 @@ def build_extension(self, ext): setup( name='pywarpx', # note PEP-440 syntax: x.y.zaN but x.y.z.devN - version = '24.01', + version = '24.02', packages = ['pywarpx'], package_dir = {'pywarpx': 'Python/pywarpx'}, author='Jean-Luc Vay, David P. Grote, Maxence Thévenet, Rémi Lehe, Andrew Myers, Weiqun Zhang, Axel Huebl, et al.', From 9d8ecf93df7c8713df08eac96c1f88f31b7fcd0d Mon Sep 17 00:00:00 2001 From: Roelof Groenewald <40245517+roelof-groenewald@users.noreply.github.com> Date: Thu, 1 Feb 2024 20:08:18 -0800 Subject: [PATCH 07/13] Add install instructions for ALCF's Polaris (#4636) * add polaris machine files * add doc page for Polaris --- Docs/source/install/hpc.rst | 1 + Docs/source/install/hpc/polaris.rst | 187 ++++++++++++++++++ .../polaris-alcf/install_gpu_dependencies.sh | 123 ++++++++++++ Tools/machines/polaris-alcf/polaris_gpu.pbs | 36 ++++ .../polaris_gpu_warpx.profile.example | 51 +++++ 5 files changed, 398 insertions(+) create mode 100644 Docs/source/install/hpc/polaris.rst create mode 100755 Tools/machines/polaris-alcf/install_gpu_dependencies.sh create mode 100644 Tools/machines/polaris-alcf/polaris_gpu.pbs create mode 100644 Tools/machines/polaris-alcf/polaris_gpu_warpx.profile.example diff --git a/Docs/source/install/hpc.rst b/Docs/source/install/hpc.rst index 9617f2a7fd6..a7b0f636b56 100644 --- a/Docs/source/install/hpc.rst +++ b/Docs/source/install/hpc.rst @@ -46,6 +46,7 @@ This section documents quick-start guides for a selection of supercomputers that hpc/lxplus hpc/ookami hpc/perlmutter + hpc/polaris hpc/quartz hpc/spock hpc/summit diff --git a/Docs/source/install/hpc/polaris.rst b/Docs/source/install/hpc/polaris.rst new file mode 100644 index 00000000000..d20ecccee32 --- /dev/null +++ b/Docs/source/install/hpc/polaris.rst @@ -0,0 +1,187 @@ +.. _building-polaris: + +Polaris (ALCF) +============== + +The `Polaris cluster `__ is located at ALCF. + + +Introduction +------------ + +If you are new to this system, **please see the following resources**: + +* `ALCF user guide `__ +* Batch system: `PBS `__ +* `Filesystems `__ + +.. _building-polaris-preparation: + +Preparation +----------- + +Use the following commands to download the WarpX source code: + +.. code-block:: bash + + git clone https://github.com/ECP-WarpX/WarpX.git $HOME/src/warpx + +On Polaris, you can run either on GPU nodes with fast A100 GPUs (recommended) or CPU nodes. + +.. tab-set:: + + .. tab-item:: A100 GPUs + + We use system software modules, add environment hints and further dependencies via the file ``$HOME/polaris_gpu_warpx.profile``. + Create it now: + + .. code-block:: bash + + cp $HOME/src/warpx/Tools/machines/polaris-alcf/polaris_gpu_warpx.profile.example $HOME/polaris_gpu_warpx.profile + + .. dropdown:: Script Details + :color: light + :icon: info + :animate: fade-in-slide-down + + .. 
literalinclude:: ../../../../Tools/machines/polaris-alcf/polaris_gpu_warpx.profile.example
+            :language: bash
+
+      Edit the 2nd line of this script, which sets the ``export proj=""`` variable.
+      For example, if you are a member of the project ``proj_name``, then run ``nano $HOME/polaris_gpu_warpx.profile`` and edit line 2 to read:
+
+      .. code-block:: bash
+
+         export proj="proj_name"
+
+      Exit the ``nano`` editor with ``Ctrl`` + ``O`` (save) and then ``Ctrl`` + ``X`` (exit).
+
+      .. important::
+
+         Now, and as the first step on future logins to Polaris, activate these environment settings:
+
+         .. code-block:: bash
+
+            source $HOME/polaris_gpu_warpx.profile
+
+      Finally, since Polaris does not yet provide software modules for some of our dependencies, install them once:
+
+      .. code-block:: bash
+
+         bash $HOME/src/warpx/Tools/machines/polaris-alcf/install_gpu_dependencies.sh
+         source /home/${USER}/sw/polaris/gpu/venvs/warpx/bin/activate
+
+      .. dropdown:: Script Details
+         :color: light
+         :icon: info
+         :animate: fade-in-slide-down
+
+         .. literalinclude:: ../../../../Tools/machines/polaris-alcf/install_gpu_dependencies.sh
+            :language: bash
+
+
+   .. tab-item:: CPU Nodes
+
+      *Under construction*
+
+
+.. _building-polaris-compilation:
+
+Compilation
+-----------
+
+Use the following :ref:`cmake commands ` to compile the application executable:
+
+.. tab-set::
+
+   .. tab-item:: A100 GPUs
+
+      .. code-block:: bash
+
+         cd $HOME/src/warpx
+         rm -rf build_pm_gpu
+
+         cmake -S . -B build_pm_gpu -DWarpX_COMPUTE=CUDA -DWarpX_PSATD=ON -DWarpX_QED_TABLE_GEN=ON -DWarpX_DIMS="1;2;RZ;3"
+         cmake --build build_pm_gpu -j 16
+
+      The WarpX application executables are now in ``$HOME/src/warpx/build_pm_gpu/bin/``.
+      Additionally, the following commands will install WarpX as a Python module:
+
+      .. code-block:: bash
+
+         cd $HOME/src/warpx
+         rm -rf build_pm_gpu_py
+
+         cmake -S . -B build_pm_gpu_py -DWarpX_COMPUTE=CUDA -DWarpX_PSATD=ON -DWarpX_QED_TABLE_GEN=ON -DWarpX_APP=OFF -DWarpX_PYTHON=ON -DWarpX_DIMS="1;2;RZ;3"
+         cmake --build build_pm_gpu_py -j 16 --target pip_install
+
+   .. tab-item:: CPU Nodes
+
+      *Under construction*
+
+Now, you can :ref:`submit Polaris compute jobs ` for WarpX :ref:`Python (PICMI) scripts ` (:ref:`example scripts `).
+Or, you can use the WarpX executables to submit Polaris jobs (:ref:`example inputs `).
+For executables, you can reference their location in your :ref:`job script ` or copy them to a location in your scratch space.
+
+
+.. _building-polaris-update:
+
+Update WarpX & Dependencies
+---------------------------
+
+If you already installed WarpX in the past and want to update it, start by getting the latest source code:
+
+.. code-block:: bash
+
+   cd $HOME/src/warpx
+
+   # read the output of this command - does it look ok?
+   git status
+
+   # get the latest WarpX source code
+   git fetch
+   git pull
+
+   # read the output of these commands - do they look ok?
+   git status
+   git log # press q to exit
+
+And, if needed,
+
+- :ref:`update the polaris_gpu_warpx.profile or polaris_cpu_warpx files `,
+- log out and into the system, activate the now updated environment profile as usual,
+- :ref:`execute the dependency install scripts `.
+
+As a last step, clean the build directory ``rm -rf $HOME/src/warpx/build_pm_*`` and rebuild WarpX.
+
+
+.. _running-cpp-polaris:
+
+Running
+-------
+
+.. tab-set::
+
+   .. tab-item:: A100 (40GB) GPUs
+
+      The batch script below can be used to run a WarpX simulation on multiple nodes (change ```` accordingly) on the supercomputer Polaris at ALCF.
+
+      Replace descriptions between chevrons ``<>`` by relevant values, for instance ``<input_file>`` could be ``plasma_mirror_inputs``.
+      Note that we run one MPI rank per GPU, i.e., four ranks per Polaris node.
+
+      .. literalinclude:: ../../../../Tools/machines/polaris-alcf/polaris_gpu.pbs
+         :language: bash
+         :caption: You can copy this file from ``$HOME/src/warpx/Tools/machines/polaris-alcf/polaris_gpu.pbs``.
+
+      To run a simulation, copy the lines above to a file ``polaris_gpu.pbs`` and run
+
+      .. code-block:: bash
+
+         qsub polaris_gpu.pbs
+
+      to submit the job.
+
+
+   .. tab-item:: CPU Nodes
+
+      *Under construction*
diff --git a/Tools/machines/polaris-alcf/install_gpu_dependencies.sh b/Tools/machines/polaris-alcf/install_gpu_dependencies.sh
new file mode 100755
index 00000000000..e2cdca86fbc
--- /dev/null
+++ b/Tools/machines/polaris-alcf/install_gpu_dependencies.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+#
+# Copyright 2024 The WarpX Community
+#
+# This file is part of WarpX.
+#
+# Author: Axel Huebl (edited by Roelof Groenewald for Polaris)
+# License: BSD-3-Clause-LBNL
+
+# Exit on first error encountered #############################################
+#
+set -eu -o pipefail
+
+# Check: ######################################################################
+#
+# Was polaris_gpu_warpx.profile sourced and configured correctly?
+if [ -z ${proj-} ]; then echo "WARNING: The 'proj' variable is not yet set in your polaris_gpu_warpx.profile file! Please edit its line 2 to continue!"; exit 1; fi
+
+# Remove old dependencies #####################################################
+#
+SW_DIR="/home/${USER}/sw/polaris/gpu"
+rm -rf ${SW_DIR}
+mkdir -p ${SW_DIR}
+
+# remove common user mistakes in python, located in .local instead of a venv
+python3 -m pip uninstall -qq -y pywarpx
+python3 -m pip uninstall -qq -y warpx
+python3 -m pip uninstall -qqq -y mpi4py 2>/dev/null || true
+
+# General extra dependencies ##################################################
+#
+
+# c-blosc (I/O compression)
+if [ -d $HOME/src/c-blosc ]
+then
+    cd $HOME/src/c-blosc
+    git fetch --prune
+    git checkout v1.21.1
+    cd -
+else
+    git clone -b v1.21.1 https://github.com/Blosc/c-blosc.git $HOME/src/c-blosc
+fi
+rm -rf $HOME/src/c-blosc-pm-gpu-build
+cmake -S $HOME/src/c-blosc -B $HOME/src/c-blosc-pm-gpu-build -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DDEACTIVATE_AVX2=OFF -DCMAKE_INSTALL_PREFIX=${SW_DIR}/c-blosc-1.21.1
+cmake --build $HOME/src/c-blosc-pm-gpu-build --target install --parallel 16
+rm -rf $HOME/src/c-blosc-pm-gpu-build
+
+# ADIOS2
+if [ -d $HOME/src/adios2 ]
+then
+    cd $HOME/src/adios2
+    git fetch --prune
+    git checkout v2.8.3
+    cd -
+else
+    git clone -b v2.8.3 https://github.com/ornladios/ADIOS2.git $HOME/src/adios2
+fi
+rm -rf $HOME/src/adios2-pm-gpu-build
+cmake -S $HOME/src/adios2 -B $HOME/src/adios2-pm-gpu-build -DADIOS2_USE_Blosc=ON -DADIOS2_USE_Fortran=OFF -DADIOS2_USE_Python=OFF -DADIOS2_USE_ZeroMQ=OFF -DCMAKE_INSTALL_PREFIX=${SW_DIR}/adios2-2.8.3
+cmake --build $HOME/src/adios2-pm-gpu-build --target install -j 16
+rm -rf $HOME/src/adios2-pm-gpu-build
+
+# BLAS++ (for PSATD+RZ)
+if [ -d $HOME/src/blaspp ]
+then
+    cd $HOME/src/blaspp
+    git fetch --prune
+    git checkout master
+    git pull
+    cd -
+else
+    git clone https://github.com/icl-utk-edu/blaspp.git $HOME/src/blaspp
+fi
+rm -rf $HOME/src/blaspp-pm-gpu-build
+CXX=$(which CC) cmake -S $HOME/src/blaspp -B $HOME/src/blaspp-pm-gpu-build -Duse_openmp=OFF -Dgpu_backend=cuda -DCMAKE_CXX_STANDARD=17 -DCMAKE_INSTALL_PREFIX=${SW_DIR}/blaspp-master
+cmake --build $HOME/src/blaspp-pm-gpu-build --target install --parallel 16
+rm -rf $HOME/src/blaspp-pm-gpu-build
+
+# LAPACK++ (for PSATD+RZ)
+if [ -d $HOME/src/lapackpp ]
+then
+    cd $HOME/src/lapackpp
+    git fetch --prune
+    git checkout master
+    git pull
+    cd -
+else
+    git clone https://github.com/icl-utk-edu/lapackpp.git $HOME/src/lapackpp
+fi
+rm -rf $HOME/src/lapackpp-pm-gpu-build
+CXX=$(which CC) CXXFLAGS="-DLAPACK_FORTRAN_ADD_" cmake -S $HOME/src/lapackpp -B $HOME/src/lapackpp-pm-gpu-build -DCMAKE_CXX_STANDARD=17 -Dbuild_tests=OFF -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON -DCMAKE_INSTALL_PREFIX=${SW_DIR}/lapackpp-master
+cmake --build $HOME/src/lapackpp-pm-gpu-build --target install --parallel 16
+rm -rf $HOME/src/lapackpp-pm-gpu-build
+
+# Python ######################################################################
+#
+python3 -m pip install --upgrade pip
+python3 -m pip install --upgrade virtualenv
+python3 -m pip cache purge
+rm -rf ${SW_DIR}/venvs/warpx
+python3 -m venv --system-site-packages ${SW_DIR}/venvs/warpx
+source ${SW_DIR}/venvs/warpx/bin/activate
+python3 -m pip install --upgrade pip
+python3 -m pip install --upgrade build
+python3 -m pip install --upgrade packaging
+python3 -m pip install --upgrade wheel
+python3 -m pip install --upgrade setuptools
+python3 -m pip install --upgrade cython
+python3 -m pip install --upgrade numpy
+python3 -m pip install --upgrade pandas
+python3 -m pip install --upgrade scipy
+# MPICC="cc -target-accel=nvidia80 -shared" python3 -m pip install --upgrade mpi4py --no-cache-dir --no-build-isolation --no-binary mpi4py
+python3 -m pip install --upgrade openpmd-api
+python3 -m pip install --upgrade matplotlib
+python3 -m pip install --upgrade yt
+# install or update WarpX dependencies such as picmistandard
+python3 -m pip install --upgrade -r $HOME/src/warpx/requirements.txt
+python3 -m pip install cupy-cuda11x # CUDA 11.7 compatible wheel
+# optional: for libEnsemble
+python3 -m pip install -r $HOME/src/warpx/Tools/LibEnsemble/requirements.txt
+# optional: for optimas (based on libEnsemble & ax->botorch->gpytorch->pytorch)
+python3 -m pip install --upgrade torch # CUDA 11.7 compatible wheel
+python3 -m pip install -r $HOME/src/warpx/Tools/optimas/requirements.txt
diff --git a/Tools/machines/polaris-alcf/polaris_gpu.pbs b/Tools/machines/polaris-alcf/polaris_gpu.pbs
new file mode 100644
index 00000000000..178db6ad6a2
--- /dev/null
+++ b/Tools/machines/polaris-alcf/polaris_gpu.pbs
@@ -0,0 +1,36 @@
+#!/bin/bash -l
+
+#PBS -A <proj>
+#PBS -l select=<NODES>:system=polaris
+#PBS -l place=scatter
+#PBS -l walltime=0:10:00
+#PBS -l filesystems=home:eagle
+#PBS -q debug
+#PBS -N test_warpx
+
+# Set required environment variables
+# support gpu-aware-mpi
+# export MPICH_GPU_SUPPORT_ENABLED=1
+
+# Change to working directory
+echo Working directory is $PBS_O_WORKDIR
+cd ${PBS_O_WORKDIR}
+
+echo Jobid: $PBS_JOBID
+echo Running on host `hostname`
+echo Running on nodes `cat $PBS_NODEFILE`
+
+# executable & inputs file or python interpreter & PICMI script here
+EXE=./warpx
+INPUTS=input1d
+
+# MPI and OpenMP settings
+NNODES=`wc -l < $PBS_NODEFILE`
+NRANKS_PER_NODE=4
+NDEPTH=1
+NTHREADS=1
+
+NTOTRANKS=$(( NNODES * NRANKS_PER_NODE ))
+echo "NUM_OF_NODES= ${NNODES} TOTAL_NUM_RANKS= ${NTOTRANKS} RANKS_PER_NODE= ${NRANKS_PER_NODE} THREADS_PER_RANK= ${NTHREADS}"
+
+mpiexec -np ${NTOTRANKS} ${EXE} ${INPUTS} > output.txt
diff --git a/Tools/machines/polaris-alcf/polaris_gpu_warpx.profile.example b/Tools/machines/polaris-alcf/polaris_gpu_warpx.profile.example
new file mode 100644
index 00000000000..d7a68bf16bb
--- /dev/null
+++ b/Tools/machines/polaris-alcf/polaris_gpu_warpx.profile.example
@@ -0,0 +1,51 @@
+# Set the project name
+export proj="" # change me!
+
+# swap to the Milan cray package
+# module swap craype-x86-rome craype-x86-milan
+
+# required dependencies
+module load cmake/3.23.2
+module load cudatoolkit-standalone
+
+# optional: for QED support with detailed tables
+# module load boost/1.81.0
+
+# optional: for openPMD and PSATD+RZ support
+module load cray-hdf5-parallel/1.12.2.3
+export CMAKE_PREFIX_PATH=/home/${USER}/sw/polaris/gpu/c-blosc-1.21.1:$CMAKE_PREFIX_PATH
+export CMAKE_PREFIX_PATH=/home/${USER}/sw/polaris/gpu/adios2-2.8.3:$CMAKE_PREFIX_PATH
+export CMAKE_PREFIX_PATH=/home/${USER}/sw/polaris/gpu/blaspp-master:$CMAKE_PREFIX_PATH
+export CMAKE_PREFIX_PATH=/home/${USER}/sw/polaris/gpu/lapackpp-master:$CMAKE_PREFIX_PATH
+
+export LD_LIBRARY_PATH=/home/${USER}/sw/polaris/gpu/c-blosc-1.21.1/lib64:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/home/${USER}/sw/polaris/gpu/adios2-2.8.3/lib64:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/home/${USER}/sw/polaris/gpu/blaspp-master/lib64:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/home/${USER}/sw/polaris/gpu/lapackpp-master/lib64:$LD_LIBRARY_PATH
+
+export PATH=/home/${USER}/sw/polaris/gpu/adios2-2.8.3/bin:${PATH}
+
+# optional: for Python bindings or libEnsemble
+module load cray-python/3.9.13.1
+
+if [ -d "/home/${USER}/sw/polaris/gpu/venvs/warpx" ]
+then
+    source /home/${USER}/sw/polaris/gpu/venvs/warpx/bin/activate
+fi
+
+# necessary to use CUDA-Aware MPI and run a job
+export CRAY_ACCEL_TARGET=nvidia80
+
+# optimize CUDA compilation for A100
+export AMREX_CUDA_ARCH=8.0
+
+# optimize CPU microarchitecture for AMD EPYC 3rd Gen (Milan/Zen3)
+# note: the cc/CC/ftn wrappers below add those
+# export CXXFLAGS="-march=znver3"
+# export CFLAGS="-march=znver3"
+
+# compiler environment hints
+export CC=nvc
+export CXX=nvc++
+export CUDACXX=nvcc
+export CUDAHOSTCXX=nvc++

From 60bf00039522925870b86f3db6c4ac7d12c3a04b Mon Sep 17 00:00:00 2001
From: Justin Ray Angus
Date: Thu, 1 Feb 2024 20:16:19 -0800
Subject: [PATCH 08/13] Implemented Villasenor and Buneman deposition routine for implicit solver (#4623)

* added infrastructure for doing the VillasenorAndBuneman current deposition scheme. Set algo.current_deposition = 3 to use. Right now, the actual algorithm is identical to doChargeConsevingCurrentDepositionImplicit().
* small modification to message when asserting that the evolve scheme is implicit when using villasenor-and-buneman current deposition.
* added new interpolation.angus_scheme as intermediate step for adding the CC1 deposit/gather routine from PICNIC. Seems to pass numerical energy conservation test in 2D right now, but the gather/deposit are actually just CIC.
* Created doGatherPicnicShapeN function. Does CIC right now, same as the VandB current deposition.
* vandb current deposition works in 2D.
* fixed bug in vandb deposition. Exact charge conservation is now obtained. Made same fix to corresponding gather routine.
* streamlined vandb deposition and corresponding gather routine.
* using dxp/dt in place of vp when doing current deposition for vandb scheme gives better charge conservation.
* added a new templated function to ShapeFactors.H that is used to compute the deposition weights transverse to the current direction for a segment in the villasenor and buneman current deposition.
* significant streamlining of new vandb current deposition.
* applied same streamlining to vandb deposition from previous commit to the corresponding gather routine.
* working on generalization of vandb deposit and corresponding gather routine to work for 1D, 3D, and for depos_order = 1. WIP
* removed interpolation.angus_scheme flag that was used to do the Picnic gather when doing Implicit. Now, the gather routine is chosen based on the CurrentDepositionAlgo enum value.
* galerkin_interpolation flag is no longer used with the Implicit solver. Removed some assertions related to this flag and the implicit solver.
* VillasenorAndBuneman ==> Villasenor
* added ability to use shape_factor 3 with villasenor deposit. However, initial tests do not give exact energy conservation. It is good, but not exact. Don't know why, so for now we still assert that shape_factor < 3 when using villasenor deposit.
* villasenor deposition now works with shape_factor = 3.
* villasenor deposition, and the corresponding gather, now work in 1D.
* intermediate checkin. Working on getting villasenor working in 3D. WIP.
* added a new function to ShapeFactors.H that is similar to the average function, but instead of returning the average weights using old and new positions it returns both old and new weights along with the shared left index. This is needed for the transverse interpolation for villasenor in 3D.
* added code to compute the cell-crossings in 3D. It runs in 3D, but charge is not conserved. WIP.
* fixed bug. villasenor deposition and corresponding gather work in 3D for shape factor = 2. Cleaned up logic for cell crossings setting in 3D.
* cleaning things up.
* villasenor deposition and corresponding gather work in 3D with shape_factor = 1, 2, and 3.
* code compiles in RZ.
* small tune up.
* fixed bug in how Xcell is set in villasenor deposition/gather. Previous implementation only worked when the left domain boundary was zero.
* refactoring to avoid roundoff issue in charge conservation for shape factor 1 with villasenor deposition. Roundoff issue solved in 1D. Still need to do 2D and 3D.
* fixed roundoff issue with shape = 1 when using villasenor for 2D.
* fixed roundoff issue with shape = 3 when using villasenor in 3D.
* added proper briefs to the villasenor deposition and the picnic-like gather.
* added a few comments.
* removed duplicate query for galerkin_scheme in WarpX.cpp. Why didn't merge catch this?
* minor cleanup.
* added 4th order shape factor routines to ShapeFactors.H as needed for using shape factor = 4 with villasenor deposition.
* generalized villasenor deposition routine and the picnic gather to work for shape factor orders higher than 3.
* added ability to parse in shape factor 4 when using villasenor deposition and added the ability to call shape factor 4 for the current density and charge density deposits.
* fixed a few small bugs that prevented compile in 3D.
* villasenor deposition now uses villasenor for out-of-plane J as well. Same for out-of-plane E in corresponding Picnic gather.
* slight refactoring and cleaning up some comments.
* cleaning up some comments.
* added regression tests.
* removed blank spaces.
* fixed bug
* cleaning up tabs.
* commented out std::cout lines.
* initializing potentially uninitialized variables.
* fixed indentation issue.
* changed some types to avoid narrowing conversion issue.
* removed cout comment lines.
* ions renamed to protons in new implicit example input deck, consistent with what is assumed in the benchmark data in the benchmarks_json/ folder. Updated the benchmark data using a simulation run on quartz LC.
* had to static_cast some quantities in new gather routine.
* more static casting.
* clang-tidy told me to use auto.
* fixed indentation.
* changed tolerance for new 2D CI test. max delta energy error on Azure is 2X higher than on LC.
* added particle y-position to the json file for new 2D implicit ci test.
* updated values in json file for new 2D implicit ci test to have the sum of the absolute value of the fields rather than just the sum.
* if ==> else if. This should help avoid a potential bug in the future.
* adjusted some lines that had an incorrect number of white spaces.
* significant simplification of code to determine segment values at cell crossings in 2D and 3D. By way of Dave Grote.
* slight refactoring for style and performance.
* more refactoring. No longer pre-defining slopes. Added logical check for zero dxp values before division.
* made list of params in brief for doVillasenorDepositionShapeNImplicit consistent with the input parameters.
* all variables used in computing the shape factor for the VB deposition, and corresponding gather, now have type double.
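In outline, the segment splitting described above works as in the following
simplified 1D sketch (illustrative only, not the WarpX implementation; the
real routine additionally handles the even/odd shape-order shift, the
transverse weights, and 2D/3D/RZ):

    # split a particle move, in grid coordinates, at cell-edge crossings;
    # each (start, end) pair is then deposited as its own segment
    import math

    def villasenor_segments_1d(z_old, z_new):
        k_old, k_new = math.floor(z_old), math.floor(z_new)
        crossings = abs(k_new - k_old)
        if crossings == 0:
            return [(z_old, z_new)]
        step = 1.0 if z_new > z_old else -1.0
        # first cell edge reached when moving from z_old toward z_new
        edge = k_old + (1.0 if step > 0.0 else 0.0)
        points = [z_old] + [edge + i*step for i in range(crossings)] + [z_new]
        return list(zip(points[:-1], points[1:]))

    # e.g., villasenor_segments_1d(0.4, 2.3)
    # -> [(0.4, 1.0), (1.0, 2.0), (2.0, 2.3)]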
---
 Examples/Tests/Implicit/analysis_vandb_2d.py  |  68 ++
 Examples/Tests/Implicit/inputs_vandb_2d       |  92 +++
 .../ImplicitPicard_VandB_2d.json              |  31 +
 Regression/WarpX-tests.ini                    |  17 +
 Source/Diagnostics/WarpXOpenPMD.cpp           |   2 +
 Source/Initialization/WarpXInitData.cpp       |   3 +
 .../Particles/Deposition/CurrentDeposition.H  | 664 ++++++++++++++++
 Source/Particles/Gather/FieldGather.H         | 710 +++++++++++++++++-
 .../Particles/PhysicalParticleContainer.cpp   |   6 +-
 Source/Particles/ShapeFactors.H               | 105 +++
 Source/Particles/WarpXParticleContainer.cpp   |  74 +-
 Source/Utils/WarpXAlgorithmSelection.H        |   3 +-
 Source/Utils/WarpXAlgorithmSelection.cpp      |   9 +-
 Source/WarpX.H                                |   2 +-
 Source/WarpX.cpp                              |  57 +-
 Source/ablastr/particles/DepositCharge.H      |   5 +
 16 files changed, 1810 insertions(+), 38 deletions(-)
 create mode 100755 Examples/Tests/Implicit/analysis_vandb_2d.py
 create mode 100644 Examples/Tests/Implicit/inputs_vandb_2d
 create mode 100644 Regression/Checksum/benchmarks_json/ImplicitPicard_VandB_2d.json

diff --git a/Examples/Tests/Implicit/analysis_vandb_2d.py b/Examples/Tests/Implicit/analysis_vandb_2d.py
new file mode 100755
index 00000000000..fa3299925a8
--- /dev/null
+++ b/Examples/Tests/Implicit/analysis_vandb_2d.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+
+# Copyright 2024 Justin Angus
+#
+#
+# This file is part of WarpX.
+#
+# License: BSD-3-Clause-LBNL
+#
+# This is a script that analyses the simulation results from the script `inputs_vandb_2d`.
+# This simulates a 2D periodic plasma using the implicit solver
+# with the Villasenor deposition using shape factor 2.
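+#
+# Two checks are made below, with the tolerances set in this script: the
+# relative change of the total (field + particle) energy over the run, and
+# the RMS error in Gauss's law, (rho - epsilon_0*divE)/(e*n0), on the grid.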
+import os
+import sys
+
+import numpy as np
+from scipy.constants import e, epsilon_0
+import yt
+
+sys.path.insert(1, '../../../../warpx/Regression/Checksum/')
+import checksumAPI
+
+# this will be the name of the plot file
+fn = sys.argv[1]
+
+field_energy = np.loadtxt('diags/reducedfiles/field_energy.txt', skiprows=1)
+particle_energy = np.loadtxt('diags/reducedfiles/particle_energy.txt', skiprows=1)
+
+total_energy = field_energy[:,2] + particle_energy[:,2]
+
+delta_E = (total_energy - total_energy[0])/total_energy[0]
+max_delta_E = np.abs(delta_E).max()
+
+# This case should have near machine precision conservation of energy
+tolerance_rel_energy = 2.e-14
+tolerance_rel_charge = 2.e-15
+
+print(f"max change in energy: {max_delta_E}")
+print(f"tolerance: {tolerance_rel_energy}")
+
+assert( max_delta_E < tolerance_rel_energy )
+
+# check for machine precision conservation of charge density
+n0 = 1.e30
+
+pltdir = sys.argv[1]
+ds = yt.load(pltdir)
+data = ds.covering_grid(level = 0, left_edge = ds.domain_left_edge, dims = ds.domain_dimensions)
+
+divE = data['boxlib', 'divE'].value
+rho = data['boxlib', 'rho'].value
+
+# compute local error in Gauss's law
+drho = (rho - epsilon_0*divE)/e/n0
+
+# compute RMS of the error on the grid
+nX = drho.shape[0]
+nZ = drho.shape[1]
+drho2_avg = (drho**2).sum()/(nX*nZ)
+drho_rms = np.sqrt(drho2_avg)
+
+print(f"rms error in charge conservation: {drho_rms}")
+print(f"tolerance: {tolerance_rel_charge}")
+
+assert( drho_rms < tolerance_rel_charge )
+
+test_name = os.path.split(os.getcwd())[1]
+checksumAPI.evaluate_checksum(test_name, fn)
diff --git a/Examples/Tests/Implicit/inputs_vandb_2d b/Examples/Tests/Implicit/inputs_vandb_2d
new file mode 100644
index 00000000000..2dc57323efe
--- /dev/null
+++ b/Examples/Tests/Implicit/inputs_vandb_2d
@@ -0,0 +1,92 @@
+#################################
+########## CONSTANTS ############
+#################################
+
+my_constants.n0 = 1.e30 # m^-3
+my_constants.Ti = 100. # eV
+my_constants.Te = 100. 
# eV +my_constants.wpe = q_e*sqrt(n0/(m_e*epsilon0)) +my_constants.de0 = clight/wpe +my_constants.nppcz = 10 # number of particles/cell in z +my_constants.dt = 0.1/wpe # s + +################################# +####### GENERAL PARAMETERS ###### +################################# +max_step = 20 +amr.n_cell = 40 40 +amr.max_grid_size = 8 +amr.blocking_factor = 8 +amr.max_level = 0 +geometry.dims = 2 +geometry.prob_lo = 0.0 0.0 # physical domain +geometry.prob_hi = 10.0*de0 10.0*de0 + +################################# +####### Boundary condition ###### +################################# +boundary.field_lo = periodic periodic +boundary.field_hi = periodic periodic + +################################# +############ NUMERICS ########### +################################# +warpx.serialize_initial_conditions = 1 +warpx.verbose = 1 +warpx.const_dt = dt +#warpx.cfl = 0.5656 +warpx.use_filter = 0 + +algo.maxwell_solver = Yee +algo.evolve_scheme = "implicit_picard" +algo.require_picard_convergence = 0 +algo.max_picard_iterations = 25 +algo.picard_iteration_tolerance = 0.0 #1.0e-12 +algo.particle_pusher = "boris" +#algo.particle_pusher = "higuera" + +algo.particle_shape = 2 +#algo.current_deposition = "direct" +#algo.current_deposition = "esirkepov" +algo.current_deposition = "villasenor" + +################################# +############ PLASMA ############# +################################# +particles.species_names = electrons protons + +electrons.charge = -q_e +electrons.mass = m_e +electrons.injection_style = "NUniformPerCell" +electrons.num_particles_per_cell_each_dim = nppcz nppcz +electrons.profile = constant +electrons.density = 1.e30 # number per m^3 +electrons.momentum_distribution_type = "gaussian" +electrons.ux_th = sqrt(Te*q_e/m_e)/clight +electrons.uy_th = sqrt(Te*q_e/m_e)/clight +electrons.uz_th = sqrt(Te*q_e/m_e)/clight + +protons.charge = q_e +protons.mass = m_p +protons.injection_style = "NUniformPerCell" +protons.num_particles_per_cell_each_dim = nppcz nppcz +protons.profile = constant +protons.density = 1.e30 # number per m^3 +protons.momentum_distribution_type = "gaussian" +protons.ux_th = sqrt(Ti*q_e/m_p)/clight +protons.uy_th = sqrt(Ti*q_e/m_p)/clight +protons.uz_th = sqrt(Ti*q_e/m_p)/clight + +# Diagnostics +diagnostics.diags_names = diag1 +diag1.intervals = 20 +diag1.diag_type = Full +diag1.fields_to_plot = Ex Ey Ez Bx By Bz jx jy jz rho divE +diag1.electrons.variables = w ux uy uz +diag1.protons.variables = w ux uy uz + +warpx.reduced_diags_names = particle_energy field_energy +particle_energy.type = ParticleEnergy +particle_energy.intervals = 1 +field_energy.type = FieldEnergy +field_energy.intervals = 1 diff --git a/Regression/Checksum/benchmarks_json/ImplicitPicard_VandB_2d.json b/Regression/Checksum/benchmarks_json/ImplicitPicard_VandB_2d.json new file mode 100644 index 00000000000..d97eb04883f --- /dev/null +++ b/Regression/Checksum/benchmarks_json/ImplicitPicard_VandB_2d.json @@ -0,0 +1,31 @@ +{ + "lev=0": { + "Bx": 72730.70321925254, + "By": 89276.6097395453, + "Bz": 66911.00019634314, + "Ex": 92036838733000.64, + "Ey": 15583500940725.84, + "Ez": 89163420502164.97, + "divE": 8.998871921763322e+22, + "jx": 2.7748639888523993e+19, + "jy": 2.9501400595579277e+19, + "jz": 2.6976140199337787e+19, + "rho": 796777020986.2787 + }, + "protons": { + "particle_momentum_x": 2.0873315539608036e-17, + "particle_momentum_y": 2.0858882907322405e-17, + "particle_momentum_z": 2.0877345477243595e-17, + "particle_position_x": 0.004251275869323399, + "particle_position_y": 
0.0042512738905209615, + "particle_weight": 2823958719279159.5 + }, + "electrons": { + "particle_momentum_x": 4.882673707817137e-19, + "particle_momentum_y": 4.879672470952739e-19, + "particle_momentum_z": 4.872329687213274e-19, + "particle_position_x": 0.004251641684258687, + "particle_position_y": 0.004251751978637919, + "particle_weight": 2823958719279159.5 + } +} diff --git a/Regression/WarpX-tests.ini b/Regression/WarpX-tests.ini index ae22eba499d..3310e642dd3 100644 --- a/Regression/WarpX-tests.ini +++ b/Regression/WarpX-tests.ini @@ -4515,6 +4515,23 @@ doVis = 0 compareParticles = 1 analysisRoutine = Examples/Tests/Implicit/analysis_1d.py +[ImplicitPicard_VandB_2d] +buildDir = . +inputFile = Examples/Tests/Implicit/inputs_vandb_2d +runtime_params = warpx.abort_on_warning_threshold=high +dim = 2 +addToCompileString = +cmakeSetupOpts = -DWarpX_DIMS=2 +restartTest = 0 +useMPI = 1 +numprocs = 2 +useOMP = 0 +numthreads = 1 +compileTest = 0 +doVis = 0 +compareParticles = 1 +analysisRoutine = Examples/Tests/Implicit/analysis_vandb_2d.py + [SemiImplicitPicard_1d] buildDir = . inputFile = Examples/Tests/Implicit/inputs_1d_semiimplicit diff --git a/Source/Diagnostics/WarpXOpenPMD.cpp b/Source/Diagnostics/WarpXOpenPMD.cpp index 64411ecf6e4..7cc9f571a4a 100644 --- a/Source/Diagnostics/WarpXOpenPMD.cpp +++ b/Source/Diagnostics/WarpXOpenPMD.cpp @@ -1130,6 +1130,8 @@ WarpXOpenPMDPlot::SetConstParticleRecordsEDPIC ( return "Esirkepov"; case CurrentDepositionAlgo::Vay : return "Vay"; + case CurrentDepositionAlgo::Villasenor : + return "Villasenor"; default: return "directMorseNielson"; } diff --git a/Source/Initialization/WarpXInitData.cpp b/Source/Initialization/WarpXInitData.cpp index 169453a6e99..eee3012ab4d 100644 --- a/Source/Initialization/WarpXInitData.cpp +++ b/Source/Initialization/WarpXInitData.cpp @@ -229,6 +229,9 @@ WarpX::PrintMainPICparameters () else if (current_deposition_algo == CurrentDepositionAlgo::Esirkepov){ amrex::Print() << "Current Deposition: | Esirkepov \n"; } + else if (current_deposition_algo == CurrentDepositionAlgo::Villasenor){ + amrex::Print() << "Current Deposition: | Villasenor \n"; + } // Print type of particle pusher if (particle_pusher_algo == ParticlePusherAlgo::Vay){ amrex::Print() << "Particle Pusher: | Vay \n"; diff --git a/Source/Particles/Deposition/CurrentDeposition.H b/Source/Particles/Deposition/CurrentDeposition.H index 5d1055278b2..18df09c3b43 100644 --- a/Source/Particles/Deposition/CurrentDeposition.H +++ b/Source/Particles/Deposition/CurrentDeposition.H @@ -1535,6 +1535,670 @@ void doChargeConservingDepositionShapeNImplicit (const amrex::ParticleReal * con #endif } +/** + * \brief Villasenor and Buneman Current Deposition for thread thread_num for implicit scheme. + * The specifics for the implicit scheme are in how gamma is determined. This is a charge- + * conserving deposition. The difference from Esirkepov is that the deposit is done segment + * by segment, where the segments are determined by cell crossings. In general, this results + * in a tighter stencil. The implementation is valid for an arbitrary number of cell crossings. + * + * \param depos_order deposition order + * \param xp_n,yp_n,zp_n Pointer to arrays of particle position at time level n. + * \param GetPosition A functor for returning the particle position. + * \param wp Pointer to array of particle weights. + * \param uxp_n,uyp_n,uzp_n Pointer to arrays of particle momentum at time level n. 
+ * \param uxp_nph,uyp_nph,uzp_nph Pointer to arrays of particle momentum at time level n + 1/2. + * \param ion_lev Pointer to array of particle ionization level. This is + required to have the charge of each macroparticle + since q is a scalar. For non-ionizable species, + ion_lev is a null pointer. + * \param Jx_arr,Jy_arr,Jz_arr Array4 of current density, either full array or tile. + * \param np_to_deposit Number of particles for which current is deposited. + * \param dt Time step for particle level + * \param dx 3D cell size + * \param xyzmin Physical lower bounds of domain. + * \param lo Index lower bounds of domain. + * \param q species charge. + * \param n_rz_azimuthal_modes Number of azimuthal modes when using RZ geometry. + * \param cost Pointer to (load balancing) cost corresponding to box where present particles deposit current. + * \param load_balance_costs_update_algo Selected method for updating load balance costs. + */ +template +void doVillasenorDepositionShapeNImplicit (const amrex::ParticleReal * const xp_n, + const amrex::ParticleReal * const yp_n, + const amrex::ParticleReal * const zp_n, + const GetParticlePosition& GetPosition, + const amrex::ParticleReal * const wp, + [[maybe_unused]]const amrex::ParticleReal * const uxp_n, + [[maybe_unused]]const amrex::ParticleReal * const uyp_n, + [[maybe_unused]]const amrex::ParticleReal * const uzp_n, + [[maybe_unused]]const amrex::ParticleReal * const uxp_nph, + [[maybe_unused]]const amrex::ParticleReal * const uyp_nph, + [[maybe_unused]]const amrex::ParticleReal * const uzp_nph, + const int * const ion_lev, + const amrex::Array4& Jx_arr, + const amrex::Array4& Jy_arr, + const amrex::Array4& Jz_arr, + const long np_to_deposit, + const amrex::Real dt, + const std::array& dx, + const std::array xyzmin, + const amrex::Dim3 lo, + const amrex::Real q, + const int n_rz_azimuthal_modes, + amrex::Real * const cost, + const long load_balance_costs_update_algo) +{ + using namespace amrex; +#if !defined(WARPX_DIM_RZ) + ignore_unused(n_rz_azimuthal_modes); +#endif + +#if !defined(AMREX_USE_GPU) + amrex::ignore_unused(cost, load_balance_costs_update_algo); +#endif + + // Whether ion_lev is a null pointer (do_ionization=0) or a real pointer + // (do_ionization=1) + bool const do_ionization = ion_lev; +#if !defined(WARPX_DIM_1D_Z) + Real const dxi = 1.0_rt / dx[0]; +#endif +#if !defined(WARPX_DIM_1D_Z) + Real const xmin = xyzmin[0]; +#endif +#if defined(WARPX_DIM_3D) + Real const dyi = 1.0_rt / dx[1]; + Real const ymin = xyzmin[1]; +#endif + Real const dzi = 1.0_rt / dx[2]; + Real const zmin = xyzmin[2]; + +#if defined(WARPX_DIM_3D) + Real const invvol = 1.0_rt / (dx[0]*dx[1]*dx[2]); +#elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) + Real const invvol = 1.0_rt / (dx[0]*dx[2]); +#elif defined(WARPX_DIM_1D_Z) + Real const invvol = 1.0_rt / (dx[2]); +#endif + +#if !defined(WARPX_DIM_1D_Z) + Real constexpr one_third = 1.0_rt / 3.0_rt; + Real constexpr one_sixth = 1.0_rt / 6.0_rt; +#endif + + // Loop over particles and deposit into Jx_arr, Jy_arr and Jz_arr +#if defined(WARPX_USE_GPUCLOCK) + amrex::Real* cost_real = nullptr; + if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) { + cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real)); + *cost_real = 0._rt; + } +#endif + amrex::ParallelFor( + np_to_deposit, + [=] AMREX_GPU_DEVICE (long const ip) { +#if defined(WARPX_USE_GPUCLOCK) + const auto KernelTimer = ablastr::parallelization::KernelTimer( + cost && (load_balance_costs_update_algo == 
LoadBalanceCostsUpdateAlgo::GpuClock),
+                cost_real);
+#endif
+
+#if !defined(WARPX_DIM_3D)
+            constexpr amrex::ParticleReal inv_c2 = 1._prt/(PhysConst::c*PhysConst::c);
+
+            // Compute inverse Lorentz factor, the average of gamma at time levels n and n+1
+            // The uxp,uyp,uzp are the velocities at time level n+1/2
+            const amrex::ParticleReal uxp_np1 = 2._prt*uxp_nph[ip] - uxp_n[ip];
+            const amrex::ParticleReal uyp_np1 = 2._prt*uyp_nph[ip] - uyp_n[ip];
+            const amrex::ParticleReal uzp_np1 = 2._prt*uzp_nph[ip] - uzp_n[ip];
+            const amrex::ParticleReal gamma_n = std::sqrt(1._prt + (uxp_n[ip]*uxp_n[ip] + uyp_n[ip]*uyp_n[ip] + uzp_n[ip]*uzp_n[ip])*inv_c2);
+            const amrex::ParticleReal gamma_np1 = std::sqrt(1._prt + (uxp_np1*uxp_np1 + uyp_np1*uyp_np1 + uzp_np1*uzp_np1)*inv_c2);
+            const amrex::ParticleReal gaminv = 2.0_prt/(gamma_n + gamma_np1);
+#endif
+
+            // wqx, wqy, wqz are particle current in each direction
+            Real wq = q*wp[ip];
+            if (do_ionization){
+                wq *= ion_lev[ip];
+            }
+
+            ParticleReal xp_nph, yp_nph, zp_nph;
+            GetPosition(ip, xp_nph, yp_nph, zp_nph);
+
+#if !defined(WARPX_DIM_1D_Z)
+            ParticleReal const xp_np1 = 2._prt*xp_nph - xp_n[ip];
+#else
+            ignore_unused(xp_n);
+#endif
+#if defined(WARPX_DIM_3D) || defined(WARPX_DIM_RZ)
+            ParticleReal const yp_np1 = 2._prt*yp_nph - yp_n[ip];
+#else
+            ignore_unused(yp_n);
+#endif
+            ParticleReal const zp_np1 = 2._prt*zp_nph - zp_n[ip];
+
+            // computes current and old position in grid units
+#if defined(WARPX_DIM_RZ)
+            amrex::Real const xp_new = xp_np1;
+            amrex::Real const yp_new = yp_np1;
+            amrex::Real const xp_mid = xp_nph;
+            amrex::Real const yp_mid = yp_nph;
+            amrex::Real const xp_old = xp_n[ip];
+            amrex::Real const yp_old = yp_n[ip];
+            amrex::Real const rp_new = std::sqrt(xp_new*xp_new + yp_new*yp_new);
+            amrex::Real const rp_old = std::sqrt(xp_old*xp_old + yp_old*yp_old);
+            amrex::Real const rp_mid = (rp_new + rp_old)/2._rt;
+            amrex::Real costheta_mid, sintheta_mid;
+            if (rp_mid > 0._rt) {
+                costheta_mid = xp_mid/rp_mid;
+                sintheta_mid = yp_mid/rp_mid;
+            } else {
+                costheta_mid = 1._rt;
+                sintheta_mid = 0._rt;
+            }
+            const Complex xy_mid0 = Complex{costheta_mid, sintheta_mid};
+
+            // Keep these double to avoid bug in single precision
+            double const x_new = (rp_new - xmin)*dxi;
+            double const x_old = (rp_old - xmin)*dxi;
+            amrex::Real const vx = (rp_new - rp_old)/dt;
+            amrex::Real const vy = (-uxp_nph[ip]*sintheta_mid + uyp_nph[ip]*costheta_mid)*gaminv;
+#elif defined(WARPX_DIM_XZ)
+            // Keep these double to avoid bug in single precision
+            double const x_new = (xp_np1 - xmin)*dxi;
+            double const x_old = (xp_n[ip] - xmin)*dxi;
+            amrex::Real const vx = (xp_np1 - xp_n[ip])/dt;
+            amrex::Real const vy = uyp_nph[ip]*gaminv;
+#elif defined(WARPX_DIM_1D_Z)
+            amrex::Real const vx = uxp_nph[ip]*gaminv;
+            amrex::Real const vy = uyp_nph[ip]*gaminv;
+#elif defined(WARPX_DIM_3D)
+            // Keep these double to avoid bug in single precision
+            double const x_new = (xp_np1 - xmin)*dxi;
+            double const x_old = (xp_n[ip] - xmin)*dxi;
+            double const y_new = (yp_np1 - ymin)*dyi;
+            double const y_old = (yp_n[ip] - ymin)*dyi;
+            amrex::Real const vx = (xp_np1 - xp_n[ip])/dt;
+            amrex::Real const vy = (yp_np1 - yp_n[ip])/dt;
+#endif
+
+            // Keep these double to avoid bug in single precision
+            double const z_new = (zp_np1 - zmin)*dzi;
+            double const z_old = (zp_n[ip] - zmin)*dzi;
+            amrex::Real const vz = (zp_np1 - zp_n[ip])/dt;
+
+            // Define velocity kernels to deposit
+            amrex::Real const wqx = wq*vx*invvol;
+            amrex::Real const wqy = wq*vy*invvol;
+            amrex::Real const wqz = wq*vz*invvol; 
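+
+            // Note: each segment below deposits the fraction of wqx, wqy, wqz given
+            // by its share of the total displacement (the seg_factor_* weights),
+            // so the sum over all segments recovers the full particle current.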
+ + // 1) Determine the number of segments. + // 2) Loop over segments and deposit current. + + // cell crossings are defined at cell edges if depos_order is odd + // cell crossings are defined at cell centers if depos_order is even + + int num_segments = 1; + double shift = 0.0; + if ( (depos_order % 2) == 0 ) { shift = 0.5; } + +#if defined(WARPX_DIM_3D) + + // compute cell crossings in X-direction + const auto i_old = static_cast(x_old-shift); + const auto i_new = static_cast(x_new-shift); + int cell_crossings_x = std::abs(i_new-i_old); + num_segments += cell_crossings_x; + + // compute cell crossings in Y-direction + const auto j_old = static_cast(y_old-shift); + const auto j_new = static_cast(y_new-shift); + int cell_crossings_y = std::abs(j_new-j_old); + num_segments += cell_crossings_y; + + // compute cell crossings in Z-direction + const auto k_old = static_cast(z_old-shift); + const auto k_new = static_cast(z_new-shift); + int cell_crossings_z = std::abs(k_new-k_old); + num_segments += cell_crossings_z; + + // need to assert that the number of cell crossings in each direction + // is within the range permitted by the number of guard cells + // e.g., if (num_segments > 7) ... + + // compute total change in particle position and the initial cell + // locations in each direction used to find the position at cell crossings. + const double dxp = x_new - x_old; + const double dyp = y_new - y_old; + const double dzp = z_new - z_old; + const auto dirX_sign = static_cast(dxp < 0. ? -1. : 1.); + const auto dirY_sign = static_cast(dyp < 0. ? -1. : 1.); + const auto dirZ_sign = static_cast(dzp < 0. ? -1. : 1.); + double Xcell = 0., Ycell = 0., Zcell = 0.; + if (num_segments > 1) { + Xcell = static_cast(i_old) + shift + 0.5*(1.-dirX_sign); + Ycell = static_cast(j_old) + shift + 0.5*(1.-dirY_sign); + Zcell = static_cast(k_old) + shift + 0.5*(1.-dirZ_sign); + } + + // loop over the number of segments and deposit + Compute_shape_factor< depos_order-1 > compute_shape_factor_cell; + Compute_shape_factor_pair< depos_order > compute_shape_factors_node; + double dxp_seg, dyp_seg, dzp_seg; + double x0_new, y0_new, z0_new; + double x0_old = x_old; + double y0_old = y_old; + double z0_old = z_old; + + for (int ns=0; ns(dxp == 0. ? 1. : dxp_seg/dxp); + const auto seg_factor_y = static_cast(dyp == 0. ? 1. : dyp_seg/dyp); + const auto seg_factor_z = static_cast(dzp == 0. ? 1. 
: dzp_seg/dzp); + + // compute cell-based weights using the average segment position + double sx_cell[depos_order] = {0.}; + double sy_cell[depos_order] = {0.}; + double sz_cell[depos_order] = {0.}; + double const x0_bar = (x0_new + x0_old)/2.0; + double const y0_bar = (y0_new + y0_old)/2.0; + double const z0_bar = (z0_new + z0_old)/2.0; + const int i0_cell = compute_shape_factor_cell( sx_cell, x0_bar-0.5 ); + const int j0_cell = compute_shape_factor_cell( sy_cell, y0_bar-0.5 ); + const int k0_cell = compute_shape_factor_cell( sz_cell, z0_bar-0.5 ); + + if constexpr (depos_order >= 3) { // higher-order correction to the cell-based weights + Compute_shape_factor_pair compute_shape_factors_cell; + double sx_old_cell[depos_order] = {0.}; + double sx_new_cell[depos_order] = {0.}; + double sy_old_cell[depos_order] = {0.}; + double sy_new_cell[depos_order] = {0.}; + double sz_old_cell[depos_order] = {0.}; + double sz_new_cell[depos_order] = {0.}; + const int i0_cell_2 = compute_shape_factors_cell( sx_old_cell, sx_new_cell, x0_old-0.5, x0_new-0.5 ); + const int j0_cell_2 = compute_shape_factors_cell( sy_old_cell, sy_new_cell, y0_old-0.5, y0_new-0.5 ); + const int k0_cell_2 = compute_shape_factors_cell( sz_old_cell, sz_new_cell, z0_old-0.5, z0_new-0.5 ); + ignore_unused(i0_cell_2, j0_cell_2, k0_cell_2); + for (int m=0; m(x_old-shift); + const auto i_new = static_cast(x_new-shift); + int cell_crossings_x = std::abs(i_new-i_old); + num_segments += cell_crossings_x; + + // compute cell crossings in Z-direction + const auto k_old = static_cast(z_old-shift); + const auto k_new = static_cast(z_new-shift); + int cell_crossings_z = std::abs(k_new-k_old); + num_segments += cell_crossings_z; + + // need to assert that the number of cell crossings in each direction + // is within the range permitted by the number of guard cells + // e.g., if (num_segments > 5) ... + + // compute total change in particle position and the initial cell + // locations in each direction used to find the position at cell crossings. + const double dxp = x_new - x_old; + const double dzp = z_new - z_old; + const auto dirX_sign = static_cast(dxp < 0. ? -1. : 1.); + const auto dirZ_sign = static_cast(dzp < 0. ? -1. : 1.); + double Xcell = 0., Zcell = 0.; + if (num_segments > 1) { + Xcell = static_cast(i_old) + shift + 0.5*(1.-dirX_sign); + Zcell = static_cast(k_old) + shift + 0.5*(1.-dirZ_sign); + } + + // loop over the number of segments and deposit + Compute_shape_factor< depos_order-1 > compute_shape_factor_cell; + Compute_shape_factor_pair< depos_order > compute_shape_factors_node; + double dxp_seg, dzp_seg; + double x0_new, z0_new; + double x0_old = x_old; + double z0_old = z_old; + + for (int ns=0; ns(dxp == 0. ? 1. : dxp_seg/dxp); + const auto seg_factor_z = static_cast(dzp == 0. ? 1. 
: dzp_seg/dzp); + + // compute cell-based weights using the average segment position + double sx_cell[depos_order] = {0.}; + double sz_cell[depos_order] = {0.}; + double const x0_bar = (x0_new + x0_old)/2.0; + double const z0_bar = (z0_new + z0_old)/2.0; + const int i0_cell = compute_shape_factor_cell( sx_cell, x0_bar-0.5 ); + const int k0_cell = compute_shape_factor_cell( sz_cell, z0_bar-0.5 ); + + if constexpr (depos_order >= 3) { // higher-order correction to the cell-based weights + Compute_shape_factor_pair compute_shape_factors_cell; + double sx_old_cell[depos_order] = {0.}; + double sx_new_cell[depos_order] = {0.}; + double sz_old_cell[depos_order] = {0.}; + double sz_new_cell[depos_order] = {0.}; + const int i0_cell_2 = compute_shape_factors_cell( sx_old_cell, sx_new_cell, x0_old-0.5, x0_new-0.5 ); + const int k0_cell_2 = compute_shape_factors_cell( sz_old_cell, sz_new_cell, z0_old-0.5, z0_new-0.5 ); + ignore_unused(i0_cell_2, k0_cell_2); + for (int m=0; m(z_old-shift); + const auto k_new = static_cast(z_new-shift); + int cell_crossings_z = std::abs(k_new-k_old); + num_segments += cell_crossings_z; + + // need to assert that the number of cell crossings in each direction + // is within the range permitted by the number of guard cells + // e.g., if (num_segments > 3) ... + + // compute dzp and the initial cell location used to find the cell crossings. + double const dzp = z_new - z_old; + const auto dirZ_sign = static_cast(dzp < 0. ? -1. : 1.); + double Zcell = static_cast(k_old) + shift + 0.5*(1.-dirZ_sign); + + // loop over the number of segments and deposit + Compute_shape_factor< depos_order-1 > compute_shape_factor_cell; + Compute_shape_factor_pair< depos_order > compute_shape_factors_node; + double dzp_seg; + double z0_new; + double z0_old = z_old; + + for (int ns=0; ns(dzp == 0. ? 1. : dzp_seg/dzp); + + // compute cell-based weights using the average segment position + double sz_cell[depos_order] = {0.}; + double const z0_bar = (z0_new + z0_old)/2.0; + const int k0_cell = compute_shape_factor_cell( sz_cell, z0_bar-0.5 ); + + if constexpr (depos_order >= 3) { // higher-order correction to the cell-based weights + Compute_shape_factor_pair compute_shape_factors_cell; + double sz_old_cell[depos_order] = {0.}; + double sz_new_cell[depos_order] = {0.}; + const int k0_cell_2 = compute_shape_factors_cell( sz_old_cell, sz_new_cell, z0_old-0.5, z0_new-0.5 ); + ignore_unused(k0_cell_2); + for (int m=0; mfree(cost_real); + } +#endif +} + /** * \brief Vay current deposition * ( Vay et al, 2013) diff --git a/Source/Particles/Gather/FieldGather.H b/Source/Particles/Gather/FieldGather.H index dd6b7276681..670d95014a0 100644 --- a/Source/Particles/Gather/FieldGather.H +++ b/Source/Particles/Gather/FieldGather.H @@ -880,6 +880,681 @@ void doGatherShapeNEsirkepovStencilImplicit ( #endif } +/** + * \brief Energy conserving field gather for thread thread_num for the implicit scheme + * This uses the same stencil for the gather that is used for Villasenor current deposition. + * The magnetic field is deposited using direct deposition. + * + * \tparam depos_order Particle shape order + * \param xp_n,yp_n,zp_n Particle position coordinates at start of step + * \param xp_nph,yp_nph,zp_nph Particle position coordinates at half step + * \param Exp,Eyp,Ezp Electric field on particles. + * \param Bxp,Byp,Bzp Magnetic field on particles. + * \param Ex_arr,Ey_arr,Ez_arr Array4 of the electric field, either full array or tile. 
+ * \param Bx_arr,By_arr,Bz_arr Array4 of the magnetic field, either full array or tile. + * \param Ex_type,Ey_type,Ez_type IndexType of the electric field + * \param Bx_type,By_type,Bz_type IndexType of the magnetic field + * \param dx 3D cell spacing + * \param xyzmin Physical lower bounds of domain in x, y, z. + * \param lo Index lower bounds of domain. + * \param n_rz_azimuthal_modes Number of azimuthal modes when using RZ geometry + */ +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void doGatherPicnicShapeN ( + [[maybe_unused]] const amrex::ParticleReal xp_n, + [[maybe_unused]] const amrex::ParticleReal yp_n, + const amrex::ParticleReal zp_n, + [[maybe_unused]] const amrex::ParticleReal xp_nph, + [[maybe_unused]] const amrex::ParticleReal yp_nph, + const amrex::ParticleReal zp_nph, + amrex::ParticleReal& Exp, + amrex::ParticleReal& Eyp, + amrex::ParticleReal& Ezp, + amrex::ParticleReal& Bxp, + amrex::ParticleReal& Byp, + amrex::ParticleReal& Bzp, + amrex::Array4 const& Ex_arr, + amrex::Array4 const& Ey_arr, + amrex::Array4 const& Ez_arr, + amrex::Array4 const& Bx_arr, + amrex::Array4 const& By_arr, + amrex::Array4 const& Bz_arr, + [[maybe_unused]] const amrex::IndexType Ex_type, + [[maybe_unused]] const amrex::IndexType Ey_type, + [[maybe_unused]] const amrex::IndexType Ez_type, + [[maybe_unused]] const amrex::IndexType Bx_type, + [[maybe_unused]] const amrex::IndexType By_type, + [[maybe_unused]] const amrex::IndexType Bz_type, + const amrex::GpuArray& dx, + const amrex::GpuArray& xyzmin, + const amrex::Dim3& lo, + const int n_rz_azimuthal_modes) +{ + using namespace amrex; +#if !defined(WARPX_DIM_RZ) + ignore_unused(n_rz_azimuthal_modes); +#endif + +#if !defined(WARPX_DIM_1D_Z) + Real const dxi = 1.0_rt / dx[0]; +#endif +#if !defined(WARPX_DIM_1D_Z) + Real const xmin = xyzmin[0]; +#endif +#if defined(WARPX_DIM_3D) + Real const dyi = 1.0_rt / dx[1]; + Real const ymin = xyzmin[1]; +#endif + Real const dzi = 1.0_rt / dx[2]; + Real const zmin = xyzmin[2]; + +#if !defined(WARPX_DIM_1D_Z) + ParticleReal xp_np1 = 2._prt*xp_nph - xp_n; +#endif +#if defined(WARPX_DIM_3D) || defined(WARPX_DIM_RZ) + ParticleReal yp_np1 = 2._prt*yp_nph - yp_n; +#endif + ParticleReal zp_np1 = 2._prt*zp_nph - zp_n; + +#if !defined(WARPX_DIM_1D_Z) + Real constexpr one_third = 1.0_rt / 3.0_rt; + Real constexpr one_sixth = 1.0_rt / 6.0_rt; +#endif + + // computes current and old position in grid units +#if defined(WARPX_DIM_RZ) + amrex::Real const xp_new = xp_np1; + amrex::Real const yp_new = yp_np1; + amrex::Real const xp_mid = xp_nph; + amrex::Real const yp_mid = yp_nph; + amrex::Real const xp_old = xp_n; + amrex::Real const yp_old = yp_n; + amrex::Real const rp_new = std::sqrt(xp_new*xp_new + yp_new*yp_new); + amrex::Real const rp_old = std::sqrt(xp_old*xp_old + yp_old*yp_old); + amrex::Real const rp_mid = (rp_new + rp_old)/2._rt; + amrex::Real costheta_mid, sintheta_mid; + if (rp_mid > 0._rt) { + costheta_mid = xp_mid/rp_mid; + sintheta_mid = yp_mid/rp_mid; + } else { + costheta_mid = 1._rt; + sintheta_mid = 0._rt; + } + const Complex xy_mid0 = Complex{costheta_mid, sintheta_mid}; + // Keep these double to avoid bug in single precision + double const x_new = (rp_new - xmin)*dxi; + double const x_old = (rp_old - xmin)*dxi; + double const x_bar = (rp_mid - xmin)*dxi; +#elif !defined(WARPX_DIM_1D_Z) + // Keep these double to avoid bug in single precision + double const x_new = (xp_np1 - xmin)*dxi; + double const x_old = (xp_n - xmin)*dxi; + double const x_bar = (xp_nph - xmin)*dxi; +#endif +#if 
defined(WARPX_DIM_3D) + // Keep these double to avoid bug in single precision + double const y_new = (yp_np1 - ymin)*dyi; + double const y_old = (yp_n - ymin)*dyi; + double const y_bar = (yp_nph - ymin)*dyi; +#endif + // Keep these double to avoid bug in single precision + double const z_new = (zp_np1 - zmin)*dzi; + double const z_old = (zp_n - zmin)*dzi; + double const z_bar = (zp_nph - zmin)*dzi; + + // 1) Determine the number of segments. + // 2) Loop over segments and gather electric field. + // 3) Gather magnetic field. + + // cell crossings are defined at cell edges if depos_order is odd + // cell crossings are defined at cell centers if depos_order is even + + int num_segments = 1; + double shift = 0.0; + if ( (depos_order % 2) == 0 ) { shift = 0.5; } + +#if defined(WARPX_DIM_3D) + + // compute cell crossings in X-direction + const auto i_old = static_cast(x_old-shift); + const auto i_new = static_cast(x_new-shift); + int cell_crossings_x = std::abs(i_new-i_old); + num_segments += cell_crossings_x; + + // compute cell crossings in Y-direction + const auto j_old = static_cast(y_old-shift); + const auto j_new = static_cast(y_new-shift); + int cell_crossings_y = std::abs(j_new-j_old); + num_segments += cell_crossings_y; + + // compute cell crossings in Z-direction + const auto k_old = static_cast(z_old-shift); + const auto k_new = static_cast(z_new-shift); + int cell_crossings_z = std::abs(k_new-k_old); + num_segments += cell_crossings_z; + + // need to assert that the number of cell crossings in each direction + // is within the range permitted by the number of guard cells + // e.g., if (num_segments > 7) ... + + // compute total change in particle position and the initial cell + // locations in each direction used to find the position at cell crossings. + const double dxp = x_new - x_old; + const double dyp = y_new - y_old; + const double dzp = z_new - z_old; + const auto dirX_sign = static_cast(dxp < 0. ? -1. : 1.); + const auto dirY_sign = static_cast(dyp < 0. ? -1. : 1.); + const auto dirZ_sign = static_cast(dzp < 0. ? -1. : 1.); + double Xcell = 0., Ycell = 0., Zcell = 0.; + if (num_segments > 1) { + Xcell = static_cast(i_old) + shift + 0.5*(1.-dirX_sign); + Ycell = static_cast(j_old) + shift + 0.5*(1.-dirY_sign); + Zcell = static_cast(k_old) + shift + 0.5*(1.-dirZ_sign); + } + + // loop over the number of segments and deposit + Compute_shape_factor< depos_order-1 > compute_shape_factor_cell; + Compute_shape_factor_pair< depos_order > compute_shape_factors_node; + double dxp_seg, dyp_seg, dzp_seg; + double x0_new, y0_new, z0_new; + double x0_old = x_old; + double y0_old = y_old; + double z0_old = z_old; + + for (int ns=0; ns(dxp == 0. ? 1. : dxp_seg/dxp); + const auto seg_factor_y = static_cast(dyp == 0. ? 1. : dyp_seg/dyp); + const auto seg_factor_z = static_cast(dzp == 0. ? 1. 
: dzp_seg/dzp); + + // compute cell-based weights using the average segment position + double sx_cell[depos_order] = {0.}; + double sy_cell[depos_order] = {0.}; + double sz_cell[depos_order] = {0.}; + double const x0_bar = (x0_new + x0_old)/2.0; + double const y0_bar = (y0_new + y0_old)/2.0; + double const z0_bar = (z0_new + z0_old)/2.0; + const int i0_cell = compute_shape_factor_cell( sx_cell, x0_bar-0.5 ); + const int j0_cell = compute_shape_factor_cell( sy_cell, y0_bar-0.5 ); + const int k0_cell = compute_shape_factor_cell( sz_cell, z0_bar-0.5 ); + + if constexpr (depos_order >= 3) { // higher-order correction to the cell-based weights + Compute_shape_factor_pair compute_shape_factors_cell; + double sx_old_cell[depos_order] = {0.}; + double sx_new_cell[depos_order] = {0.}; + double sy_old_cell[depos_order] = {0.}; + double sy_new_cell[depos_order] = {0.}; + double sz_old_cell[depos_order] = {0.}; + double sz_new_cell[depos_order] = {0.}; + const int i0_cell_2 = compute_shape_factors_cell( sx_old_cell, sx_new_cell, x0_old-0.5, x0_new-0.5 ); + const int j0_cell_2 = compute_shape_factors_cell( sy_old_cell, sy_new_cell, y0_old-0.5, y0_new-0.5 ); + const int k0_cell_2 = compute_shape_factors_cell( sz_old_cell, sz_new_cell, z0_old-0.5, z0_new-0.5 ); + ignore_unused(i0_cell_2, j0_cell_2, k0_cell_2); + for (int m=0; m compute_shape_factor_B; + double sz_bar_node[depos_order_B+1] = {0.}; + double sz_bar_cell[depos_order_B+1] = {0.}; + const int k_bar_node = compute_shape_factor_B(sz_bar_node, z_bar); + const int k_bar_cell = compute_shape_factor_B(sz_bar_cell, z_bar-0.5); + double sy_bar_node[depos_order_B+1] = {0.}; + double sy_bar_cell[depos_order_B+1] = {0.}; + const int j_bar_node = compute_shape_factor_B(sy_bar_node, y_bar); + const int j_bar_cell = compute_shape_factor_B(sy_bar_cell, y_bar-0.5); + double sx_bar_node[depos_order_B+1] = {0.}; + double sx_bar_cell[depos_order_B+1] = {0.}; + const int i_bar_node = compute_shape_factor_B(sx_bar_node, x_bar); + const int i_bar_cell = compute_shape_factor_B(sx_bar_cell, x_bar-0.5); + + amrex::Real weight; + for (int i=0; i<=depos_order_B; i++) { + for (int j=0; j<=depos_order_B; j++) { + for (int k=0; k<=depos_order_B; k++) { + weight = static_cast(sx_bar_node[i]*sy_bar_cell[j]*sz_bar_cell[k]); + Bxp += Bx_arr(lo.x+i_bar_node+i, lo.y+j_bar_cell+j, lo.z+k_bar_cell+k)*weight; + // + weight = static_cast(sx_bar_cell[i]*sy_bar_node[j]*sz_bar_cell[k]); + Byp += By_arr(lo.x+i_bar_cell+i, lo.y+j_bar_node+j, lo.z+k_bar_cell+k)*weight; + // + weight = static_cast(sx_bar_cell[i]*sy_bar_cell[j]*sz_bar_node[k]); + Bzp += Bz_arr(lo.x+i_bar_cell+i, lo.y+j_bar_cell+j, lo.z+k_bar_node+k)*weight; + } + } + } + +#elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) + + // compute cell crossings in X-direction + const auto i_old = static_cast(x_old-shift); + const auto i_new = static_cast(x_new-shift); + int cell_crossings_x = std::abs(i_new-i_old); + num_segments += cell_crossings_x; + + // compute cell crossings in Z-direction + const auto k_old = static_cast(z_old-shift); + const auto k_new = static_cast(z_new-shift); + int cell_crossings_z = std::abs(k_new-k_old); + num_segments += cell_crossings_z; + + // need to assert that the number of cell crossings in each direction + // is within the range permitted by the number of guard cells + // e.g., if (num_segments > 5) ... + + // compute total change in particle position and the initial cell + // locations in each direction used to find the position at cell crossings. 
+ const double dxp = x_new - x_old; + const double dzp = z_new - z_old; + const auto dirX_sign = static_cast(dxp < 0. ? -1. : 1.); + const auto dirZ_sign = static_cast(dzp < 0. ? -1. : 1.); + double Xcell = 0., Zcell = 0.; + if (num_segments > 1) { + Xcell = static_cast(i_old) + shift + 0.5*(1.-dirX_sign); + Zcell = static_cast(k_old) + shift + 0.5*(1.-dirZ_sign); + } + + // loop over the number of segments and deposit + Compute_shape_factor< depos_order-1 > compute_shape_factor_cell; + Compute_shape_factor_pair< depos_order > compute_shape_factors_node; + double dxp_seg, dzp_seg; + double x0_new, z0_new; + double x0_old = x_old; + double z0_old = z_old; + + for (int ns=0; ns(dxp == 0. ? 1. : dxp_seg/dxp); + const auto seg_factor_z = static_cast(dzp == 0. ? 1. : dzp_seg/dzp); + + // compute cell-based weights using the average segment position + double sx_cell[depos_order] = {0.}; + double sz_cell[depos_order] = {0.}; + double const x0_bar = (x0_new + x0_old)/2.0; + double const z0_bar = (z0_new + z0_old)/2.0; + const int i0_cell = compute_shape_factor_cell(sx_cell, x0_bar-0.5); + const int k0_cell = compute_shape_factor_cell(sz_cell, z0_bar-0.5); + + if constexpr (depos_order >= 3) { // higher-order correction to the cell-based weights + Compute_shape_factor_pair compute_shape_factors_cell; + double sx_old_cell[depos_order] = {0.}; + double sx_new_cell[depos_order] = {0.}; + double sz_old_cell[depos_order] = {0.}; + double sz_new_cell[depos_order] = {0.}; + const int i0_cell_2 = compute_shape_factors_cell( sx_old_cell, sx_new_cell, x0_old-0.5, x0_new-0.5 ); + const int k0_cell_2 = compute_shape_factors_cell( sz_old_cell, sz_new_cell, z0_old-0.5, z0_new-0.5 ); + ignore_unused(i0_cell_2, k0_cell_2); + for (int m=0; m compute_shape_factor_B; + double sz_bar_node[depos_order_B+1] = {0.}; + double sz_bar_cell[depos_order_B+1] = {0.}; + const int k_bar_node = compute_shape_factor_B(sz_bar_node, z_bar); + const int k_bar_cell = compute_shape_factor_B(sz_bar_cell, z_bar-0.5); + double sx_bar_node[depos_order_B+1] = {0.}; + double sx_bar_cell[depos_order_B+1] = {0.}; + const int i_bar_node = compute_shape_factor_B(sx_bar_node, x_bar); + const int i_bar_cell = compute_shape_factor_B(sx_bar_cell, x_bar-0.5); + + for (int i=0; i<=depos_order_B; i++) { + for (int k=0; k<=depos_order_B; k++) { + const auto weight_Bz = static_cast(sx_bar_cell[i]*sz_bar_node[k]); + Bzp += Bz_arr(lo.x+i_bar_cell+i, lo.y+k_bar_node+k, 0, 0)*weight_Bz; + // + const auto weight_Bx = static_cast(sx_bar_node[i]*sz_bar_cell[k]); + Bxp += Bx_arr(lo.x+i_bar_node+i, lo.y+k_bar_cell+k, 0, 0)*weight_Bx; + // + const auto weight_By = static_cast(sx_bar_cell[i]*sz_bar_cell[k]); + Byp += By_arr(lo.x+i_bar_cell+i, lo.y+k_bar_cell+k, 0, 0)*weight_By; +#if defined(WARPX_DIM_RZ) + Complex xy_mid = xy_mid0; // Throughout the following loop, xy_mid takes the value e^{i m theta} + for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) { + const auto dBx = (+ Bx_arr(lo.x+i_bar_node+i, lo.y+k_bar_cell+k, 0, 2*imode-1)*xy_mid.real() + - Bx_arr(lo.x+i_bar_node+i, lo.y+k_bar_cell+k, 0, 2*imode)*xy_mid.imag()); + const auto dBy = (+ By_arr(lo.x+i_bar_cell+i, lo.y+k_bar_cell+k, 0, 2*imode-1)*xy_mid.real() + - By_arr(lo.x+i_bar_cell+i, lo.y+k_bar_cell+k, 0, 2*imode)*xy_mid.imag()); + const auto dBz = (+ Bz_arr(lo.x+i_bar_cell+i, lo.y+k_bar_node+k, 0, 2*imode-1)*xy_mid.real() + - Bz_arr(lo.x+i_bar_cell+i, lo.y+k_bar_node+k, 0, 2*imode)*xy_mid.imag()); + Bxp += weight_Bx*dBx; + Byp += weight_By*dBy; + Bzp += weight_Bz*dBz; + xy_mid = 
xy_mid*xy_mid0; + } +#endif + } + } + +#ifdef WARPX_DIM_RZ + + // Convert Exp and Eyp (which are actually Er and Etheta) to Ex and Ey + const amrex::Real Exp_save = Exp; + Exp = costheta_mid*Exp - sintheta_mid*Eyp; + Eyp = costheta_mid*Eyp + sintheta_mid*Exp_save; + const amrex::Real Bxp_save = Bxp; + Bxp = costheta_mid*Bxp - sintheta_mid*Byp; + Byp = costheta_mid*Byp + sintheta_mid*Bxp_save; + +#endif + +#elif defined(WARPX_DIM_1D_Z) + + // compute cell crossings in Z-direction + const auto k_old = static_cast(z_old-shift); + const auto k_new = static_cast(z_new-shift); + int cell_crossings_z = std::abs(k_new-k_old); + num_segments += cell_crossings_z; + + // need to assert that the number of cell crossings in each direction + // is within the range permitted by the number of guard cells + // e.g., if (num_segments > 3) ... + + // compute dzp and the initial cell location used to find the cell crossings. + double const dzp = z_new - z_old; + const auto dirZ_sign = static_cast(dzp < 0. ? -1. : 1.); + double Zcell = static_cast(k_old) + shift + 0.5*(1.-dirZ_sign); + + // loop over the number of segments and deposit + Compute_shape_factor< depos_order-1 > compute_shape_factor_cell; + Compute_shape_factor_pair< depos_order > compute_shape_factors_node; + double dzp_seg; + double z0_new; + double z0_old = z_old; + + for (int ns=0; ns(dzp == 0. ? 1. : dzp_seg/dzp); + + // compute cell-based weights using the average segment position + double sz_cell[depos_order] = {0.}; + double const z0_bar = (z0_new + z0_old)/2.0; + const int k0_cell = compute_shape_factor_cell( sz_cell, z0_bar-0.5 ); + + if constexpr (depos_order >= 3) { // higher-order correction to the cell-based weights + Compute_shape_factor_pair compute_shape_factors_cell; + double sz_old_cell[depos_order] = {0.}; + double sz_new_cell[depos_order] = {0.}; + const int k0_cell_2 = compute_shape_factors_cell( sz_old_cell, sz_new_cell, z0_old-0.5, z0_new-0.5 ); + ignore_unused(k0_cell_2); + for (int m=0; m compute_shape_factor_B; + double sz_bar_node[depos_order_B+1] = {0.}; + double sz_bar_cell[depos_order_B+1] = {0.}; + const int k_bar_node = compute_shape_factor_B(sz_bar_node, z_bar); + const int k_bar_cell = compute_shape_factor_B(sz_bar_cell, z_bar-0.5_rt); + + amrex::Real weight; + for (int k=0; k<=depos_order_B; k++) { + weight = static_cast(sz_bar_node[k]); + Bzp += Bz_arr(lo.x+k_bar_node+k, 0, 0)*weight; + // + weight = static_cast(sz_bar_cell[k]); + Bxp += Bx_arr(lo.x+k_bar_cell+k, 0, 0)*weight; + Byp += By_arr(lo.x+k_bar_cell+k, 0, 0)*weight; + } + +#endif +} + /** * \brief Field gather for particles * @@ -1052,6 +1727,7 @@ void doGatherShapeN (const amrex::ParticleReal xp, * \param lo Index lower bounds of domain. 
* \param n_rz_azimuthal_modes Number of azimuthal modes when using RZ geometry * \param nox order of the particle shape function + * \param depos_type integer identifier for which algorithm to use * \param galerkin_interpolation whether to use lower order in v */ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE @@ -1085,9 +1761,9 @@ void doGatherShapeNImplicit ( const amrex::Dim3& lo, const int n_rz_azimuthal_modes, const int nox, - const bool galerkin_interpolation) + const int depos_type ) { - if (galerkin_interpolation) { + if (depos_type==0) { // CurrentDepositionAlgo::Esirkepov if (nox == 1) { doGatherShapeNEsirkepovStencilImplicit<1>(xp_n, yp_n, zp_n, xp_nph, yp_nph, zp_nph, Exp, Eyp, Ezp, Bxp, Byp, Bzp, @@ -1107,7 +1783,35 @@ void doGatherShapeNImplicit ( ex_type, ey_type, ez_type, bx_type, by_type, bz_type, dx_arr, xyzmin_arr, lo, n_rz_azimuthal_modes); } - } else { + } + else if (depos_type==3) { // CurrentDepositionAlgo::Villasenor + if (nox == 1) { + doGatherPicnicShapeN<1>(xp_n, yp_n, zp_n, xp_nph, yp_nph, zp_nph, + Exp, Eyp, Ezp, Bxp, Byp, Bzp, + ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr, + ex_type, ey_type, ez_type, bx_type, by_type, bz_type, + dx_arr, xyzmin_arr, lo, n_rz_azimuthal_modes); + } else if (nox == 2) { + doGatherPicnicShapeN<2>(xp_n, yp_n, zp_n, xp_nph, yp_nph, zp_nph, + Exp, Eyp, Ezp, Bxp, Byp, Bzp, + ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr, + ex_type, ey_type, ez_type, bx_type, by_type, bz_type, + dx_arr, xyzmin_arr, lo, n_rz_azimuthal_modes); + } else if (nox == 3) { + doGatherPicnicShapeN<3>(xp_n, yp_n, zp_n, xp_nph, yp_nph, zp_nph, + Exp, Eyp, Ezp, Bxp, Byp, Bzp, + ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr, + ex_type, ey_type, ez_type, bx_type, by_type, bz_type, + dx_arr, xyzmin_arr, lo, n_rz_azimuthal_modes); + } else if (nox == 4) { + doGatherPicnicShapeN<4>(xp_n, yp_n, zp_n, xp_nph, yp_nph, zp_nph, + Exp, Eyp, Ezp, Bxp, Byp, Bzp, + ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr, + ex_type, ey_type, ez_type, bx_type, by_type, bz_type, + dx_arr, xyzmin_arr, lo, n_rz_azimuthal_modes); + } + } + else if (depos_type==1) { // CurrentDepositionAlgo::Direct if (nox == 1) { doGatherShapeN<1,0>(xp_nph, yp_nph, zp_nph, Exp, Eyp, Ezp, Bxp, Byp, Bzp, ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr, diff --git a/Source/Particles/PhysicalParticleContainer.cpp b/Source/Particles/PhysicalParticleContainer.cpp index e39cd79b55d..929c3c26649 100644 --- a/Source/Particles/PhysicalParticleContainer.cpp +++ b/Source/Particles/PhysicalParticleContainer.cpp @@ -2984,7 +2984,7 @@ PhysicalParticleContainer::ImplicitPushXP (WarpXParIter& pti, const Dim3 lo = lbound(box); - bool galerkin_interpolation = WarpX::galerkin_interpolation; + int depos_type = WarpX::current_deposition_algo; int nox = WarpX::nox; int n_rz_azimuthal_modes = WarpX::n_rz_azimuthal_modes; @@ -3107,8 +3107,8 @@ PhysicalParticleContainer::ImplicitPushXP (WarpXParIter& pti, doGatherShapeNImplicit(xp_n, yp_n, zp_n, xp, yp, zp, Exp, Eyp, Ezp, Bxp, Byp, Bzp, ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr, ex_type, ey_type, ez_type, bx_type, by_type, bz_type, - dx_arr, xyzmin_arr, lo, n_rz_azimuthal_modes, - nox, galerkin_interpolation); + dx_arr, xyzmin_arr, lo, n_rz_azimuthal_modes, nox, + depos_type ); } // Externally applied E and B-field in Cartesian co-ordinates diff --git a/Source/Particles/ShapeFactors.H b/Source/Particles/ShapeFactors.H index a0b4ed63a30..73e8f7243bb 100644 --- a/Source/Particles/ShapeFactors.H +++ b/Source/Particles/ShapeFactors.H @@ -64,6 +64,19 @@ struct Compute_shape_factor //
index of the leftmost cell where particle deposits return j-1; } + else if constexpr (depos_order == 4){ + const auto j = static_cast(xmid + T(0.5)); + const T xint = xmid - T(j); + const T xint_p1 = xint + T(1.0); + const T xint_m1 = xint - T(1.0); + sx[0] = T(1.0)/T(384.0)*(T(1.0) - T(2.0)*xint)*(T(1.0) - T(2.0)*xint)*(T(1.0) - T(2.0)*xint)*(T(1.0) - T(2.0)*xint); + sx[1] = T(1.0)/T(96.0)*(T(55.0) + T(4.0)*xint_p1*(T(5.0) - T(2.0)*xint_p1*(T(15.0) + T(2.0)*xint_p1*(xint_p1 - T(5.0))))); + sx[2] = T(115.0)/T(192.0) + xint*xint*(xint*xint/T(4.0) - T(5.0)/T(8.0)); + sx[3] = T(1.0)/T(96.0)*(T(55.0) - T(4.0)*xint_m1*(T(5.0) + T(2.0)*xint_m1*(T(15.0) - T(2.0)*xint_m1*(-xint_m1 - T(5.0))))); + sx[4] = T(1.0)/T(384.0)*(T(1.0) + T(2.0)*xint)*(T(1.0) + T(2.0)*xint)*(T(1.0) + T(2.0)*xint)*(T(1.0) + T(2.0)*xint); + // index of the leftmost cell where particle deposits + return j-2; + } else{ WARPX_ABORT_WITH_MESSAGE("Unknown particle shape selected in Compute_shape_factor"); amrex::ignore_unused(sx, xmid); @@ -132,4 +145,96 @@ struct Compute_shifted_shape_factor } }; +/** + * Compute shape factors for two positions that are within + * half a grid cell of the same cell interface and return the common + * index of the leftmost cell where particle writes, which is correctly + * determined by the average of the positions. + * This is used for computing the segment weights transverse to the + * current density direction in the Villasenor deposition algorithm. + */ +template +struct Compute_shape_factor_pair +{ + template< typename T > + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + int operator()( + T* const sx_old, + T* const sx_new, + T xold, + T xnew) const + { + const T xmid = T(0.5)*(xnew + xold); + if constexpr (depos_order == 1){ + const auto j = static_cast(xmid); + const T xint_old = xold - T(j); + sx_old[0] = T(1.0) - xint_old; + sx_old[1] = xint_old; + // + const T xint_new = xnew - T(j); + sx_new[0] = T(1.0) - xint_new; + sx_new[1] = xint_new; + return j; + } + else if constexpr (depos_order == 2){ + const auto j = static_cast(xmid + T(0.5)); + const T xint_old = xold - T(j); + sx_old[0] = T(0.5)*(T(0.5) - xint_old)*(T(0.5) - xint_old); + sx_old[1] = T(0.75) - xint_old*xint_old; + sx_old[2] = T(0.5)*(T(0.5) + xint_old)*(T(0.5) + xint_old); + // + const T xint_new = xnew - T(j); + sx_new[0] = T(0.5)*(T(0.5) - xint_new)*(T(0.5) - xint_new); + sx_new[1] = T(0.75) - xint_new*xint_new; + sx_new[2] = T(0.5)*(T(0.5) + xint_new)*(T(0.5) + xint_new); + // index of the leftmost cell where particle deposits + return j-1; + } + else if constexpr (depos_order == 3){ + const auto j = static_cast(xmid); + const T xint_old = xold - T(j); + sx_old[0] = T(1.0)/T(6.0)*(T(1.0) - xint_old)*(T(1.0) - xint_old)*(T(1.0) - xint_old); + sx_old[1] = T(2.0)/T(3.0) - xint_old*xint_old*(T(1.0) - xint_old/(T(2.0))); + sx_old[2] = T(2.0)/T(3.0) - (T(1.0) - xint_old)*(T(1.0) - xint_old)*(T(1.0) - T(0.5)*(T(1.0) - xint_old)); + sx_old[3] = T(1.0)/T(6.0)*xint_old*xint_old*xint_old; + // + const T xint_new = xnew - T(j); + sx_new[0] = T(1.0)/T(6.0)*(T(1.0) - xint_new)*(T(1.0) - xint_new)*(T(1.0) - xint_new); + sx_new[1] = T(2.0)/T(3.0) - xint_new*xint_new*(T(1.0) - xint_new/(T(2.0))); + sx_new[2] = T(2.0)/T(3.0) - (T(1.0) - xint_new)*(T(1.0) - xint_new)*(T(1.0) - T(0.5)*(T(1.0) - xint_new)); + sx_new[3] = T(1.0)/T(6.0)*xint_new*xint_new*xint_new; + // index of the leftmost cell where particle deposits + return j-1; + } + else if constexpr (depos_order == 4){ + const auto j = static_cast(xmid + T(0.5)); + const T xint_old = xold - 
T(j); + T xint_p1 = xint_old + T(1.0); + T xint_m1 = xint_old - T(1.0); + sx_old[0] = T(1.0)/T(384.0)*(T(1.0) - T(2.0)*xint_old)*(T(1.0) - T(2.0)*xint_old)*(T(1.0) - T(2.0)*xint_old)*(T(1.0) - T(2.0)*xint_old); + sx_old[1] = T(1.0)/T(96.0)*(T(55.0) + T(4.0)*xint_p1*(T(5.0) - T(2.0)*xint_p1*(T(15.0) + T(2.0)*xint_p1*(xint_p1 - T(5.0))))); + sx_old[2] = T(115.0)/T(192.0) + xint_old*xint_old*(xint_old*xint_old/T(4.0) - T(5.0)/T(8.0)); + sx_old[3] = T(1.0)/T(96.0)*(T(55.0) - T(4.0)*xint_m1*(T(5.0) + T(2.0)*xint_m1*(T(15.0) - T(2.0)*xint_m1*(-xint_m1 - T(5.0))))); + sx_old[4] = T(1.0)/T(384.0)*(T(1.0) + T(2.0)*xint_old)*(T(1.0) + T(2.0)*xint_old)*(T(1.0) + T(2.0)*xint_old)*(T(1.0) + T(2.0)*xint_old); + // + const T xint_new = xnew - T(j); + xint_p1 = xint_new + T(1.0); + xint_m1 = xint_new - T(1.0); + sx_new[0] = T(1.0)/T(384.0)*(T(1.0) - T(2.0)*xint_new)*(T(1.0) - T(2.0)*xint_new)*(T(1.0) - T(2.0)*xint_new)*(T(1.0) - T(2.0)*xint_new); + sx_new[1] = T(1.0)/T(96.0)*(T(55.0) + T(4.0)*xint_p1*(T(5.0) - T(2.0)*xint_p1*(T(15.0) + T(2.0)*xint_p1*(xint_p1 - T(5.0))))); + sx_new[2] = T(115.0)/T(192.0) + xint_new*xint_new*(xint_new*xint_new/T(4.0) - T(5.0)/T(8.0)); + sx_new[3] = T(1.0)/T(96.0)*(T(55.0) - T(4.0)*xint_m1*(T(5.0) + T(2.0)*xint_m1*(T(15.0) - T(2.0)*xint_m1*(-xint_m1 - T(5.0))))); + sx_new[4] = T(1.0)/T(384.0)*(T(1.0) + T(2.0)*xint_new)*(T(1.0) + T(2.0)*xint_new)*(T(1.0) + T(2.0)*xint_new)*(T(1.0) + T(2.0)*xint_new); + // + // index of the leftmost cell where particle deposits + return j-2; + } + else{ + WARPX_ABORT_WITH_MESSAGE("Unknown particle shape selected in Compute_shape_factor_pair"); + amrex::ignore_unused(sx_old, sx_new, xold, xnew); + } + return 0; + } +}; + #endif // SHAPEFACTORS_H_ diff --git a/Source/Particles/WarpXParticleContainer.cpp b/Source/Particles/WarpXParticleContainer.cpp index 85bb1c3f4b8..a395198e361 100644 --- a/Source/Particles/WarpXParticleContainer.cpp +++ b/Source/Particles/WarpXParticleContainer.cpp @@ -452,9 +452,10 @@ WarpXParticleContainer::DepositCurrent (WarpXParIter& pti, // Take into account Galilean shift const std::array& xyzmin = WarpX::LowerCorner(tilebox, depos_lev, 0.5_rt*dt); - if (WarpX::current_deposition_algo == CurrentDepositionAlgo::Esirkepov) { + if (WarpX::current_deposition_algo == CurrentDepositionAlgo::Esirkepov || + WarpX::current_deposition_algo == CurrentDepositionAlgo::Villasenor) { if (WarpX::grid_type == GridType::Collocated) { - WARPX_ABORT_WITH_MESSAGE("The Esirkepov algorithm cannot be used with a collocated grid."); + WARPX_ABORT_WITH_MESSAGE("Charge-conserving current depositions (Esirkepov and Villasenor) cannot be used with a collocated grid."); } } @@ -517,6 +518,9 @@ WarpXParticleContainer::DepositCurrent (WarpXParIter& pti, if (WarpX::current_deposition_algo == CurrentDepositionAlgo::Esirkepov) { WARPX_ABORT_WITH_MESSAGE("Cannot do shared memory deposition with Esirkepov algorithm"); } + else if (WarpX::current_deposition_algo == CurrentDepositionAlgo::Villasenor) { + WARPX_ABORT_WITH_MESSAGE("Cannot do shared memory deposition with Villasenor algorithm"); + } else if (WarpX::current_deposition_algo == CurrentDepositionAlgo::Vay) { WARPX_ABORT_WITH_MESSAGE("Cannot do shared memory deposition with Vay algorithm"); } @@ -525,21 +529,21 @@ WarpXParticleContainer::DepositCurrent (WarpXParIter& pti, if (WarpX::nox == 1){ doDepositionSharedShapeN<1>( GetPosition, wp.dataPtr() + offset, uxp.dataPtr() + offset, - uyp.dataPtr() + offset, uzp.dataPtr() + offset, ion_lev, + uyp.dataPtr() + offset, uzp.dataPtr() + offset, ion_lev, 
jx_fab, jy_fab, jz_fab, np_to_deposit, relative_time, dx, xyzmin, lo, q, WarpX::n_rz_azimuthal_modes, cost, WarpX::load_balance_costs_update_algo, bins, box, geom, max_tbox_size); } else if (WarpX::nox == 2){ doDepositionSharedShapeN<2>( GetPosition, wp.dataPtr() + offset, uxp.dataPtr() + offset, - uyp.dataPtr() + offset, uzp.dataPtr() + offset, ion_lev, + uyp.dataPtr() + offset, uzp.dataPtr() + offset, ion_lev, jx_fab, jy_fab, jz_fab, np_to_deposit, relative_time, dx, xyzmin, lo, q, WarpX::n_rz_azimuthal_modes, cost, WarpX::load_balance_costs_update_algo, bins, box, geom, max_tbox_size); } else if (WarpX::nox == 3){ doDepositionSharedShapeN<3>( GetPosition, wp.dataPtr() + offset, uxp.dataPtr() + offset, - uyp.dataPtr() + offset, uzp.dataPtr() + offset, ion_lev, + uyp.dataPtr() + offset, uzp.dataPtr() + offset, ion_lev, jx_fab, jy_fab, jz_fab, np_to_deposit, relative_time, dx, xyzmin, lo, q, WarpX::n_rz_azimuthal_modes, cost, WarpX::load_balance_costs_update_algo, bins, box, geom, max_tbox_size); @@ -620,6 +624,66 @@ WarpXParticleContainer::DepositCurrent (WarpXParIter& pti, WarpX::load_balance_costs_update_algo); } } + } else if (WarpX::current_deposition_algo == CurrentDepositionAlgo::Villasenor) { + if (push_type == PushType::Implicit) { +#if (AMREX_SPACEDIM >= 2) + auto& xp_n = pti.GetAttribs(particle_comps["x_n"]); + const ParticleReal* xp_n_data = xp_n.dataPtr() + offset; +#else + const ParticleReal* xp_n_data = nullptr; +#endif +#if defined(WARPX_DIM_3D) || defined(WARPX_DIM_RZ) + auto& yp_n = pti.GetAttribs(particle_comps["y_n"]); + const ParticleReal* yp_n_data = yp_n.dataPtr() + offset; +#else + const ParticleReal* yp_n_data = nullptr; +#endif + auto& zp_n = pti.GetAttribs(particle_comps["z_n"]); + const ParticleReal* zp_n_data = zp_n.dataPtr() + offset; + auto& uxp_n = pti.GetAttribs(particle_comps["ux_n"]); + auto& uyp_n = pti.GetAttribs(particle_comps["uy_n"]); + auto& uzp_n = pti.GetAttribs(particle_comps["uz_n"]); + if (WarpX::nox == 1){ + doVillasenorDepositionShapeNImplicit<1>( + xp_n_data, yp_n_data, zp_n_data, + GetPosition, wp.dataPtr() + offset, + uxp_n.dataPtr() + offset, uyp_n.dataPtr() + offset, uzp_n.dataPtr() + offset, + uxp.dataPtr() + offset, uyp.dataPtr() + offset, uzp.dataPtr() + offset, ion_lev, + jx_arr, jy_arr, jz_arr, np_to_deposit, dt, dx, xyzmin, lo, q, + WarpX::n_rz_azimuthal_modes, cost, + WarpX::load_balance_costs_update_algo); + } else if (WarpX::nox == 2){ + doVillasenorDepositionShapeNImplicit<2>( + xp_n_data, yp_n_data, zp_n_data, + GetPosition, wp.dataPtr() + offset, + uxp_n.dataPtr() + offset, uyp_n.dataPtr() + offset, uzp_n.dataPtr() + offset, + uxp.dataPtr() + offset, uyp.dataPtr() + offset, uzp.dataPtr() + offset, ion_lev, + jx_arr, jy_arr, jz_arr, np_to_deposit, dt, dx, xyzmin, lo, q, + WarpX::n_rz_azimuthal_modes, cost, + WarpX::load_balance_costs_update_algo); + } else if (WarpX::nox == 3){ + doVillasenorDepositionShapeNImplicit<3>( + xp_n_data, yp_n_data, zp_n_data, + GetPosition, wp.dataPtr() + offset, + uxp_n.dataPtr() + offset, uyp_n.dataPtr() + offset, uzp_n.dataPtr() + offset, + uxp.dataPtr() + offset, uyp.dataPtr() + offset, uzp.dataPtr() + offset, ion_lev, + jx_arr, jy_arr, jz_arr, np_to_deposit, dt, dx, xyzmin, lo, q, + WarpX::n_rz_azimuthal_modes, cost, + WarpX::load_balance_costs_update_algo); + } else if (WarpX::nox == 4){ + doVillasenorDepositionShapeNImplicit<4>( + xp_n_data, yp_n_data, zp_n_data, + GetPosition, wp.dataPtr() + offset, + uxp_n.dataPtr() + offset, uyp_n.dataPtr() + offset, uzp_n.dataPtr() + offset, + 
uxp.dataPtr() + offset, uyp.dataPtr() + offset, uzp.dataPtr() + offset, ion_lev, + jx_arr, jy_arr, jz_arr, np_to_deposit, dt, dx, xyzmin, lo, q, + WarpX::n_rz_azimuthal_modes, cost, + WarpX::load_balance_costs_update_algo); + } + } + else { + WARPX_ABORT_WITH_MESSAGE("The Villasenor algorithm can only be used with implicit algorithm."); + } } else if (WarpX::current_deposition_algo == CurrentDepositionAlgo::Vay) { if (push_type == PushType::Implicit) { WARPX_ABORT_WITH_MESSAGE("The Vay algorithm cannot be used with implicit algorithm."); diff --git a/Source/Utils/WarpXAlgorithmSelection.H b/Source/Utils/WarpXAlgorithmSelection.H index 87db0ae9b9b..735fc7993f1 100644 --- a/Source/Utils/WarpXAlgorithmSelection.H +++ b/Source/Utils/WarpXAlgorithmSelection.H @@ -87,7 +87,8 @@ struct CurrentDepositionAlgo { enum { Esirkepov = 0, Direct = 1, - Vay = 2 + Vay = 2, + Villasenor = 3 }; }; diff --git a/Source/Utils/WarpXAlgorithmSelection.cpp b/Source/Utils/WarpXAlgorithmSelection.cpp index 75c488134e0..abaf17f0a2c 100644 --- a/Source/Utils/WarpXAlgorithmSelection.cpp +++ b/Source/Utils/WarpXAlgorithmSelection.cpp @@ -63,10 +63,11 @@ const std::map particle_pusher_algo_to_int = { }; const std::map current_deposition_algo_to_int = { - {"esirkepov", CurrentDepositionAlgo::Esirkepov }, - {"direct", CurrentDepositionAlgo::Direct }, - {"vay", CurrentDepositionAlgo::Vay }, - {"default", CurrentDepositionAlgo::Esirkepov } // NOTE: overwritten for PSATD and Hybrid-PIC below + {"esirkepov", CurrentDepositionAlgo::Esirkepov }, + {"direct", CurrentDepositionAlgo::Direct }, + {"vay", CurrentDepositionAlgo::Vay }, + {"villasenor", CurrentDepositionAlgo::Villasenor }, + {"default", CurrentDepositionAlgo::Esirkepov } // NOTE: overwritten for PSATD and Hybrid-PIC below }; const std::map charge_deposition_algo_to_int = { diff --git a/Source/WarpX.H b/Source/WarpX.H index e8c7ae79f7e..d3900c91593 100644 --- a/Source/WarpX.H +++ b/Source/WarpX.H @@ -156,7 +156,7 @@ public: int maxlevel_extEMfield_init; // Algorithms - //! Integer that corresponds to the current deposition algorithm (Esirkepov, direct, Vay) + //! Integer that corresponds to the current deposition algorithm (Esirkepov, direct, Vay, Villasenor) static short current_deposition_algo; //! Integer that corresponds to the charge deposition algorithm (only standard deposition) static short charge_deposition_algo; diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp index 92623bc2a06..5031381561f 100644 --- a/Source/WarpX.cpp +++ b/Source/WarpX.cpp @@ -1157,6 +1157,12 @@ WarpX::ReadParameters () "Current centering (nodal deposition) cannot be used with Esirkepov deposition." "Please set warpx.do_current_centering = 0 or algo.current_deposition = direct."); + WARPX_ALWAYS_ASSERT_WITH_MESSAGE( + current_deposition_algo != CurrentDepositionAlgo::Villasenor || + !do_current_centering, + "Current centering (nodal deposition) cannot be used with Villasenor deposition." 
+ "Please set warpx.do_current_centering = 0 or algo.current_deposition = direct."); + WARPX_ALWAYS_ASSERT_WITH_MESSAGE( WarpX::current_deposition_algo != CurrentDepositionAlgo::Vay || !do_current_centering, @@ -1179,6 +1185,14 @@ WarpX::ReadParameters () "Vay deposition not implemented with multi-J algorithm"); } + if (current_deposition_algo == CurrentDepositionAlgo::Villasenor) { + WARPX_ALWAYS_ASSERT_WITH_MESSAGE( + evolve_scheme == EvolveScheme::ImplicitPicard || + evolve_scheme == EvolveScheme::SemiImplicitPicard, + "Villasenor current deposition can only" + "be used with Implicit evolve schemes."); + } + // Query algo.field_gathering from input, set field_gathering_algo to // "default" if not found (default defined in Utils/WarpXAlgorithmSelection.cpp) field_gathering_algo = static_cast(GetAlgorithmInteger(pp_algo, "field_gathering")); @@ -1243,8 +1257,9 @@ WarpX::ReadParameters () WARPX_ALWAYS_ASSERT_WITH_MESSAGE( current_deposition_algo == CurrentDepositionAlgo::Esirkepov || + current_deposition_algo == CurrentDepositionAlgo::Villasenor || current_deposition_algo == CurrentDepositionAlgo::Direct, - "Only Esirkepov or Direct current deposition supported with the implicit and semi-implicit schemes"); + "Only Esirkepov, Villasenor, or Direct current deposition supported with the implicit and semi-implicit schemes"); WARPX_ALWAYS_ASSERT_WITH_MESSAGE( electromagnetic_solver_id == ElectromagneticSolverAlgo::Yee || @@ -1259,18 +1274,6 @@ WarpX::ReadParameters () WARPX_ALWAYS_ASSERT_WITH_MESSAGE( field_gathering_algo != GatheringAlgo::MomentumConserving, "With implicit and semi-implicit schemes, the momentum conserving field gather is not supported as it would not conserve energy"); - - if (current_deposition_algo == CurrentDepositionAlgo::Direct) { - WARPX_ALWAYS_ASSERT_WITH_MESSAGE( - !galerkin_interpolation, - "With implicit and semi-implicit schemes and direct deposition, the Galerkin field gathering must be turned off in order to conserve energy"); - } - - if (current_deposition_algo == CurrentDepositionAlgo::Esirkepov) { - WARPX_ALWAYS_ASSERT_WITH_MESSAGE( - galerkin_interpolation, - "With implicit and semi-implicit schemes and Esirkepov deposition, the Galerkin field gathering must be turned on in order to conserve energy"); - } } // Load balancing parameters @@ -1325,10 +1328,18 @@ WarpX::ReadParameters () if (!species_names.empty() || !lasers_names.empty()) { if (utils::parser::queryWithParser(pp_algo, "particle_shape", particle_shape)){ - WARPX_ALWAYS_ASSERT_WITH_MESSAGE( - (particle_shape >= 1) && (particle_shape <=3), - "algo.particle_shape can be only 1, 2, or 3" - ); + if(current_deposition_algo == CurrentDepositionAlgo::Villasenor) { + WARPX_ALWAYS_ASSERT_WITH_MESSAGE( + (particle_shape >= 1) && (particle_shape <=4), + "algo.particle_shape can be only 1, 2, 3, or 4 with villasenor deposition" + ); + } + else { + WARPX_ALWAYS_ASSERT_WITH_MESSAGE( + (particle_shape >= 1) && (particle_shape <=3), + "algo.particle_shape can be only 1, 2, or 3" + ); + } nox = particle_shape; noy = particle_shape; @@ -1337,7 +1348,8 @@ WarpX::ReadParameters () else{ WARPX_ABORT_WITH_MESSAGE( "algo.particle_shape must be set in the input file:" - " please set algo.particle_shape to 1, 2, or 3"); + " please set algo.particle_shape to 1, 2, or 3." 
+ " if using the villasenor deposition, can use 4 also."); } if ((maxLevel() > 0) && (particle_shape > 1) && (do_pml_j_damping == 1)) @@ -1481,6 +1493,7 @@ WarpX::ReadParameters () // are used current_correction = true; if (WarpX::current_deposition_algo == CurrentDepositionAlgo::Esirkepov || + WarpX::current_deposition_algo == CurrentDepositionAlgo::Villasenor || WarpX::current_deposition_algo == CurrentDepositionAlgo::Vay || WarpX::do_dive_cleaning) { @@ -1495,6 +1508,7 @@ WarpX::ReadParameters () if (!current_correction && current_deposition_algo != CurrentDepositionAlgo::Esirkepov && + current_deposition_algo != CurrentDepositionAlgo::Villasenor && current_deposition_algo != CurrentDepositionAlgo::Vay) { ablastr::warn_manager::WMRecordWarning( @@ -1585,14 +1599,15 @@ WarpX::ReadParameters () ); - if (current_deposition_algo == CurrentDepositionAlgo::Esirkepov) { + if (current_deposition_algo == CurrentDepositionAlgo::Esirkepov || + current_deposition_algo == CurrentDepositionAlgo::Villasenor) { // The comoving PSATD algorithm is not implemented nor tested with Esirkepov current deposition WARPX_ALWAYS_ASSERT_WITH_MESSAGE(v_comoving_is_zero, - "Esirkepov current deposition cannot be used with the comoving PSATD algorithm"); + "charge-conserving current depositions (Esirkepov and Villasenor) cannot be used with the comoving PSATD algorithm"); WARPX_ALWAYS_ASSERT_WITH_MESSAGE(v_galilean_is_zero, - "Esirkepov current deposition cannot be used with the Galilean algorithm."); + "charge-conserving current depositions (Esirkepov and Villasenor) cannot be used with the Galilean algorithm."); } WARPX_ALWAYS_ASSERT_WITH_MESSAGE( diff --git a/Source/ablastr/particles/DepositCharge.H b/Source/ablastr/particles/DepositCharge.H index ad01ba4a213..f43e35c6b0b 100644 --- a/Source/ablastr/particles/DepositCharge.H +++ b/Source/ablastr/particles/DepositCharge.H @@ -195,6 +195,11 @@ deposit_charge (typename T_PC::ParIterType& pti, rho_fab, np_to_deposit.value(), dx, xyzmin, lo, charge, n_rz_azimuthal_modes, cost, load_balance_costs_update_algo); + } else if (nox == 4){ + doChargeDepositionShapeN<4>(GetPosition, wp.dataPtr()+offset, ion_lev, + rho_fab, np_to_deposit.value(), dx, xyzmin, lo, charge, + n_rz_azimuthal_modes, cost, + load_balance_costs_update_algo); } ABLASTR_PROFILE_VAR_STOP(blp_ppc_chd, do_device_synchronize); From 6e332e9479baa2769ff0ac22adb51c25c67627da Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Fri, 2 Feb 2024 14:21:51 -0800 Subject: [PATCH 09/13] Particle Container to Pure SoA Again (#4653) * AMReX & pyAMReX: Latest `development` More pure SoA and id handling goodness. * Particle Container to Pure SoA Again Transition to new, purely SoA particle containers. This was originally merged in #3850 and reverted in #4652, since we discovered issues loosing particles & laser particles on GPU. 
* Modernize `idcpu` Treatment - faster: less emitted operations, no jumps - cheaper: less used registers - safer: no read-before-write warnings - cooler: no explanation needed --- .github/workflows/cuda.yml | 2 +- Docs/source/developers/amrex_basics.rst | 2 +- Docs/source/developers/dimensionality.rst | 4 +- Docs/source/developers/particles.rst | 18 +- Docs/source/usage/workflows/python_extend.rst | 28 +- .../particle_data_python/PICMI_inputs_2d.py | 6 +- .../PICMI_inputs_prev_pos_2d.py | 6 +- .../PICMI_inputs_runtime_component_analyze.py | 6 +- Python/pywarpx/_libwarpx.py | 6 +- Python/pywarpx/particle_containers.py | 255 ++++++++---------- Regression/WarpX-GPU-tests.ini | 2 +- Regression/WarpX-tests.ini | 2 +- .../BackTransformParticleFunctor.H | 16 +- .../FlushFormats/FlushFormatAscent.cpp | 3 - .../FlushFormats/FlushFormatCheckpoint.cpp | 9 +- .../FlushFormats/FlushFormatPlotfile.cpp | 13 +- Source/Diagnostics/ParticleIO.cpp | 2 +- .../Diagnostics/ReducedDiags/FieldProbe.cpp | 6 +- .../FieldProbeParticleContainer.H | 20 +- .../FieldProbeParticleContainer.cpp | 34 +-- .../ReducedDiags/LoadBalanceCosts.cpp | 3 +- Source/Diagnostics/WarpXOpenPMD.H | 4 +- Source/Diagnostics/WarpXOpenPMD.cpp | 139 ++-------- .../ParticleBoundaryProcess.H | 7 +- Source/EmbeddedBoundary/ParticleScraper.H | 7 +- .../BinaryCollision/BinaryCollision.H | 16 +- .../Coulomb/PairWiseCoulombCollisionFunc.H | 7 +- .../Collision/BinaryCollision/DSMC/DSMC.H | 3 +- .../DSMC/SplitAndScatterFunc.H | 28 +- .../NuclearFusion/NuclearFusionFunc.H | 14 +- .../ProtonBoronFusionInitializeMomentum.H | 10 +- .../TwoProductFusionInitializeMomentum.H | 4 +- .../BinaryCollision/ParticleCreationFunc.H | 59 ++-- .../BinaryCollision/ShuffleFisherYates.H | 2 +- .../Particles/Deposition/ChargeDeposition.H | 2 +- .../Particles/Deposition/CurrentDeposition.H | 2 +- .../ElementaryProcess/QEDPairGeneration.H | 2 +- .../ElementaryProcess/QEDPhotonEmission.H | 16 +- Source/Particles/LaserParticleContainer.H | 7 +- .../NamedComponentParticleContainer.H | 52 ++-- Source/Particles/ParticleBoundaryBuffer.cpp | 4 +- Source/Particles/ParticleCreation/SmartCopy.H | 5 +- .../Particles/ParticleCreation/SmartCreate.H | 21 +- .../Particles/ParticleCreation/SmartUtils.H | 8 +- Source/Particles/PhysicalParticleContainer.H | 8 +- .../Particles/PhysicalParticleContainer.cpp | 131 ++++----- Source/Particles/Pusher/GetAndSetPosition.H | 152 +++++++---- .../Particles/Resampling/LevelingThinning.cpp | 5 +- Source/Particles/Sorting/Partition.cpp | 4 +- Source/Particles/Sorting/SortingUtils.H | 41 ++- Source/Particles/Sorting/SortingUtils.cpp | 2 +- Source/Particles/WarpXParticleContainer.H | 27 +- Source/Particles/WarpXParticleContainer.cpp | 90 +++---- .../Particles/ParticleBoundaryBuffer.cpp | 10 +- .../PinnedMemoryParticleContainer.cpp | 2 +- .../Particles/WarpXParticleContainer.cpp | 18 +- Source/Utils/ParticleUtils.H | 7 +- Source/Utils/ParticleUtils.cpp | 26 +- Source/ablastr/particles/IndexHandling.H | 41 --- Source/ablastr/particles/ParticleMoments.H | 25 +- cmake/dependencies/AMReX.cmake | 2 +- cmake/dependencies/pyAMReX.cmake | 2 +- run_test.sh | 2 +- 63 files changed, 703 insertions(+), 754 deletions(-) delete mode 100644 Source/ablastr/particles/IndexHandling.H diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 79916c455d1..5e9f43f639d 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -115,7 +115,7 @@ jobs: which nvcc || echo "nvcc not in PATH!" 
git clone https://github.com/AMReX-Codes/amrex.git ../amrex - cd ../amrex && git checkout --detach 24.02 && cd - + cd ../amrex && git checkout --detach 296ed40e16ae1877640f5b78e9162dbd4ba1c279 && cd - make COMP=gcc QED=FALSE USE_MPI=TRUE USE_GPU=TRUE USE_OMP=FALSE USE_PSATD=TRUE USE_CCACHE=TRUE -j 2 ccache -s diff --git a/Docs/source/developers/amrex_basics.rst b/Docs/source/developers/amrex_basics.rst index 577a6547bb5..64ad71af06c 100644 --- a/Docs/source/developers/amrex_basics.rst +++ b/Docs/source/developers/amrex_basics.rst @@ -13,7 +13,7 @@ WarpX is built on the Adaptive Mesh Refinement (AMR) library `AMReX & particle_di // get names of real comps std::map real_comps_map = pc->getParticleComps(); - // WarpXParticleContainer compile-time extra AoS attributes (Real): 0 - // WarpXParticleContainer compile-time extra AoS attributes (int): 0 - // WarpXParticleContainer compile-time extra SoA attributes (Real): PIdx::nattribs // not an efficient search, but N is small... for(int j = 0; j < PIdx::nattribs; ++j) diff --git a/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.cpp b/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.cpp index d77437fb931..b083e60529f 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.cpp +++ b/Source/Diagnostics/FlushFormats/FlushFormatCheckpoint.cpp @@ -178,8 +178,8 @@ FlushFormatCheckpoint::CheckpointParticles ( Vector real_names; Vector int_names; + // note: positions skipped here, since we reconstruct a plotfile SoA from them real_names.push_back("weight"); - real_names.push_back("momentum_x"); real_names.push_back("momentum_y"); real_names.push_back("momentum_z"); @@ -189,9 +189,12 @@ FlushFormatCheckpoint::CheckpointParticles ( #endif // get the names of the real comps - real_names.resize(pc->NumRealComps()); + // note: skips the mandatory AMREX_SPACEDIM positions for pure SoA + real_names.resize(pc->NumRealComps() - AMREX_SPACEDIM); auto runtime_rnames = pc->getParticleRuntimeComps(); - for (auto const& x : runtime_rnames) { real_names[x.second+PIdx::nattribs] = x.first; } + for (auto const& x : runtime_rnames) { + real_names[x.second + PIdx::nattribs - AMREX_SPACEDIM] = x.first; + } // and the int comps int_names.resize(pc->NumIntComps()); diff --git a/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.cpp b/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.cpp index 970d9a504d2..880e2df01ff 100644 --- a/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.cpp +++ b/Source/Diagnostics/FlushFormats/FlushFormatPlotfile.cpp @@ -355,8 +355,8 @@ FlushFormatPlotfile::WriteParticles(const std::string& dir, Vector int_flags; Vector real_flags; + // note: positions skipped here, since we reconstruct a plotfile SoA from them real_names.push_back("weight"); - real_names.push_back("momentum_x"); real_names.push_back("momentum_y"); real_names.push_back("momentum_z"); @@ -366,14 +366,21 @@ FlushFormatPlotfile::WriteParticles(const std::string& dir, #endif // get the names of the real comps - real_names.resize(tmp.NumRealComps()); + + // note: skips the mandatory AMREX_SPACEDIM positions for pure SoA + real_names.resize(tmp.NumRealComps() - AMREX_SPACEDIM); auto runtime_rnames = tmp.getParticleRuntimeComps(); - for (auto const& x : runtime_rnames) { real_names[x.second+PIdx::nattribs] = x.first; } + for (auto const& x : runtime_rnames) { + real_names[x.second + PIdx::nattribs - AMREX_SPACEDIM] = x.first; + } // plot any "extra" fields by default real_flags = part_diag.m_plot_flags; real_flags.resize(tmp.NumRealComps(), 1); + // note: skip the 
mandatory AMREX_SPACEDIM positions for pure SoA + real_flags.erase(real_flags.begin(), real_flags.begin() + AMREX_SPACEDIM); + // and the names int_names.resize(tmp.NumIntComps()); auto runtime_inames = tmp.getParticleRuntimeiComps(); diff --git a/Source/Diagnostics/ParticleIO.cpp b/Source/Diagnostics/ParticleIO.cpp index 7ca5e6541d7..a8bb9303fe1 100644 --- a/Source/Diagnostics/ParticleIO.cpp +++ b/Source/Diagnostics/ParticleIO.cpp @@ -160,7 +160,7 @@ MultiParticleContainer::Restart (const std::string& dir) ); } - for (int j = PIdx::nattribs; j < nr; ++j) { + for (int j = PIdx::nattribs-AMREX_SPACEDIM; j < nr; ++j) { const auto& comp_name = real_comp_names[j]; auto current_comp_names = pc->getParticleComps(); auto search = current_comp_names.find(comp_name); diff --git a/Source/Diagnostics/ReducedDiags/FieldProbe.cpp b/Source/Diagnostics/ReducedDiags/FieldProbe.cpp index 9f45392bb0a..24ad0e64ea8 100644 --- a/Source/Diagnostics/ReducedDiags/FieldProbe.cpp +++ b/Source/Diagnostics/ReducedDiags/FieldProbe.cpp @@ -431,8 +431,6 @@ void FieldProbe::ComputeDiags (int step) { const auto getPosition = GetParticlePosition(pti); auto setPosition = SetParticlePosition(pti); - const auto& aos = pti.GetArrayOfStructs(); - const auto* AMREX_RESTRICT m_structs = aos().dataPtr(); auto const np = pti.numParticles(); if (update_particles_moving_window) @@ -482,6 +480,8 @@ void FieldProbe::ComputeDiags (int step) ParticleReal* const AMREX_RESTRICT part_Bz = attribs[FieldProbePIdx::Bz].dataPtr(); ParticleReal* const AMREX_RESTRICT part_S = attribs[FieldProbePIdx::S].dataPtr(); + auto * const AMREX_RESTRICT idcpu = pti.GetStructOfArrays().GetIdCPUData().data(); + const auto &xyzmin = WarpX::LowerCorner(box, lev, 0._rt); const std::array &dx = WarpX::CellSize(lev); @@ -556,7 +556,7 @@ void FieldProbe::ComputeDiags (int step) amrex::ParticleReal xp, yp, zp; getPosition(ip, xp, yp, zp); long idx = ip*noutputs; - dvp[idx++] = m_structs[ip].id(); + dvp[idx++] = amrex::ParticleIDWrapper{idcpu[ip]}; // all particles created on IO cpu dvp[idx++] = xp; dvp[idx++] = yp; dvp[idx++] = zp; diff --git a/Source/Diagnostics/ReducedDiags/FieldProbeParticleContainer.H b/Source/Diagnostics/ReducedDiags/FieldProbeParticleContainer.H index c85bf8fd541..7d59ade5dc6 100644 --- a/Source/Diagnostics/ReducedDiags/FieldProbeParticleContainer.H +++ b/Source/Diagnostics/ReducedDiags/FieldProbeParticleContainer.H @@ -24,7 +24,14 @@ struct FieldProbePIdx { enum { - Ex = 0, Ey, Ez, +#if !defined (WARPX_DIM_1D_Z) + x, +#endif +#if defined (WARPX_DIM_3D) + y, +#endif + z, + Ex, Ey, Ez, Bx, By, Bz, S, //!< the Poynting vector #ifdef WARPX_DIM_RZ @@ -40,9 +47,14 @@ struct FieldProbePIdx * nattribs tells the particle container to allot 7 SOA values. */ class FieldProbeParticleContainer - : public amrex::ParticleContainer<0, 0, FieldProbePIdx::nattribs> + : public amrex::ParticleContainerPureSoA { public: + static constexpr int NStructReal = 0; + static constexpr int NStructInt = 0; + static constexpr int NReal = FieldProbePIdx::nattribs; + static constexpr int NInt = 0; + FieldProbeParticleContainer (amrex::AmrCore* amr_core); ~FieldProbeParticleContainer() override = default; @@ -52,9 +64,9 @@ public: FieldProbeParticleContainer& operator= ( FieldProbeParticleContainer&& ) = default; //! amrex iterator for our number of attributes - using iterator = amrex::ParIter<0, 0, FieldProbePIdx::nattribs, 0>; + using iterator = amrex::ParIterSoA; //! 
amrex iterator for our number of attributes (read-only) - using const_iterator = amrex::ParConstIter<0, 0, FieldProbePIdx::nattribs, 0>; + using const_iterator = amrex::ParConstIterSoA; //! similar to WarpXParticleContainer::AddNParticles but does not include u(x,y,z) void AddNParticles (int lev, amrex::Vector const & x, amrex::Vector const & y, amrex::Vector const & z); diff --git a/Source/Diagnostics/ReducedDiags/FieldProbeParticleContainer.cpp b/Source/Diagnostics/ReducedDiags/FieldProbeParticleContainer.cpp index 1fd741ddc47..7e7aecb9167 100644 --- a/Source/Diagnostics/ReducedDiags/FieldProbeParticleContainer.cpp +++ b/Source/Diagnostics/ReducedDiags/FieldProbeParticleContainer.cpp @@ -59,7 +59,7 @@ using namespace amrex; FieldProbeParticleContainer::FieldProbeParticleContainer (AmrCore* amr_core) - : ParticleContainer<0, 0, FieldProbePIdx::nattribs>(amr_core->GetParGDB()) + : ParticleContainerPureSoA(amr_core->GetParGDB()) { SetParticleSize(); } @@ -89,33 +89,15 @@ FieldProbeParticleContainer::AddNParticles (int lev, * is then coppied to the permament tile which is stored on the particle * (particle_tile). */ + using PinnedTile = typename ContainerLike::ParticleTileType; - using PinnedTile = ParticleTile, - NArrayReal, NArrayInt, - amrex::PinnedArenaAllocator>; PinnedTile pinned_tile; pinned_tile.define(NumRuntimeRealComps(), NumRuntimeIntComps()); for (int i = 0; i < np; i++) { - ParticleType p; - p.id() = ParticleType::NextID(); - p.cpu() = ParallelDescriptor::MyProc(); -#if defined(WARPX_DIM_3D) - p.pos(0) = x[i]; - p.pos(1) = y[i]; - p.pos(2) = z[i]; -#elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) - amrex::ignore_unused(y); - p.pos(0) = x[i]; - p.pos(1) = z[i]; -#elif defined(WARPX_DIM_1D_Z) - amrex::ignore_unused(x, y); - p.pos(0) = z[i]; -#endif - - // write position, cpu id, and particle id to particle - pinned_tile.push_back(p); + auto & idcpu_data = pinned_tile.GetStructOfArrays().GetIdCPUData(); + idcpu_data.push_back(amrex::SetParticleIDandCPU(ParticleType::NextID(), ParallelDescriptor::MyProc())); } // write Real attributes (SoA) to particle initialized zero @@ -125,7 +107,13 @@ FieldProbeParticleContainer::AddNParticles (int lev, #ifdef WARPX_DIM_RZ pinned_tile.push_back_real(FieldProbePIdx::theta, np, 0.0); #endif - +#if !defined (WARPX_DIM_1D_Z) + pinned_tile.push_back_real(FieldProbePIdx::x, x); +#endif +#if defined (WARPX_DIM_3D) + pinned_tile.push_back_real(FieldProbePIdx::y, y); +#endif + pinned_tile.push_back_real(FieldProbePIdx::z, z); pinned_tile.push_back_real(FieldProbePIdx::Ex, np, 0.0); pinned_tile.push_back_real(FieldProbePIdx::Ey, np, 0.0); pinned_tile.push_back_real(FieldProbePIdx::Ez, np, 0.0); diff --git a/Source/Diagnostics/ReducedDiags/LoadBalanceCosts.cpp b/Source/Diagnostics/ReducedDiags/LoadBalanceCosts.cpp index 893b00a5f00..b4e07b51982 100644 --- a/Source/Diagnostics/ReducedDiags/LoadBalanceCosts.cpp +++ b/Source/Diagnostics/ReducedDiags/LoadBalanceCosts.cpp @@ -56,8 +56,7 @@ namespace auto const & plev = pc.GetParticles(lev); auto const & ptile = plev.at(box_index); - auto const & aos = ptile.GetArrayOfStructs(); - auto const np = aos.numParticles(); + auto const np = ptile.numParticles(); num_macro_particles += np; } diff --git a/Source/Diagnostics/WarpXOpenPMD.H b/Source/Diagnostics/WarpXOpenPMD.H index 4597dacd9ae..6c904790e15 100644 --- a/Source/Diagnostics/WarpXOpenPMD.H +++ b/Source/Diagnostics/WarpXOpenPMD.H @@ -41,7 +41,7 @@ class WarpXParticleCounter { public: using ParticleContainer = typename 
WarpXParticleContainer::ContainerLike; - using ParticleIter = typename amrex::ParIter<0, 0, PIdx::nattribs, 0, amrex::PinnedArenaAllocator>; + using ParticleIter = typename amrex::ParIterSoA; WarpXParticleCounter (ParticleContainer* pc); [[nodiscard]] unsigned long GetTotalNumParticles () const {return m_Total;} @@ -77,7 +77,7 @@ class WarpXOpenPMDPlot { public: using ParticleContainer = typename WarpXParticleContainer::ContainerLike; - using ParticleIter = typename amrex::ParConstIter<0, 0, PIdx::nattribs, 0, amrex::PinnedArenaAllocator>; + using ParticleIter = typename amrex::ParConstIterSoA; /** Initialize openPMD I/O routines * diff --git a/Source/Diagnostics/WarpXOpenPMD.cpp b/Source/Diagnostics/WarpXOpenPMD.cpp index 7cc9f571a4a..39717ef6ec5 100644 --- a/Source/Diagnostics/WarpXOpenPMD.cpp +++ b/Source/Diagnostics/WarpXOpenPMD.cpp @@ -18,11 +18,9 @@ #include "WarpX.H" #include "OpenPMDHelpFunction.H" -#include #include #include -#include #include #include #include @@ -550,6 +548,13 @@ for (unsigned i = 0, n = particle_diags.size(); i < n; ++i) { // see openPMD ED-PIC extension for namings // note: an underscore separates the record name from its component // for non-scalar records +#if !defined (WARPX_DIM_1D_Z) + real_names.push_back("position_x"); +#endif +#if defined (WARPX_DIM_3D) + real_names.push_back("position_y"); +#endif + real_names.push_back("position_z"); real_names.push_back("weighting"); real_names.push_back("momentum_x"); real_names.push_back("momentum_y"); @@ -722,77 +727,7 @@ WarpXOpenPMDPlot::DumpToFile (ParticleContainer* pc, contributed_particles = true; - // get position and particle ID from aos - // note: this implementation iterates the AoS 4x... - // if we flush late as we do now, we can also copy out the data in one go - const auto &aos = pti.GetArrayOfStructs(); // size = numParticlesOnTile - { - // Save positions -#if defined(WARPX_DIM_RZ) - { - const std::shared_ptr z( - new amrex::ParticleReal[numParticleOnTile], - [](amrex::ParticleReal const *p) { delete[] p; } - ); - for (auto i = 0; i < numParticleOnTile; i++) { - z.get()[i] = aos[i].pos(1); // {0: "r", 1: "z"} - } - std::string const positionComponent = "z"; - currSpecies["position"]["z"].storeChunk(z, {offset}, {numParticleOnTile64}); - } - - // reconstruct x and y from polar coordinates r, theta - auto const& soa = pti.GetStructOfArrays(); - amrex::ParticleReal const* theta = soa.GetRealData(PIdx::theta).dataPtr(); - WARPX_ALWAYS_ASSERT_WITH_MESSAGE(theta != nullptr, "openPMD: invalid theta pointer."); - WARPX_ALWAYS_ASSERT_WITH_MESSAGE(int(soa.GetRealData(PIdx::theta).size()) == numParticleOnTile, - "openPMD: theta and tile size do not match"); - { - const std::shared_ptr< amrex::ParticleReal > x( - new amrex::ParticleReal[numParticleOnTile], - [](amrex::ParticleReal const *p){ delete[] p; } - ); - const std::shared_ptr< amrex::ParticleReal > y( - new amrex::ParticleReal[numParticleOnTile], - [](amrex::ParticleReal const *p){ delete[] p; } - ); - for (auto i=0; i curr( - new amrex::ParticleReal[numParticleOnTile], - [](amrex::ParticleReal const *p) { delete[] p; } - ); - for (auto i = 0; i < numParticleOnTile; i++) { - curr.get()[i] = aos[i].pos(currDim); - } - std::string const positionComponent = positionComponents[currDim]; - currSpecies["position"][positionComponent].storeChunk(curr, {offset}, - {numParticleOnTile64}); - } -#endif - - // save particle ID after converting it to a globally unique ID - const std::shared_ptr ids( - new uint64_t[numParticleOnTile], - [](uint64_t const *p) { delete[] 
p; } - ); - for (auto i = 0; i < numParticleOnTile; i++) { - ids.get()[i] = ablastr::particles::localIDtoGlobal(static_cast(aos[i].id()), static_cast(aos[i].cpu())); - } - const auto *const scalar = openPMD::RecordComponent::SCALAR; - currSpecies["id"][scalar].storeChunk(ids, {offset}, {numParticleOnTile64}); - - } - // save "extra" particle properties in AoS and SoA + // save particle properties SaveRealProperty(pti, currSpecies, offset, @@ -893,10 +828,9 @@ WarpXOpenPMDPlot::SetupRealProperties (ParticleContainer const * pc, std::set< std::string > addedRecords; // add meta-data per record only once for (auto idx=0; idxNumRealComps(); idx++) { - auto ii = ParticleContainer::NStructReal + idx; // jump over extra AoS names - if (write_real_comp[ii]) { + if (write_real_comp[idx]) { // handle scalar and non-scalar records by name - const auto [record_name, component_name] = detail::name2openPMD(real_comp_names[ii]); + const auto [record_name, component_name] = detail::name2openPMD(real_comp_names[idx]); auto currRecord = currSpecies[record_name]; // meta data for ED-PIC extension @@ -917,10 +851,9 @@ WarpXOpenPMDPlot::SetupRealProperties (ParticleContainer const * pc, } } for (auto idx=0; idx( numParticleOnTile ); - auto const& aos = pti.GetArrayOfStructs(); // size = numParticlesOnTile + auto const numParticleOnTile64 = static_cast(numParticleOnTile); auto const& soa = pti.GetStructOfArrays(); - // first we concatenate the AoS into contiguous arrays - { - // note: WarpX does not yet use extra AoS Real attributes - for( auto idx=0; idx d( - new amrex::ParticleReal[numParticleOnTile], - [](amrex::ParticleReal const *p){ delete[] p; } - ); - - for( auto kk=0; kk #include #include #include + namespace ParticleBoundaryProcess { struct NoOp { @@ -25,12 +27,11 @@ struct NoOp { struct Absorb { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - void operator() (const PData& ptd, int i, + void operator() (PData& ptd, int i, const amrex::RealVect& /*pos*/, const amrex::RealVect& /*normal*/, amrex::RandomEngine const& /*engine*/) const noexcept { - auto& p = ptd.m_aos[i]; - p.id() = -p.id(); + amrex::ParticleIDWrapper{ptd.m_idcpu[i]}.make_invalid(); } }; } diff --git a/Source/EmbeddedBoundary/ParticleScraper.H b/Source/EmbeddedBoundary/ParticleScraper.H index d6196c35f44..a175fe23133 100644 --- a/Source/EmbeddedBoundary/ParticleScraper.H +++ b/Source/EmbeddedBoundary/ParticleScraper.H @@ -38,7 +38,7 @@ * passed in to this function as an argument. This function can access the * position at which the particle hit the boundary, and also the associated * normal vector. Particles can be `absorbed` by setting their ids to negative - * to flag them for removal. Likewise, the can be reflected back into the domain + * to flag them for removal. Likewise, they can be reflected back into the domain * by modifying their data appropriately and leaving their ids alone. * * This version operates only at the specified level. @@ -82,7 +82,7 @@ scrapeParticles (PC& pc, const amrex::Vector& distance_t * passed in to this function as an argument. This function can access the * position at which the particle hit the boundary, and also the associated * normal vector. Particles can be `absorbed` by setting their ids to negative - * to flag them for removal. Likewise, the can be reflected back into the domain + * to flag them for removal. Likewise, they can be reflected back into the domain * by modifying their data appropriately and leaving their ids alone. * * This version operates over all the levels in the pc. 
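As a concrete illustration of the second option, here is a minimal sketch of a reflecting callback (assuming 3D and WarpX's `PIdx` momentum components; the `Absorb` functor above is the reference for the call signature): it flips the momentum along the boundary normal and leaves the id valid, so the particle is kept.

.. code-block:: cpp

   struct ReflectAtBoundary {
       template <typename PData>
       AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
       void operator() (PData& ptd, int i,
                        const amrex::RealVect& /*pos*/,
                        const amrex::RealVect& n,
                        amrex::RandomEngine const& /*engine*/) const noexcept
       {
           amrex::ParticleReal& ux = ptd.m_rdata[PIdx::ux][i];
           amrex::ParticleReal& uy = ptd.m_rdata[PIdx::uy][i];
           amrex::ParticleReal& uz = ptd.m_rdata[PIdx::uz][i];
           // specular reflection: u -> u - 2 (u . n) n
           const amrex::ParticleReal un = ux*n[0] + uy*n[1] + uz*n[2];
           ux -= 2._prt*un*n[0];
           uy -= 2._prt*un*n[1];
           uz -= 2._prt*un*n[2];
           // id left untouched: the particle stays in the simulation
       }
   };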
@@ -170,13 +170,12 @@ scrapeParticles (PC& pc, const amrex::Vector& distance_t auto& tile = pti.GetParticleTile(); auto ptd = tile.getParticleTileData(); const auto np = tile.numParticles(); - amrex::Particle<0,0> * const particles = tile.GetArrayOfStructs()().data(); auto phi = (*distance_to_eb[lev])[pti].array(); // signed distance function amrex::ParallelForRNG( np, [=] AMREX_GPU_DEVICE (const int ip, amrex::RandomEngine const& engine) noexcept { // skip particles that are already flagged for removal - if (particles[ip].id() < 0) return; + if (!amrex::ParticleIDWrapper{ptd.m_idcpu[ip]}.is_valid()) return; amrex::ParticleReal xp, yp, zp; getPosition(ip, xp, yp, zp); diff --git a/Source/Particles/Collision/BinaryCollision/BinaryCollision.H b/Source/Particles/Collision/BinaryCollision/BinaryCollision.H index 5c90dab25e6..c69f07acdb2 100644 --- a/Source/Particles/Collision/BinaryCollision/BinaryCollision.H +++ b/Source/Particles/Collision/BinaryCollision/BinaryCollision.H @@ -72,7 +72,8 @@ class BinaryCollision final // Define shortcuts for frequently-used type names using ParticleType = WarpXParticleContainer::ParticleType; using ParticleTileType = WarpXParticleContainer::ParticleTileType; - using ParticleBins = amrex::DenseBins; + using ParticleTileDataType = ParticleTileType::ParticleTileDataType; + using ParticleBins = amrex::DenseBins; using SoaData_type = WarpXParticleContainer::ParticleTileType::ParticleTileDataType; using index_type = ParticleBins::index_type; @@ -261,9 +262,6 @@ public: const amrex::ParticleReal q1 = species_1.getCharge(); const amrex::ParticleReal m1 = species_1.getMass(); auto get_position_1 = GetParticlePosition(ptile_1, getpos_offset); - // Needed to access the particle id - ParticleType * AMREX_RESTRICT - particle_ptr_1 = ptile_1.GetArrayOfStructs()().data(); amrex::Geometry const& geom = WarpX::GetInstance().Geom(lev); #if defined WARPX_DIM_1D_Z @@ -371,7 +369,7 @@ public: soa_1, soa_1, product_species_vector, tile_products_data, - particle_ptr_1, particle_ptr_1, m1, m1, + m1, m1, products_mass, p_mask, products_np, copy_species1, copy_species2, p_pair_indices_1, p_pair_indices_2, @@ -403,9 +401,6 @@ public: const amrex::ParticleReal q1 = species_1.getCharge(); const amrex::ParticleReal m1 = species_1.getMass(); auto get_position_1 = GetParticlePosition(ptile_1, getpos_offset); - // Needed to access the particle id - ParticleType * AMREX_RESTRICT - particle_ptr_1 = ptile_1.GetArrayOfStructs()().data(); // - Species 2 const auto soa_2 = ptile_2.getParticleTileData(); index_type* AMREX_RESTRICT indices_2 = bins_2.permutationPtr(); @@ -413,9 +408,6 @@ public: const amrex::ParticleReal q2 = species_2.getCharge(); const amrex::ParticleReal m2 = species_2.getMass(); auto get_position_2 = GetParticlePosition(ptile_2, getpos_offset); - // Needed to access the particle id - ParticleType * AMREX_RESTRICT - particle_ptr_2 = ptile_2.GetArrayOfStructs()().data(); amrex::Geometry const& geom = WarpX::GetInstance().Geom(lev); #if defined WARPX_DIM_1D_Z @@ -535,7 +527,7 @@ public: soa_1, soa_2, product_species_vector, tile_products_data, - particle_ptr_1, particle_ptr_2, m1, m2, + m1, m2, products_mass, p_mask, products_np, copy_species1, copy_species2, p_pair_indices_1, p_pair_indices_2, diff --git a/Source/Particles/Collision/BinaryCollision/Coulomb/PairWiseCoulombCollisionFunc.H b/Source/Particles/Collision/BinaryCollision/Coulomb/PairWiseCoulombCollisionFunc.H index feb7acf81d3..cfdc36d3c50 100644 --- 
a/Source/Particles/Collision/BinaryCollision/Coulomb/PairWiseCoulombCollisionFunc.H +++ b/Source/Particles/Collision/BinaryCollision/Coulomb/PairWiseCoulombCollisionFunc.H @@ -23,10 +23,13 @@ * \brief This functor performs pairwise Coulomb collision on a single cell by calling the function * ElasticCollisionPerez. It also reads and contains the Coulomb logarithm. */ -class PairWiseCoulombCollisionFunc{ +class PairWiseCoulombCollisionFunc +{ // Define shortcuts for frequently-used type names using ParticleType = WarpXParticleContainer::ParticleType; - using ParticleBins = amrex::DenseBins; + using ParticleTileType = WarpXParticleContainer::ParticleTileType; + using ParticleTileDataType = ParticleTileType::ParticleTileDataType; + using ParticleBins = amrex::DenseBins; using index_type = ParticleBins::index_type; using SoaData_type = WarpXParticleContainer::ParticleTileType::ParticleTileDataType; diff --git a/Source/Particles/Collision/BinaryCollision/DSMC/DSMC.H b/Source/Particles/Collision/BinaryCollision/DSMC/DSMC.H index c1be307b811..ab01eba2c81 100644 --- a/Source/Particles/Collision/BinaryCollision/DSMC/DSMC.H +++ b/Source/Particles/Collision/BinaryCollision/DSMC/DSMC.H @@ -38,7 +38,8 @@ class DSMC final // Define shortcuts for frequently-used type names using ParticleType = WarpXParticleContainer::ParticleType; using ParticleTileType = WarpXParticleContainer::ParticleTileType; - using ParticleBins = amrex::DenseBins; + using ParticleTileDataType = ParticleTileType::ParticleTileDataType; + using ParticleBins = amrex::DenseBins; using SoaData_type = WarpXParticleContainer::ParticleTileType::ParticleTileDataType; using index_type = ParticleBins::index_type; diff --git a/Source/Particles/Collision/BinaryCollision/DSMC/SplitAndScatterFunc.H b/Source/Particles/Collision/BinaryCollision/DSMC/SplitAndScatterFunc.H index c1fb7ee7e38..f684b60da78 100644 --- a/Source/Particles/Collision/BinaryCollision/DSMC/SplitAndScatterFunc.H +++ b/Source/Particles/Collision/BinaryCollision/DSMC/SplitAndScatterFunc.H @@ -10,6 +10,9 @@ #define SPLIT_AND_SCATTER_FUNC_H_ #include "Particles/Collision/ScatteringProcess.H" +#include "Particles/NamedComponentParticleContainer.H" + +#include /** * \brief Function that performs the particle scattering and injection due @@ -55,8 +58,6 @@ int splitScatteringParticles ( const auto ptile1_data = ptile1.getParticleTileData(); const auto ptile2_data = ptile2.getParticleTileData(); - const Long minus_one_long = -1; - ParallelForRNG(n_total_pairs, [=] AMREX_GPU_DEVICE (int i, RandomEngine const& engine) noexcept { @@ -70,20 +71,35 @@ int splitScatteringParticles ( // starting with the parent particles auto& w1 = ptile1_data.m_rdata[PIdx::w][p_pair_indices_1[i]]; auto& w2 = ptile2_data.m_rdata[PIdx::w][p_pair_indices_2[i]]; + uint64_t* AMREX_RESTRICT idcpu1 = ptile1_data.m_idcpu; + uint64_t* AMREX_RESTRICT idcpu2 = ptile2_data.m_idcpu; + + // Note: Particle::atomicSetID should also be provided as a standalone helper function in AMReX + // to replace the following lambda. + auto const atomicSetIdMinus = [] AMREX_GPU_DEVICE (uint64_t & idcpu) + { +#if defined(AMREX_USE_OMP) +#pragma omp atomic write + idcpu = amrex::ParticleIdCpus::Invalid; +#else + amrex::Gpu::Atomic::Exch( + (unsigned long long *)&idcpu, + (unsigned long long)amrex::ParticleIdCpus::Invalid + ); +#endif + }; // Remove p_pair_reaction_weight[i] from the colliding particles' weights. // If the colliding particle weight decreases to zero, remove particle by // setting its id to -1. 
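+ // Note: ParticleIDWrapper does not yet offer a device-side atomic setter,
+ // so the invalidation below writes the whole 64-bit idcpu word at once
+ // (an OpenMP atomic write on host builds, Gpu::Atomic::Exch on device).
+ // A serial, non-atomic sketch of the same operation, with the names used
+ // in the code below:
+ //
+ //     if (w1 <= 0._prt) { idcpu1[p_pair_indices_1[i]] = amrex::ParticleIdCpus::Invalid; }
+ //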
Gpu::Atomic::AddNoRet(&w1, -p_pair_reaction_weight[i]); if (w1 <= 0._prt) { - auto& p = ptile1_data.m_aos[p_pair_indices_1[i]]; - p.atomicSetID(minus_one_long); + atomicSetIdMinus(idcpu1[p_pair_indices_1[i]]); } Gpu::Atomic::AddNoRet(&w2, -p_pair_reaction_weight[i]); if (w2 <= 0._prt) { - auto& p = ptile2_data.m_aos[p_pair_indices_2[i]]; - p.atomicSetID(minus_one_long); + atomicSetIdMinus(idcpu2[p_pair_indices_2[i]]); } // Set the child particle properties appropriately diff --git a/Source/Particles/Collision/BinaryCollision/NuclearFusion/NuclearFusionFunc.H b/Source/Particles/Collision/BinaryCollision/NuclearFusion/NuclearFusionFunc.H index 397536b67bf..b2a2112ca68 100644 --- a/Source/Particles/Collision/BinaryCollision/NuclearFusion/NuclearFusionFunc.H +++ b/Source/Particles/Collision/BinaryCollision/NuclearFusion/NuclearFusionFunc.H @@ -33,10 +33,13 @@ * creation functor. * This functor also reads and contains the fusion multiplier. */ -class NuclearFusionFunc{ +class NuclearFusionFunc +{ // Define shortcuts for frequently-used type names using ParticleType = WarpXParticleContainer::ParticleType; - using ParticleBins = amrex::DenseBins; + using ParticleTileType = WarpXParticleContainer::ParticleTileType; + using ParticleTileDataType = ParticleTileType::ParticleTileDataType; + using ParticleBins = amrex::DenseBins; using index_type = ParticleBins::index_type; using SoaData_type = WarpXParticleContainer::ParticleTileType::ParticleTileDataType; @@ -154,12 +157,13 @@ public: // other species and we need to decrease their weight accordingly. // c1 corresponds to the minimum number of times a particle of species 1 will be paired // with a particle of species 2. Same for c2. - const index_type c1 = amrex::max(NI2/NI1,1u); - const index_type c2 = amrex::max(NI1/NI2,1u); + // index_type(1): https://github.com/AMReX-Codes/amrex/pull/3684 + const index_type c1 = amrex::max(NI2/NI1, index_type(1)); + const index_type c2 = amrex::max(NI1/NI2, index_type(1)); // multiplier ratio to take into account unsampled pairs const auto multiplier_ratio = static_cast( - (m_isSameSpecies)?(2u*max_N - 1):(max_N)); + m_isSameSpecies ? 
2*max_N - 1 : max_N); #if (defined WARPX_DIM_RZ) amrex::ParticleReal * const AMREX_RESTRICT theta1 = soa_1.m_rdata[PIdx::theta]; diff --git a/Source/Particles/Collision/BinaryCollision/NuclearFusion/ProtonBoronFusionInitializeMomentum.H b/Source/Particles/Collision/BinaryCollision/NuclearFusion/ProtonBoronFusionInitializeMomentum.H index 7b29267ec32..0b51d6b4b61 100644 --- a/Source/Particles/Collision/BinaryCollision/NuclearFusion/ProtonBoronFusionInitializeMomentum.H +++ b/Source/Particles/Collision/BinaryCollision/NuclearFusion/ProtonBoronFusionInitializeMomentum.H @@ -22,10 +22,12 @@ namespace { // Define shortcuts for frequently-used type names - using SoaData_type = WarpXParticleContainer::ParticleTileType::ParticleTileDataType; - using ParticleType = WarpXParticleContainer::ParticleType; - using ParticleBins = amrex::DenseBins; - using index_type = ParticleBins::index_type; + using SoaData_type = typename WarpXParticleContainer::ParticleTileType::ParticleTileDataType; + using ParticleType = typename WarpXParticleContainer::ParticleType; + using ParticleTileType = typename WarpXParticleContainer::ParticleTileType; + using ParticleTileDataType = typename ParticleTileType::ParticleTileDataType; + using ParticleBins = amrex::DenseBins; + using index_type = typename ParticleBins::index_type; /** * \brief This function initializes the momentum of the alpha particles produced from diff --git a/Source/Particles/Collision/BinaryCollision/NuclearFusion/TwoProductFusionInitializeMomentum.H b/Source/Particles/Collision/BinaryCollision/NuclearFusion/TwoProductFusionInitializeMomentum.H index be3f5b2d957..52e9db8aa94 100644 --- a/Source/Particles/Collision/BinaryCollision/NuclearFusion/TwoProductFusionInitializeMomentum.H +++ b/Source/Particles/Collision/BinaryCollision/NuclearFusion/TwoProductFusionInitializeMomentum.H @@ -24,7 +24,9 @@ namespace { // Define shortcuts for frequently-used type names using SoaData_type = WarpXParticleContainer::ParticleTileType::ParticleTileDataType; using ParticleType = WarpXParticleContainer::ParticleType; - using ParticleBins = amrex::DenseBins; + using ParticleTileType = WarpXParticleContainer::ParticleTileType; + using ParticleTileDataType = ParticleTileType::ParticleTileDataType; + using ParticleBins = amrex::DenseBins; using index_type = ParticleBins::index_type; /** diff --git a/Source/Particles/Collision/BinaryCollision/ParticleCreationFunc.H b/Source/Particles/Collision/BinaryCollision/ParticleCreationFunc.H index dc830b477df..7a2853e3db5 100644 --- a/Source/Particles/Collision/BinaryCollision/ParticleCreationFunc.H +++ b/Source/Particles/Collision/BinaryCollision/ParticleCreationFunc.H @@ -30,13 +30,15 @@ * \brief This functor creates particles produced from a binary collision and sets their initial * properties (position, momentum, weight). 
*/ -class ParticleCreationFunc{ +class ParticleCreationFunc +{ // Define shortcuts for frequently-used type names - using ParticleType = WarpXParticleContainer::ParticleType; - using ParticleTileType = WarpXParticleContainer::ParticleTileType; - using ParticleBins = amrex::DenseBins; - using index_type = ParticleBins::index_type; - using SoaData_type = WarpXParticleContainer::ParticleTileType::ParticleTileDataType; + using ParticleType = typename WarpXParticleContainer::ParticleType; + using ParticleTileType = typename WarpXParticleContainer::ParticleTileType; + using ParticleTileDataType = typename ParticleTileType::ParticleTileDataType; + using ParticleBins = amrex::DenseBins; + using index_type = typename ParticleBins::index_type; + using SoaData_type = typename WarpXParticleContainer::ParticleTileType::ParticleTileDataType; public: /** @@ -69,12 +71,6 @@ public: * @param[in, out] soa_1 struct of array data of the first colliding particle species * @param[in, out] soa_2 struct of array data of the second colliding particle species * @param[out] tile_products array containing tile data of the product particles. - * @param[out] particle_ptr_1 pointer to data of the first colliding particle species. Is - * needed to set the id of a particle to -1 in order to delete it when its weight - * reaches 0. - * @param[out] particle_ptr_2 pointer to data of the second colliding particle species. Is - * needed to set the id of a particle to -1 in order to delete it when its weight - * reaches 0. * @param[in] m1 mass of the first colliding particle species * @param[in] m2 mass of the second colliding particle species * @param[in] products_mass array storing the mass of product particles @@ -102,7 +98,6 @@ public: const SoaData_type& soa_1, const SoaData_type& soa_2, const amrex::Vector& pc_products, ParticleTileType** AMREX_RESTRICT tile_products, - ParticleType* particle_ptr_1, ParticleType* particle_ptr_2, const amrex::ParticleReal& m1, const amrex::ParticleReal& m2, const amrex::Vector& products_mass, const index_type* AMREX_RESTRICT p_mask, @@ -137,6 +132,8 @@ public: amrex::ParticleReal* AMREX_RESTRICT w1 = soa_1.m_rdata[PIdx::w]; amrex::ParticleReal* AMREX_RESTRICT w2 = soa_2.m_rdata[PIdx::w]; + uint64_t* AMREX_RESTRICT idcpu1 = soa_1.m_idcpu; + uint64_t* AMREX_RESTRICT idcpu2 = soa_2.m_idcpu; // Create necessary GPU vectors, that will be used in the kernel below amrex::Vector soa_products; @@ -205,16 +202,31 @@ public: amrex::Gpu::Atomic::AddNoRet(&w2[p_pair_indices_2[i]], -p_pair_reaction_weight[i]); + // Note: Particle::atomicSetID should also be provided as a standalone helper function in AMReX + // to replace the following lambda. 
+ auto const atomicSetIdMinus = [] AMREX_GPU_DEVICE (uint64_t & idcpu) + { +#if defined(AMREX_USE_OMP) +#pragma omp atomic write + idcpu = amrex::ParticleIdCpus::Invalid; +#else + amrex::Gpu::Atomic::Exch( + (unsigned long long *)&idcpu, + (unsigned long long)amrex::ParticleIdCpus::Invalid + ); +#endif + }; + // If the colliding particle weight decreases to zero, remove particle by // setting its id to -1 - constexpr amrex::Long minus_one_long = -1; if (w1[p_pair_indices_1[i]] <= amrex::ParticleReal(0.)) { - particle_ptr_1[p_pair_indices_1[i]].atomicSetID(minus_one_long); + atomicSetIdMinus(idcpu1[p_pair_indices_1[i]]); + } if (w2[p_pair_indices_2[i]] <= amrex::ParticleReal(0.)) { - particle_ptr_2[p_pair_indices_2[i]].atomicSetID(minus_one_long); + atomicSetIdMinus(idcpu2[p_pair_indices_2[i]]); } // Initialize the product particles' momentum, using a function depending on the @@ -294,12 +306,14 @@ private: * \brief This class does nothing and is used as second template parameter for binary collisions * that do not create particles. */ -class NoParticleCreationFunc{ - using ParticleType = WarpXParticleContainer::ParticleType; - using ParticleTileType = WarpXParticleContainer::ParticleTileType; - using ParticleBins = amrex::DenseBins; - using index_type = ParticleBins::index_type; - using SoaData_type = WarpXParticleContainer::ParticleTileType::ParticleTileDataType; +class NoParticleCreationFunc +{ + using ParticleType = typename WarpXParticleContainer::ParticleType; + using ParticleTileType = typename WarpXParticleContainer::ParticleTileType; + using ParticleTileDataType = typename ParticleTileType::ParticleTileDataType; + using ParticleBins = amrex::DenseBins; + using index_type = typename ParticleBins::index_type; + using SoaData_type = typename WarpXParticleContainer::ParticleTileType::ParticleTileDataType; public: NoParticleCreationFunc () = default; @@ -313,7 +327,6 @@ public: const SoaData_type& /*soa_1*/, const SoaData_type& /*soa_2*/, amrex::Vector& /*pc_products*/, ParticleTileType** /*tile_products*/, - ParticleType* /*particle_ptr_1*/, ParticleType* /*particle_ptr_2*/, const amrex::ParticleReal& /*m1*/, const amrex::ParticleReal& /*m2*/, const amrex::Vector& /*products_mass*/, const index_type* /*p_mask*/, const amrex::Vector& /*products_np*/, diff --git a/Source/Particles/Collision/BinaryCollision/ShuffleFisherYates.H b/Source/Particles/Collision/BinaryCollision/ShuffleFisherYates.H index 42259512b0d..3b8f72f4b84 100644 --- a/Source/Particles/Collision/BinaryCollision/ShuffleFisherYates.H +++ b/Source/Particles/Collision/BinaryCollision/ShuffleFisherYates.H @@ -12,7 +12,7 @@ /* \brief Shuffle array according to Fisher-Yates algorithm. * Only shuffle the part between is <= i < ie, n = ie-is. 
* T_index shall be
- * amrex::DenseBins<WarpXParticleContainer::ParticleType>::index_type
+ * amrex::DenseBins<WarpXParticleContainer::ParticleTileType::ParticleTileDataType>::index_type
 */
template <typename T_index>
diff --git a/Source/Particles/Deposition/ChargeDeposition.H b/Source/Particles/Deposition/ChargeDeposition.H
index d0db678dfda..d0822789015 100644
--- a/Source/Particles/Deposition/ChargeDeposition.H
+++ b/Source/Particles/Deposition/ChargeDeposition.H
@@ -252,7 +252,7 @@ void doChargeDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
                                      const int n_rz_azimuthal_modes,
                                      amrex::Real* cost,
                                      const long load_balance_costs_update_algo,
-                                     const amrex::DenseBins<WarpXParticleContainer::ParticleType>& a_bins,
+                                     const amrex::DenseBins<WarpXParticleContainer::ParticleTileType::ParticleTileDataType>& a_bins,
                                      const amrex::Box& box,
                                      const amrex::Geometry& geom,
                                      const amrex::IntVect& a_tbox_max_size,
diff --git a/Source/Particles/Deposition/CurrentDeposition.H b/Source/Particles/Deposition/CurrentDeposition.H
index 18df09c3b43..2252a63fd07 100644
--- a/Source/Particles/Deposition/CurrentDeposition.H
+++ b/Source/Particles/Deposition/CurrentDeposition.H
@@ -592,7 +592,7 @@ void doDepositionSharedShapeN (const GetParticlePosition<PIdx>& GetPosition,
                                int n_rz_azimuthal_modes,
                                amrex::Real* cost,
                                long load_balance_costs_update_algo,
-                               const amrex::DenseBins<WarpXParticleContainer::ParticleType>& a_bins,
+                               const amrex::DenseBins<WarpXParticleContainer::ParticleTileType::ParticleTileDataType>& a_bins,
                                const amrex::Box& box,
                                const amrex::Geometry& geom,
                                const amrex::IntVect& a_tbox_max_size)
diff --git a/Source/Particles/ElementaryProcess/QEDPairGeneration.H b/Source/Particles/ElementaryProcess/QEDPairGeneration.H
index 5abc9282d4f..fb723f0b79a 100644
--- a/Source/Particles/ElementaryProcess/QEDPairGeneration.H
+++ b/Source/Particles/ElementaryProcess/QEDPairGeneration.H
@@ -167,7 +167,7 @@ public:
             p_ux, p_uy, p_uz,
             engine);

-        src.m_aos[i_src].id() = -1; //destroy photon after pair generation
+        src.m_idcpu[i_src] = amrex::ParticleIdCpus::Invalid; // destroy photon after pair generation
     }

 private:
diff --git a/Source/Particles/ElementaryProcess/QEDPhotonEmission.H b/Source/Particles/ElementaryProcess/QEDPhotonEmission.H
index 8ba5c63ad57..567b260d0e4 100644
--- a/Source/Particles/ElementaryProcess/QEDPhotonEmission.H
+++ b/Source/Particles/ElementaryProcess/QEDPhotonEmission.H
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -237,12 +238,11 @@ void cleanLowEnergyPhotons(
     const int old_size, const int num_added,
     const amrex::ParticleReal energy_threshold)
 {
-    auto pp = ptile.GetArrayOfStructs()().data() + old_size;
-
-    const auto& soa = ptile.GetStructOfArrays();
+    auto& soa = ptile.GetStructOfArrays();
+    auto p_idcpu = soa.GetIdCPUData().data() + old_size;
     const auto p_ux = soa.GetRealData(PIdx::ux).data() + old_size;
-    const auto p_uy = soa.GetRealData(PIdx::uy).data() + old_size;
-    const auto p_uz = soa.GetRealData(PIdx::uz).data() + old_size;
+    const auto p_uy = soa.GetRealData(PIdx::uy).data() + old_size;
+    const auto p_uz = soa.GetRealData(PIdx::uz).data() + old_size;

     //The square of the energy threshold
     const auto energy_threshold2 = std::max(
@@ -251,8 +251,6 @@ void cleanLowEnergyPhotons(
     amrex::ParallelFor(num_added, [=] AMREX_GPU_DEVICE (int ip) noexcept
     {
-        auto& p = pp[ip];
-
         const auto ux = p_ux[ip];
         const auto uy = p_uy[ip];
         const auto uz = p_uz[ip];
@@ -262,8 +260,8 @@ void cleanLowEnergyPhotons(
         constexpr amrex::ParticleReal me_c = PhysConst::m_e*PhysConst::c;

         const auto phot_energy2 = (ux*ux + uy*uy + uz*uz)*me_c*me_c;

-        if (phot_energy2 < energy_threshold2){
-            p.id() = - 1;
+        if (phot_energy2 < energy_threshold2) {
+            p_idcpu[ip] = amrex::ParticleIdCpus::Invalid;
         }
     });
 }
diff --git a/Source/Particles/LaserParticleContainer.H b/Source/Particles/LaserParticleContainer.H
index e6fa308431c..fac94ff20a3 100644 --- a/Source/Particles/LaserParticleContainer.H +++ b/Source/Particles/LaserParticleContainer.H @@ -56,10 +56,9 @@ public: * \brief Method to initialize runtime attributes. Does nothing for LaserParticleContainer. */ void DefaultInitializeRuntimeAttributes ( - amrex::ParticleTile, - NArrayReal, NArrayInt, amrex::PinnedArenaAllocator>& /*pinned_tile*/, - const int /*n_external_attr_real*/, - const int /*n_external_attr_int*/) final {} + typename ContainerLike::ParticleTileType& /*pinned_tile*/, + int /*n_external_attr_real*/, + int /*n_external_attr_int*/) final {} void ReadHeader (std::istream& is) final; diff --git a/Source/Particles/NamedComponentParticleContainer.H b/Source/Particles/NamedComponentParticleContainer.H index 3be0886425d..e7a7a20fad5 100644 --- a/Source/Particles/NamedComponentParticleContainer.H +++ b/Source/Particles/NamedComponentParticleContainer.H @@ -18,24 +18,39 @@ #include -/** Particle Attributes stored in amrex::ParticleContainer's struct of array +/** Real Particle Attributes stored in amrex::ParticleContainer's struct of array */ struct PIdx { enum { - w = 0, ///< weight +#if !defined (WARPX_DIM_1D_Z) + x, +#endif +#if defined (WARPX_DIM_3D) + y, +#endif + z, + w, ///< weight ux, uy, uz, #ifdef WARPX_DIM_RZ theta, ///< RZ needs all three position components #endif - nattribs ///< number of attributes + nattribs ///< number of compile-time attributes + }; +}; + +/** Integer Particle Attributes stored in amrex::ParticleContainer's struct of array + */ +struct PIdxInt +{ + enum { + nattribs ///< number of compile-time attributes }; }; /** Particle Container class that allows to add/access particle components * with a name (string) instead of doing so with an integer index. - * (The "components" are all the particle quantities - except those - * that are stored in an AoS by amrex, i.e. the particle positions and ID) + * (The "components" are all the particle amrex::Real quantities.) * * This is done by storing maps that give the index of the component * that corresponds to a given string. @@ -45,11 +60,11 @@ struct PIdx */ template class T_Allocator=amrex::DefaultAllocator> class NamedComponentParticleContainer : -public amrex::ParticleContainer<0,0,PIdx::nattribs,0,T_Allocator> +public amrex::ParticleContainerPureSoA { public: /** Construct an empty NamedComponentParticleContainer **/ - NamedComponentParticleContainer () : amrex::ParticleContainer<0,0,PIdx::nattribs,0,T_Allocator>() {} + NamedComponentParticleContainer () : amrex::ParticleContainerPureSoA() {} /** Construct a NamedComponentParticleContainer from an AmrParGDB object * @@ -61,8 +76,15 @@ public: * AMR hierarchy. Usually, this is generated by an AmrCore or AmrLevel object. 
*/ NamedComponentParticleContainer (amrex::AmrParGDB* amr_pgdb) - : amrex::ParticleContainer<0,0,PIdx::nattribs,0,T_Allocator>(amr_pgdb) { + : amrex::ParticleContainerPureSoA(amr_pgdb) { // build up the map of string names to particle component numbers +#if !defined (WARPX_DIM_1D_Z) + particle_comps["x"] = PIdx::x; +#endif +#if defined (WARPX_DIM_3D) + particle_comps["y"] = PIdx::y; +#endif + particle_comps["z"] = PIdx::z; particle_comps["w"] = PIdx::w; particle_comps["ux"] = PIdx::ux; particle_comps["uy"] = PIdx::uy; @@ -85,12 +107,12 @@ public: * @param p_ricomps name-to-index map for run-time integer components */ NamedComponentParticleContainer( - amrex::ParticleContainer<0,0,PIdx::nattribs,0,T_Allocator> && pc, + amrex::ParticleContainerPureSoA && pc, std::map p_comps, std::map p_icomps, std::map p_rcomps, std::map p_ricomps) - : amrex::ParticleContainer<0,0,PIdx::nattribs,0,T_Allocator>(std::move(pc)), + : amrex::ParticleContainerPureSoA(std::move(pc)), particle_comps(std::move(p_comps)), particle_icomps(std::move(p_icomps)), particle_runtime_comps(std::move(p_rcomps)), @@ -118,7 +140,7 @@ public: NamedComponentParticleContainer make_alike () const { auto tmp = NamedComponentParticleContainer( - amrex::ParticleContainer<0,0,PIdx::nattribs,0,T_Allocator>::template make_alike(), + amrex::ParticleContainerPureSoA::template make_alike(), particle_comps, particle_icomps, particle_runtime_comps, @@ -127,10 +149,10 @@ public: return tmp; } - using amrex::ParticleContainer<0,0,PIdx::nattribs,0,T_Allocator>::NumRealComps; - using amrex::ParticleContainer<0,0,PIdx::nattribs,0,T_Allocator>::NumIntComps; - using amrex::ParticleContainer<0,0,PIdx::nattribs,0,T_Allocator>::AddRealComp; - using amrex::ParticleContainer<0,0,PIdx::nattribs,0,T_Allocator>::AddIntComp; + using amrex::ParticleContainerPureSoA::NumRealComps; + using amrex::ParticleContainerPureSoA::NumIntComps; + using amrex::ParticleContainerPureSoA::AddRealComp; + using amrex::ParticleContainerPureSoA::AddIntComp; /** Allocate a new run-time real component * diff --git a/Source/Particles/ParticleBoundaryBuffer.cpp b/Source/Particles/ParticleBoundaryBuffer.cpp index 54c4396379d..88304bd8a9c 100644 --- a/Source/Particles/ParticleBoundaryBuffer.cpp +++ b/Source/Particles/ParticleBoundaryBuffer.cpp @@ -50,7 +50,7 @@ struct CopyAndTimestamp { void operator() (const DstData& dst, const SrcData& src, int src_i, int dst_i) const noexcept { - dst.m_aos[dst_i] = src.m_aos[src_i]; + dst.m_idcpu[dst_i] = src.m_idcpu[src_i]; for (int j = 0; j < SrcData::NAR; ++j) { dst.m_rdata[j][dst_i] = src.m_rdata[j][src_i]; } @@ -222,7 +222,7 @@ void ParticleBoundaryBuffer::gatherParticles (MultiParticleContainer& mypc, { WARPX_PROFILE("ParticleBoundaryBuffer::gatherParticles"); - using PIter = amrex::ParConstIter<0,0,PIdx::nattribs>; + using PIter = amrex::ParConstIterSoA; const auto& warpx_instance = WarpX::GetInstance(); const amrex::Geometry& geom = warpx_instance.Geom(0); auto plo = geom.ProbLoArray(); diff --git a/Source/Particles/ParticleCreation/SmartCopy.H b/Source/Particles/ParticleCreation/SmartCopy.H index 2c04baa18bb..6a6ceb3d290 100644 --- a/Source/Particles/ParticleCreation/SmartCopy.H +++ b/Source/Particles/ParticleCreation/SmartCopy.H @@ -26,7 +26,7 @@ * type. Second, if a given component name is found in both the src * and the dst, then the src value is copied. * - * Particle structs - positions and id numbers - are always copied. + * Particle positions and id numbers are always copied. 
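With positions and ids moved into the SoA arrays, a smart copy reduces to a name-matched copy between component maps, as the SmartCopy comment above describes. A simplified host-side sketch of that matching logic; the Tile type here is a stand-in, not the WarpX tile:

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    // Two struct-of-arrays tiles with name -> component-index maps, mirroring
    // the string-keyed component lookup of NamedComponentParticleContainer.
    struct Tile
    {
        std::map<std::string, int> comps;        // name -> component index
        std::vector<std::vector<double>> rdata;  // one array per component
    };

    // Copy every component whose name exists in both source and destination.
    void smartCopy (Tile& dst, Tile const& src, int i_src, int i_dst)
    {
        for (auto const& [name, j_src] : src.comps) {
            auto it = dst.comps.find(name);
            if (it != dst.comps.end()) {
                dst.rdata[it->second][i_dst] = src.rdata[j_src][i_src];
            }
        }
    }

    int main ()
    {
        Tile src{{{"w", 0}, {"ux", 1}}, {{1.0}, {2.0}}};
        Tile dst{{{"w", 0}},            {{0.0}}};
        smartCopy(dst, src, 0, 0);
        assert(dst.rdata[0][0] == 1.0);  // "w" copied; "ux" has no match
        return 0;
    }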
* * You don't create this directly - use the SmartCopyFactory object below. */ @@ -48,9 +48,6 @@ struct SmartCopy void operator() (DstData& dst, const SrcData& src, int i_src, int i_dst, amrex::RandomEngine const& engine) const noexcept { - // the particle struct is always copied over - dst.m_aos[i_dst] = src.m_aos[i_src]; - // initialize the real components for (int j = 0; j < DstData::NAR; ++j) { dst.m_rdata[j][i_dst] = initializeRealValue(m_policy_real[j], engine); diff --git a/Source/Particles/ParticleCreation/SmartCreate.H b/Source/Particles/ParticleCreation/SmartCreate.H index 67d7767a5d3..b4f25d5daad 100644 --- a/Source/Particles/ParticleCreation/SmartCreate.H +++ b/Source/Particles/ParticleCreation/SmartCreate.H @@ -14,6 +14,8 @@ #include #include #include +#include +#include /** * \brief This is a functor for performing a "smart create" that works @@ -47,23 +49,22 @@ struct SmartCreate const int id = 0) const noexcept { #if defined(WARPX_DIM_3D) - prt.m_aos[i_prt].pos(0) = x; - prt.m_aos[i_prt].pos(1) = y; - prt.m_aos[i_prt].pos(2) = z; + prt.m_rdata[PIdx::x][i_prt] = x; + prt.m_rdata[PIdx::y][i_prt] = y; + prt.m_rdata[PIdx::z][i_prt] = z; #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) - prt.m_aos[i_prt].pos(0) = x; - prt.m_aos[i_prt].pos(1) = z; + prt.m_rdata[PIdx::x][i_prt] = x; + prt.m_rdata[PIdx::z][i_prt] = z; amrex::ignore_unused(y); #else - prt.m_aos[i_prt].pos(0) = z; + prt.m_rdata[PIdx::z][i_prt] = z; amrex::ignore_unused(x,y); #endif - prt.m_aos[i_prt].cpu() = cpu; - prt.m_aos[i_prt].id() = id; + prt.m_idcpu[i_prt] = amrex::SetParticleIDandCPU(id, cpu); - // initialize the real components - for (int j = 0; j < PartData::NAR; ++j) { + // initialize the real components after position + for (int j = AMREX_SPACEDIM; j < PartData::NAR; ++j) { prt.m_rdata[j][i_prt] = initializeRealValue(m_policy_real[j], engine); } for (int j = 0; j < prt.m_num_runtime_real; ++j) { diff --git a/Source/Particles/ParticleCreation/SmartUtils.H b/Source/Particles/ParticleCreation/SmartUtils.H index 732a12bb729..f84734308fb 100644 --- a/Source/Particles/ParticleCreation/SmartUtils.H +++ b/Source/Particles/ParticleCreation/SmartUtils.H @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -60,12 +61,11 @@ void setNewParticleIDs (PTile& ptile, int old_size, int num_added) } const int cpuid = amrex::ParallelDescriptor::MyProc(); - auto pp = ptile.GetArrayOfStructs()().data() + old_size; + auto ptd = ptile.getParticleTileData(); amrex::ParallelFor(num_added, [=] AMREX_GPU_DEVICE (int ip) noexcept { - auto& p = pp[ip]; - p.id() = pid+ip; - p.cpu() = cpuid; + auto const new_id = ip + old_size; + ptd.m_idcpu[new_id] = amrex::SetParticleIDandCPU(pid+ip, cpuid); }); } diff --git a/Source/Particles/PhysicalParticleContainer.H b/Source/Particles/PhysicalParticleContainer.H index a12ae75f629..edf91a84526 100644 --- a/Source/Particles/PhysicalParticleContainer.H +++ b/Source/Particles/PhysicalParticleContainer.H @@ -268,11 +268,9 @@ public: * @param[in] engine the random engine, used in initialization of QED optical depths */ void DefaultInitializeRuntimeAttributes ( - amrex::ParticleTile, - NArrayReal, NArrayInt, - amrex::PinnedArenaAllocator>& pinned_tile, - int n_external_attr_real, - int n_external_attr_int) final; + typename ContainerLike::ParticleTileType& pinned_tile, + int n_external_attr_real, + int n_external_attr_int) final; /** * \brief Apply NCI Godfrey filter to all components of E and B before gather diff --git a/Source/Particles/PhysicalParticleContainer.cpp 
b/Source/Particles/PhysicalParticleContainer.cpp index 929c3c26649..08c784709fa 100644 --- a/Source/Particles/PhysicalParticleContainer.cpp +++ b/Source/Particles/PhysicalParticleContainer.cpp @@ -198,8 +198,8 @@ namespace * and avoid any possible undefined behavior before the next call to redistribute) and sets * the particle id to -1 so that it can be effectively deleted. * - * \param p particle aos data - * \param pa particle soa data + * \param idcpu particle id soa data + * \param pa particle real soa data * \param ip index for soa data * \param do_field_ionization whether species has ionization * \param pi ionization level data @@ -210,20 +210,21 @@ namespace */ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void ZeroInitializeAndSetNegativeID ( - ParticleType& p, const GpuArray& pa, long& ip, + uint64_t * AMREX_RESTRICT idcpu, + const GpuArray& pa, long& ip, const bool& do_field_ionization, int* pi #ifdef WARPX_QED - ,const bool& has_quantum_sync, amrex::ParticleReal* p_optical_depth_QSR - ,const bool& has_breit_wheeler, amrex::ParticleReal* p_optical_depth_BW + ,const bool& has_quantum_sync, amrex::ParticleReal* AMREX_RESTRICT p_optical_depth_QSR + ,const bool& has_breit_wheeler, amrex::ParticleReal* AMREX_RESTRICT p_optical_depth_BW #endif ) noexcept { - p.pos(0) = 0._rt; + pa[PIdx::z][ip] = 0._rt; #if (AMREX_SPACEDIM >= 2) - p.pos(1) = 0._rt; + pa[PIdx::x][ip] = 0._rt; #endif #if defined(WARPX_DIM_3D) - p.pos(2) = 0._rt; + pa[PIdx::y][ip] = 0._rt; #endif pa[PIdx::w ][ip] = 0._rt; pa[PIdx::ux][ip] = 0._rt; @@ -238,7 +239,7 @@ namespace if (has_breit_wheeler) {p_optical_depth_BW[ip] = 0._rt;} #endif - p.id() = -1; + idcpu[ip] = amrex::ParticleIdCpus::Invalid; } } @@ -780,11 +781,9 @@ PhysicalParticleContainer::AddPlasmaFromFile(PlasmaInjector & plasma_injector, void PhysicalParticleContainer::DefaultInitializeRuntimeAttributes ( - amrex::ParticleTile, - NArrayReal, NArrayInt, - amrex::PinnedArenaAllocator>& pinned_tile, - const int n_external_attr_real, - const int n_external_attr_int) + typename ContainerLike::ParticleTileType& pinned_tile, + int n_external_attr_real, + int n_external_attr_int) { ParticleCreation::DefaultInitializeRuntimeAttributes(pinned_tile, n_external_attr_real, n_external_attr_int, @@ -1084,7 +1083,7 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int const int max_new_particles = Scan::ExclusiveSum(counts.size(), counts.data(), offset.data()); // Update NextID to include particles created in this function - Long pid; + int pid; #ifdef AMREX_USE_OMP #pragma omp critical (add_plasma_nextid) #endif @@ -1093,7 +1092,7 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int ParticleType::NextID(pid+max_new_particles); } WARPX_ALWAYS_ASSERT_WITH_MESSAGE( - static_cast(pid + max_new_particles) < LastParticleID, + static_cast(pid) + static_cast(max_new_particles) < LongParticleIds::LastParticleID, "ERROR: overflow on particle id numbers"); const int cpuid = ParallelDescriptor::MyProc(); @@ -1104,16 +1103,16 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int DefineAndReturnParticleTile(lev, grid_id, tile_id); } - auto old_size = particle_tile.GetArrayOfStructs().size(); + auto old_size = particle_tile.size(); auto new_size = old_size + max_new_particles; particle_tile.resize(new_size); - ParticleType* pp = particle_tile.GetArrayOfStructs()().data() + old_size; auto& soa = particle_tile.GetStructOfArrays(); GpuArray pa; for (int ia = 0; ia < PIdx::nattribs; ++ia) { pa[ia] = 
soa.GetRealData(ia).data() + old_size; } + uint64_t * AMREX_RESTRICT pa_idcpu = soa.GetIdCPUData().data() + old_size; // user-defined integer and real attributes const auto n_user_int_attribs = static_cast(m_user_int_attribs.size()); const auto n_user_real_attribs = static_cast(m_user_real_attribs.size()); @@ -1226,9 +1225,7 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int for (int i_part = 0; i_part < pcounts[index]; ++i_part) { long ip = poffset[index] + i_part; - ParticleType& p = pp[ip]; - p.id() = pid+ip; - p.cpu() = cpuid; + pa_idcpu[ip] = amrex::SetParticleIDandCPU(pid+ip, cpuid); const XDim3 r = (fine_overlap_box.ok() && fine_overlap_box.contains(iv)) ? // In the refined injection region: use refinement ratio `lrrfac` inj_pos->getPositionUnitBox(i_part, lrrfac, engine) : @@ -1238,7 +1235,7 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int #if defined(WARPX_DIM_3D) if (!tile_realbox.contains(XDim3{pos.x,pos.y,pos.z})) { - ZeroInitializeAndSetNegativeID(p, pa, ip, loc_do_field_ionization, pi + ZeroInitializeAndSetNegativeID(pa_idcpu, pa, ip, loc_do_field_ionization, pi #ifdef WARPX_QED ,loc_has_quantum_sync, p_optical_depth_QSR ,loc_has_breit_wheeler, p_optical_depth_BW @@ -1249,7 +1246,7 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) amrex::ignore_unused(k); if (!tile_realbox.contains(XDim3{pos.x,pos.z,0.0_rt})) { - ZeroInitializeAndSetNegativeID(p, pa, ip, loc_do_field_ionization, pi + ZeroInitializeAndSetNegativeID(pa_idcpu, pa, ip, loc_do_field_ionization, pi #ifdef WARPX_QED ,loc_has_quantum_sync, p_optical_depth_QSR ,loc_has_breit_wheeler, p_optical_depth_BW @@ -1260,7 +1257,7 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int #else amrex::ignore_unused(j,k); if (!tile_realbox.contains(XDim3{pos.z,0.0_rt,0.0_rt})) { - ZeroInitializeAndSetNegativeID(p, pa, ip, loc_do_field_ionization, pi + ZeroInitializeAndSetNegativeID(pa_idcpu, pa, ip, loc_do_field_ionization, pi #ifdef WARPX_QED ,loc_has_quantum_sync, p_optical_depth_QSR ,loc_has_breit_wheeler, p_optical_depth_BW @@ -1299,7 +1296,7 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int const Real z0 = applyBallisticCorrection(pos, inj_mom, gamma_boost, beta_boost, t); if (!inj_pos->insideBounds(xb, yb, z0)) { - ZeroInitializeAndSetNegativeID(p, pa, ip, loc_do_field_ionization, pi + ZeroInitializeAndSetNegativeID(pa_idcpu, pa, ip, loc_do_field_ionization, pi #ifdef WARPX_QED ,loc_has_quantum_sync, p_optical_depth_QSR ,loc_has_breit_wheeler, p_optical_depth_BW @@ -1313,7 +1310,7 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int // Remove particle if density below threshold if ( dens < density_min ){ - ZeroInitializeAndSetNegativeID(p, pa, ip, loc_do_field_ionization, pi + ZeroInitializeAndSetNegativeID(pa_idcpu, pa, ip, loc_do_field_ionization, pi #ifdef WARPX_QED ,loc_has_quantum_sync, p_optical_depth_QSR ,loc_has_breit_wheeler, p_optical_depth_BW @@ -1331,7 +1328,7 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int // If the particle is not within the lab-frame zmin, zmax, etc. // go to the next generated particle. 
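amrex::SetParticleIDandCPU packs the particle id and the creating rank into the single 64-bit idcpu word that replaces the old AoS id/cpu pair. A self-contained sketch of such a packing; the 40-bit id / 24-bit cpu split follows the container documentation later in this patch, but the exact bit layout and validity handling are AMReX's, so treat this as illustrative:

    #include <cassert>
    #include <cstdint>

    // Illustrative 40-bit id / 24-bit cpu packing (not the authoritative
    // AMReX layout): id in the high bits, creating rank in the low 24 bits.
    constexpr std::uint64_t pack (std::uint64_t id, std::uint64_t cpu)
    {
        return (id << 24) | (cpu & 0xFFFFFF);
    }
    constexpr std::uint64_t unpackId  (std::uint64_t idcpu) { return idcpu >> 24; }
    constexpr std::uint64_t unpackCpu (std::uint64_t idcpu) { return idcpu & 0xFFFFFF; }

    int main ()
    {
        constexpr auto word = pack(12345, 7);
        static_assert(unpackId(word) == 12345 && unpackCpu(word) == 7, "");
        return 0;
    }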
if (!inj_pos->insideBounds(xb, yb, z0_lab)) { - ZeroInitializeAndSetNegativeID(p, pa, ip, loc_do_field_ionization, pi + ZeroInitializeAndSetNegativeID(pa_idcpu, pa, ip, loc_do_field_ionization, pi #ifdef WARPX_QED ,loc_has_quantum_sync, p_optical_depth_QSR ,loc_has_breit_wheeler, p_optical_depth_BW @@ -1343,7 +1340,7 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int dens = inj_rho->getDensity(pos.x, pos.y, z0_lab); // Remove particle if density below threshold if ( dens < density_min ){ - ZeroInitializeAndSetNegativeID(p, pa, ip, loc_do_field_ionization, pi + ZeroInitializeAndSetNegativeID(pa_idcpu, pa, ip, loc_do_field_ionization, pi #ifdef WARPX_QED ,loc_has_quantum_sync, p_optical_depth_QSR ,loc_has_breit_wheeler, p_optical_depth_BW @@ -1410,17 +1407,17 @@ PhysicalParticleContainer::AddPlasma (PlasmaInjector const& plasma_injector, int pa[PIdx::uz][ip] = u.z; #if defined(WARPX_DIM_3D) - p.pos(0) = pos.x; - p.pos(1) = pos.y; - p.pos(2) = pos.z; + pa[PIdx::x][ip] = pos.x; + pa[PIdx::y][ip] = pos.y; + pa[PIdx::z][ip] = pos.z; #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) #ifdef WARPX_DIM_RZ pa[PIdx::theta][ip] = theta; #endif - p.pos(0) = xb; - p.pos(1) = pos.z; + pa[PIdx::x][ip] = xb; + pa[PIdx::z][ip] = pos.z; #else - p.pos(0) = pos.z; + pa[PIdx::z][ip] = pos.z; #endif } }); @@ -1645,7 +1642,7 @@ PhysicalParticleContainer::AddPlasmaFlux (PlasmaInjector const& plasma_injector, const int max_new_particles = Scan::ExclusiveSum(counts.size(), counts.data(), offset.data()); // Update NextID to include particles created in this function - Long pid; + int pid; #ifdef AMREX_USE_OMP #pragma omp critical (add_plasma_nextid) #endif @@ -1654,23 +1651,23 @@ PhysicalParticleContainer::AddPlasmaFlux (PlasmaInjector const& plasma_injector, ParticleType::NextID(pid+max_new_particles); } WARPX_ALWAYS_ASSERT_WITH_MESSAGE( - static_cast(pid + max_new_particles) < LastParticleID, + static_cast(pid) + static_cast(max_new_particles) < LongParticleIds::LastParticleID, "overflow on particle id numbers"); const int cpuid = ParallelDescriptor::MyProc(); auto& particle_tile = tmp_pc.DefineAndReturnParticleTile(0, grid_id, tile_id); - auto old_size = particle_tile.GetArrayOfStructs().size(); + auto old_size = particle_tile.size(); auto new_size = old_size + max_new_particles; particle_tile.resize(new_size); - ParticleType* pp = particle_tile.GetArrayOfStructs()().data() + old_size; auto& soa = particle_tile.GetStructOfArrays(); GpuArray pa; for (int ia = 0; ia < PIdx::nattribs; ++ia) { pa[ia] = soa.GetRealData(ia).data() + old_size; } + uint64_t * AMREX_RESTRICT pa_idcpu = soa.GetIdCPUData().data() + old_size; // user-defined integer and real attributes const auto n_user_int_attribs = static_cast(m_user_int_attribs.size()); @@ -1768,9 +1765,7 @@ PhysicalParticleContainer::AddPlasmaFlux (PlasmaInjector const& plasma_injector, for (int i_part = 0; i_part < pcounts[index]; ++i_part) { const long ip = poffset[index] + i_part; - ParticleType& p = pp[ip]; - p.id() = pid+ip; - p.cpu() = cpuid; + pa_idcpu[ip] = amrex::SetParticleIDandCPU(pid+ip, cpuid); // This assumes the flux_pos is of type InjectorPositionRandomPlane const XDim3 r = (fine_overlap_box.ok() && fine_overlap_box.contains(iv)) ? @@ -1795,19 +1790,19 @@ PhysicalParticleContainer::AddPlasmaFlux (PlasmaInjector const& plasma_injector, // the particles will be within the domain. 
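Note how the id-range assertion above widens pid and max_new_particles to 64 bits before adding them, so the check cannot itself overflow now that pid is a plain int. A small sketch of why the order of widening and addition matters; the limit value is illustrative:

    #include <cassert>
    #include <cstdint>

    // Summing two ints may overflow int; summing their widened copies cannot.
    bool idsAvailable (int next_id, int n_new, std::int64_t last_valid_id)
    {
        return static_cast<std::int64_t>(next_id)
             + static_cast<std::int64_t>(n_new) < last_valid_id;
    }

    int main ()
    {
        assert(idsAvailable(100, 5, 1'000'000));
        // Near the 32-bit limit, the unwidened sum would wrap around:
        assert(!idsAvailable(2'147'483'000, 1'000, 2'147'483'647));
        return 0;
    }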
#if defined(WARPX_DIM_3D) if (!ParticleUtils::containsInclusive(tile_realbox, XDim3{ppos.x,ppos.y,ppos.z})) { - p.id() = -1; + pa_idcpu[ip] = amrex::ParticleIdCpus::Invalid; continue; } #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) amrex::ignore_unused(k); if (!ParticleUtils::containsInclusive(tile_realbox, XDim3{ppos.x,ppos.z,0.0_prt})) { - p.id() = -1; + pa_idcpu[ip] = amrex::ParticleIdCpus::Invalid; continue; } #else amrex::ignore_unused(j,k); if (!ParticleUtils::containsInclusive(tile_realbox, XDim3{ppos.z,0.0_prt,0.0_prt})) { - p.id() = -1; + pa_idcpu[ip] = amrex::ParticleIdCpus::Invalid; continue; } #endif @@ -1815,7 +1810,7 @@ PhysicalParticleContainer::AddPlasmaFlux (PlasmaInjector const& plasma_injector, // If the particle's initial position is not within or on the species's // xmin, xmax, ymin, ymax, zmin, zmax, go to the next generated particle. if (!flux_pos->insideBoundsInclusive(ppos.x, ppos.y, ppos.z)) { - p.id() = -1; + pa_idcpu[ip] = amrex::ParticleIdCpus::Invalid; continue; } @@ -1848,8 +1843,8 @@ PhysicalParticleContainer::AddPlasmaFlux (PlasmaInjector const& plasma_injector, #endif Real flux = inj_flux->getFlux(ppos.x, ppos.y, ppos.z, t); // Remove particle if flux is negative or 0 - if ( flux <=0 ){ - p.id() = -1; + if (flux <= 0) { + pa_idcpu[ip] = amrex::ParticleIdCpus::Invalid; continue; } @@ -1858,7 +1853,7 @@ PhysicalParticleContainer::AddPlasmaFlux (PlasmaInjector const& plasma_injector, } #ifdef WARPX_QED - if(loc_has_quantum_sync){ + if (loc_has_quantum_sync) { p_optical_depth_QSR[ip] = quantum_sync_get_opt(engine); } @@ -1908,18 +1903,18 @@ PhysicalParticleContainer::AddPlasmaFlux (PlasmaInjector const& plasma_injector, UpdatePosition(ppos.x, ppos.y, ppos.z, pu.x, pu.y, pu.z, t_fract); #if defined(WARPX_DIM_3D) - p.pos(0) = ppos.x; - p.pos(1) = ppos.y; - p.pos(2) = ppos.z; + pa[PIdx::x][ip] = ppos.x; + pa[PIdx::y][ip] = ppos.y; + pa[PIdx::z][ip] = ppos.z; #elif defined(WARPX_DIM_RZ) pa[PIdx::theta][ip] = std::atan2(ppos.y, ppos.x); - p.pos(0) = std::sqrt(ppos.x*ppos.x + ppos.y*ppos.y); - p.pos(1) = ppos.z; + pa[PIdx::x][ip] = std::sqrt(ppos.x*ppos.x + ppos.y*ppos.y); + pa[PIdx::z][ip] = ppos.z; #elif defined(WARPX_DIM_XZ) - p.pos(0) = ppos.x; - p.pos(1) = ppos.z; + pa[PIdx::x][ip] = ppos.x; + pa[PIdx::z][ip] = ppos.z; #else - p.pos(0) = ppos.z; + pa[PIdx::z][ip] = ppos.z; #endif } }); @@ -2342,20 +2337,22 @@ PhysicalParticleContainer::SplitParticles (int lev) split_offset[1] /= ppc_nd[1]; split_offset[2] /= ppc_nd[2]; } - // particle Array Of Structs data - auto& particles = pti.GetArrayOfStructs(); // particle Struct Of Arrays data auto& attribs = pti.GetAttribs(); auto& wp = attribs[PIdx::w ]; auto& uxp = attribs[PIdx::ux]; auto& uyp = attribs[PIdx::uy]; auto& uzp = attribs[PIdx::uz]; + + ParticleTileType& ptile = ParticlesAt(lev, pti); + auto& soa = ptile.GetStructOfArrays(); + uint64_t * const AMREX_RESTRICT idcpu = soa.GetIdCPUData().data(); + const long np = pti.numParticles(); for(int i=0; i> attr_int; pctmp_split.AddNParticles(lev, np_split_to_add, - xp, yp, zp, uxp, uyp, uzp, - 1, attr, + xp, + yp, + zp, + uxp, + uyp, + uzp, + 1, + attr, 0, attr_int, - 1, NoSplitParticleID); + 1, LongParticleIds::NoSplitParticleID); // Copy particles from tmp to current particle container constexpr bool local_flag = true; addParticles(pctmp_split,local_flag); diff --git a/Source/Particles/Pusher/GetAndSetPosition.H b/Source/Particles/Pusher/GetAndSetPosition.H index e4477a2a60d..44641557756 100644 --- a/Source/Particles/Pusher/GetAndSetPosition.H +++ 
b/Source/Particles/Pusher/GetAndSetPosition.H @@ -30,24 +30,26 @@ void get_particle_position (const WarpXParticleContainer::SuperParticleType& p, amrex::ParticleReal& y, amrex::ParticleReal& z) noexcept { -#ifdef WARPX_DIM_RZ - const amrex::ParticleReal theta = p.rdata(T_PIdx::theta); - const amrex::ParticleReal r = p.pos(0); + using namespace amrex::literals; + +#if defined(WARPX_DIM_RZ) + amrex::ParticleReal const theta = p.rdata(T_PIdx::theta); + amrex::ParticleReal const r = p.pos(T_PIdx::x); x = r*std::cos(theta); y = r*std::sin(theta); - z = p.pos(1); -#elif WARPX_DIM_3D - x = p.pos(0); - y = p.pos(1); - z = p.pos(2); -#elif WARPX_DIM_XZ - x = p.pos(0); - y = amrex::ParticleReal(0.0); - z = p.pos(1); + z = p.pos(PIdx::z); +#elif defined(WARPX_DIM_3D) + x = p.pos(PIdx::x); + y = p.pos(PIdx::y); + z = p.pos(PIdx::z); +#elif defined(WARPX_DIM_XZ) + x = p.pos(PIdx::x); + y = 0_prt; + z = p.pos(PIdx::z); #else - x = amrex::ParticleReal(0.0); - y = amrex::ParticleReal(0.0); - z = p.pos(0); + x = 0_prt; + y = 0_prt; + z = p.pos(PIdx::z); #endif } @@ -59,10 +61,19 @@ void get_particle_position (const WarpXParticleContainer::SuperParticleType& p, template struct GetParticlePosition { - using PType = WarpXParticleContainer::ParticleType; using RType = amrex::ParticleReal; - const PType* AMREX_RESTRICT m_structs = nullptr; +#if defined(WARPX_DIM_RZ) || defined(WARPX_DIM_XZ) + const RType* AMREX_RESTRICT m_x = nullptr; + const RType* AMREX_RESTRICT m_z = nullptr; +#elif defined(WARPX_DIM_3D) + const RType* AMREX_RESTRICT m_x = nullptr; + const RType* AMREX_RESTRICT m_y = nullptr; + const RType* AMREX_RESTRICT m_z = nullptr; +#elif defined(WARPX_DIM_1D_Z) + const RType* AMREX_RESTRICT m_z = nullptr; +#endif + #if defined(WARPX_DIM_RZ) const RType* m_theta = nullptr; #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) @@ -84,10 +95,19 @@ struct GetParticlePosition template GetParticlePosition (const ptiType& a_pti, long a_offset = 0) noexcept { - const auto& aos = a_pti.GetArrayOfStructs(); - m_structs = aos().dataPtr() + a_offset; -#if defined(WARPX_DIM_RZ) const auto& soa = a_pti.GetStructOfArrays(); + +#if defined(WARPX_DIM_RZ) || defined(WARPX_DIM_XZ) + m_x = soa.GetRealData(PIdx::x).dataPtr() + a_offset; + m_z = soa.GetRealData(PIdx::z).dataPtr() + a_offset; +#elif defined(WARPX_DIM_3D) + m_x = soa.GetRealData(PIdx::x).dataPtr() + a_offset; + m_y = soa.GetRealData(PIdx::y).dataPtr() + a_offset; + m_z = soa.GetRealData(PIdx::z).dataPtr() + a_offset; +#elif defined(WARPX_DIM_1D_Z) + m_z = soa.GetRealData(PIdx::z).dataPtr() + a_offset; +#endif +#if defined(WARPX_DIM_RZ) m_theta = soa.GetRealData(T_PIdx::theta).dataPtr() + a_offset; #endif } @@ -98,24 +118,23 @@ struct GetParticlePosition AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void operator() (const long i, RType& x, RType& y, RType& z) const noexcept { - const PType& p = m_structs[i]; #ifdef WARPX_DIM_RZ - const RType r = p.pos(0); + RType const r = m_x[i]; x = r*std::cos(m_theta[i]); y = r*std::sin(m_theta[i]); - z = p.pos(1); + z = m_z[i]; #elif WARPX_DIM_3D - x = p.pos(0); - y = p.pos(1); - z = p.pos(2); + x = m_x[i]; + y = m_y[i]; + z = m_z[i]; #elif WARPX_DIM_XZ - x = p.pos(0); + x = m_x[i]; y = m_y_default; - z = p.pos(1); + z = m_z[i]; #else x = m_x_default; y = m_y_default; - z = p.pos(0); + z = m_z[i]; #endif } @@ -127,23 +146,22 @@ struct GetParticlePosition AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AsStored (const long i, RType& x, RType& y, RType& z) const noexcept { - const PType& p = m_structs[i]; #ifdef WARPX_DIM_RZ - x = 
p.pos(0); + x = m_x[i]; y = m_theta[i]; - z = p.pos(1); + z = m_z[i]; #elif WARPX_DIM_3D - x = p.pos(0); - y = p.pos(1); - z = p.pos(2); + x = m_x[i]; + y = m_y[i]; + z = m_z[i]; #elif WARPX_DIM_XZ - x = p.pos(0); + x = m_x[i]; y = m_y_default; - z = p.pos(1); + z = m_z[i]; #else x = m_x_default; y = m_y_default; - z = p.pos(0); + z = m_z[i]; #endif } }; @@ -158,10 +176,18 @@ struct GetParticlePosition template struct SetParticlePosition { - using PType = WarpXParticleContainer::ParticleType; using RType = amrex::ParticleReal; - PType* AMREX_RESTRICT m_structs; +#if defined(WARPX_DIM_RZ) || defined(WARPX_DIM_XZ) + RType* AMREX_RESTRICT m_x; + RType* AMREX_RESTRICT m_z; +#elif defined(WARPX_DIM_3D) + RType* AMREX_RESTRICT m_x; + RType* AMREX_RESTRICT m_y; + RType* AMREX_RESTRICT m_z; +#elif defined(WARPX_DIM_1D_Z) + RType* AMREX_RESTRICT m_z; +#endif #if defined(WARPX_DIM_RZ) RType* AMREX_RESTRICT m_theta; #endif @@ -169,10 +195,18 @@ struct SetParticlePosition template SetParticlePosition (const ptiType& a_pti, long a_offset = 0) noexcept { - auto& aos = a_pti.GetArrayOfStructs(); - m_structs = aos().dataPtr() + a_offset; -#if defined(WARPX_DIM_RZ) auto& soa = a_pti.GetStructOfArrays(); +#if defined(WARPX_DIM_RZ) || defined(WARPX_DIM_XZ) + m_x = soa.GetRealData(PIdx::x).dataPtr() + a_offset; + m_z = soa.GetRealData(PIdx::z).dataPtr() + a_offset; +#elif defined(WARPX_DIM_3D) + m_x = soa.GetRealData(PIdx::x).dataPtr() + a_offset; + m_y = soa.GetRealData(PIdx::y).dataPtr() + a_offset; + m_z = soa.GetRealData(PIdx::z).dataPtr() + a_offset; +#elif defined(WARPX_DIM_1D_Z) + m_z = soa.GetRealData(PIdx::z).dataPtr() + a_offset; +#endif +#if defined(WARPX_DIM_RZ) m_theta = soa.GetRealData(T_PIdx::theta).dataPtr() + a_offset; #endif } @@ -190,17 +224,17 @@ struct SetParticlePosition #endif #ifdef WARPX_DIM_RZ m_theta[i] = std::atan2(y, x); - m_structs[i].pos(0) = std::sqrt(x*x + y*y); - m_structs[i].pos(1) = z; + m_x[i] = std::sqrt(x*x + y*y); + m_z[i] = z; #elif WARPX_DIM_3D - m_structs[i].pos(0) = x; - m_structs[i].pos(1) = y; - m_structs[i].pos(2) = z; + m_x[i] = x; + m_y[i] = y; + m_z[i] = z; #elif WARPX_DIM_XZ - m_structs[i].pos(0) = x; - m_structs[i].pos(1) = z; + m_x[i] = x; + m_z[i] = z; #else - m_structs[i].pos(0) = z; + m_z[i] = z; #endif } @@ -218,18 +252,18 @@ struct SetParticlePosition amrex::ignore_unused(x,y); #endif #ifdef WARPX_DIM_RZ - m_structs[i].pos(0) = x; + m_x[i] = x; m_theta[i] = y; - m_structs[i].pos(1) = z; + m_z[i] = z; #elif WARPX_DIM_3D - m_structs[i].pos(0) = x; - m_structs[i].pos(1) = y; - m_structs[i].pos(2) = z; + m_x[i] = x; + m_y[i] = y; + m_z[i] = z; #elif WARPX_DIM_XZ - m_structs[i].pos(0) = x; - m_structs[i].pos(1) = z; + m_x[i] = x; + m_z[i] = z; #else - m_structs[i].pos(0) = z; + m_z[i] = z; #endif } }; diff --git a/Source/Particles/Resampling/LevelingThinning.cpp b/Source/Particles/Resampling/LevelingThinning.cpp index 680e33ebe6a..5dc6a458f97 100644 --- a/Source/Particles/Resampling/LevelingThinning.cpp +++ b/Source/Particles/Resampling/LevelingThinning.cpp @@ -60,8 +60,7 @@ void LevelingThinning::operator() (WarpXParIter& pti, const int lev, auto& ptile = pc->ParticlesAt(lev, pti); auto& soa = ptile.GetStructOfArrays(); amrex::ParticleReal * const AMREX_RESTRICT w = soa.GetRealData(PIdx::w).data(); - WarpXParticleContainer::ParticleType * const AMREX_RESTRICT - particle_ptr = ptile.GetArrayOfStructs()().data(); + auto * const AMREX_RESTRICT idcpu = soa.GetIdCPUData().data(); // Using this function means that we must loop over the cells in the ParallelFor. 
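With positions stored as per-dimension SoA arrays, the position getters and setters above reduce to a few raw pointers selected by the build geometry. A stripped-down sketch for the XZ case; the names are stand-ins for the GetParticlePosition functor, not the WarpX types:

    #include <cassert>
    #include <vector>

    // XZ geometry: only x and z arrays exist; y is a fixed default.
    struct GetPosXZ
    {
        const double* m_x = nullptr;
        const double* m_z = nullptr;

        void operator() (long i, double& x, double& y, double& z) const
        {
            x = m_x[i];
            y = 0.0;      // no y component is stored in XZ
            z = m_z[i];
        }
    };

    int main ()
    {
        std::vector<double> xs{1.0, 2.0}, zs{3.0, 4.0};
        GetPosXZ get{xs.data(), zs.data()};
        double x, y, z;
        get(1, x, y, z);
        assert(x == 2.0 && y == 0.0 && z == 4.0);
        return 0;
    }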
In the case // of the leveling thinning algorithm, it would have possibly been more natural and more @@ -114,7 +113,7 @@ void LevelingThinning::operator() (WarpXParIter& pti, const int lev, // Remove particle with probability 1 - particle_weight/level_weight if (random_number > w[indices[i]]/level_weight) { - particle_ptr[indices[i]].id() = -1; + idcpu[indices[i]] = amrex::ParticleIdCpus::Invalid; } // Set particle weight to level weight otherwise else diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index 58511cfd5e7..58e3450f47d 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -61,7 +61,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( // Initialize temporary arrays Gpu::DeviceVector inexflag; inexflag.resize(np); - Gpu::DeviceVector pid; + Gpu::DeviceVector pid; pid.resize(np); // First, partition particles into the larger buffer @@ -109,7 +109,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( // - For each particle in the large buffer, find whether it is in // the smaller buffer, by looking up the mask. Store the answer in `inexflag`. amrex::ParallelFor( np - n_fine, - fillBufferFlagRemainingParticles(pti, bmasks, inexflag, Geom(lev), pid, n_fine) ); + fillBufferFlagRemainingParticles(pti, bmasks, inexflag, Geom(lev), pid, int(n_fine)) ); auto *const sep2 = stablePartition( sep, pid.end(), inexflag ); if (bmasks == gather_masks) { diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H index ac2c63e88f8..ba7761bf48a 100644 --- a/Source/Particles/Sorting/SortingUtils.H +++ b/Source/Particles/Sorting/SortingUtils.H @@ -12,6 +12,7 @@ #include #include +#include /** \brief Fill the elements of the input vector with consecutive integer, @@ -19,7 +20,7 @@ * * \param[inout] v Vector of integers, to be filled by this routine */ -void fillWithConsecutiveIntegers( amrex::Gpu::DeviceVector& v ); +void fillWithConsecutiveIntegers( amrex::Gpu::DeviceVector& v ); /** \brief Find the indices that would reorder the elements of `predicate` * so that the elements with non-zero value precede the other elements @@ -41,7 +42,7 @@ ForwardIterator stablePartition(ForwardIterator const index_begin, int const* AMREX_RESTRICT predicate_ptr = predicate.dataPtr(); int N = static_cast(std::distance(index_begin, index_end)); auto num_true = amrex::StablePartition(&(*index_begin), N, - [predicate_ptr] AMREX_GPU_DEVICE (long i) { return predicate_ptr[i]; }); + [predicate_ptr] AMREX_GPU_DEVICE (int i) { return predicate_ptr[i]; }); ForwardIterator sep = index_begin; std::advance(sep, num_true); @@ -49,7 +50,7 @@ ForwardIterator stablePartition(ForwardIterator const index_begin, // On CPU: Use std library ForwardIterator const sep = std::stable_partition( index_begin, index_end, - [&predicate](long i) { return predicate[i]; } + [&predicate](int i) { return predicate[i]; } ); #endif return sep; @@ -88,7 +89,7 @@ class fillBufferFlag // Extract simple structure that can be used directly on the GPU m_domain{geom.Domain()}, m_inexflag_ptr{inexflag.dataPtr()}, - m_particles{pti.GetArrayOfStructs().data()}, + m_ptd{pti.GetParticleTile().getConstParticleTileData()}, m_buffer_mask{(*bmasks)[pti].array()} { for (int idim=0; idim m_buffer_mask; amrex::GpuArray m_prob_lo; amrex::GpuArray m_inv_cell_size; @@ -141,12 +140,12 @@ class fillBufferFlagRemainingParticles amrex::iMultiFab const* bmasks, amrex::Gpu::DeviceVector& inexflag, amrex::Geometry const& geom, - 
amrex::Gpu::DeviceVector const& particle_indices, - long const start_index ) : + amrex::Gpu::DeviceVector const& particle_indices, + int start_index ) : m_domain{geom.Domain()}, // Extract simple structure that can be used directly on the GPU m_inexflag_ptr{inexflag.dataPtr()}, - m_particles{pti.GetArrayOfStructs().data()}, + m_ptd{pti.GetParticleTile().getConstParticleTileData()}, m_buffer_mask{(*bmasks)[pti].array()}, m_start_index{start_index}, m_indices_ptr{particle_indices.dataPtr()} @@ -159,11 +158,11 @@ class fillBufferFlagRemainingParticles AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - void operator()( const long i ) const { + void operator()( const int i ) const { // Select a particle - auto const& p = m_particles[m_indices_ptr[i+m_start_index]]; + auto const j = m_indices_ptr[i+m_start_index]; // Find the index of the cell where this particle is located - amrex::IntVect const iv = amrex::getParticleCell( p, + amrex::IntVect const iv = amrex::getParticleCell( m_ptd, j, m_prob_lo, m_inv_cell_size, m_domain ); // Find the value of the buffer flag in this cell and // store it at the corresponding particle position in the array `inexflag` @@ -175,10 +174,10 @@ class fillBufferFlagRemainingParticles amrex::GpuArray m_inv_cell_size; amrex::Box m_domain; int* m_inexflag_ptr; - WarpXParticleContainer::ParticleType const* m_particles; + WarpXParticleContainer::ParticleTileType::ConstParticleTileDataType const m_ptd; amrex::Array4 m_buffer_mask; - long const m_start_index; - long const* m_indices_ptr; + int const m_start_index; + int const* m_indices_ptr; }; /** \brief Functor that copies the elements of `src` into `dst`, @@ -195,7 +194,7 @@ class copyAndReorder copyAndReorder( amrex::Gpu::DeviceVector const& src, amrex::Gpu::DeviceVector& dst, - amrex::Gpu::DeviceVector const& indices ): + amrex::Gpu::DeviceVector const& indices ): // Extract simple structure that can be used directly on the GPU m_src_ptr{src.dataPtr()}, m_dst_ptr{dst.dataPtr()}, @@ -203,14 +202,14 @@ class copyAndReorder {} AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void operator()( const long ip ) const { + void operator()( const int ip ) const { m_dst_ptr[ip] = m_src_ptr[ m_indices_ptr[ip] ]; } private: T const* m_src_ptr; T* m_dst_ptr; - long const* m_indices_ptr; + int const* m_indices_ptr; }; #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_ diff --git a/Source/Particles/Sorting/SortingUtils.cpp b/Source/Particles/Sorting/SortingUtils.cpp index 699119e8e18..cd4b6a13c76 100644 --- a/Source/Particles/Sorting/SortingUtils.cpp +++ b/Source/Particles/Sorting/SortingUtils.cpp @@ -8,7 +8,7 @@ #include "SortingUtils.H" -void fillWithConsecutiveIntegers( amrex::Gpu::DeviceVector& v ) +void fillWithConsecutiveIntegers( amrex::Gpu::DeviceVector& v ) { #ifdef AMREX_USE_GPU // On GPU: Use amrex diff --git a/Source/Particles/WarpXParticleContainer.H b/Source/Particles/WarpXParticleContainer.H index 33aa71d1c7d..7d2d5619da9 100644 --- a/Source/Particles/WarpXParticleContainer.H +++ b/Source/Particles/WarpXParticleContainer.H @@ -49,10 +49,10 @@ class WarpXParIter - : public amrex::ParIter<0,0,PIdx::nattribs> + : public amrex::ParIterSoA { public: - using amrex::ParIter<0,0,PIdx::nattribs>::ParIter; + using amrex::ParIterSoA::ParIterSoA; WarpXParIter (ContainerType& pc, int level); @@ -89,13 +89,14 @@ public: * particle container classes (that store a collection of particles) derive. Derived * classes can be used for plasma particles, photon particles, or non-physical * particles (e.g., for the laser antenna). 
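The copyAndReorder functor above is a plain gather: element ip of the destination comes from position indices[ip] of the source. A host-side sketch of the same operation:

    #include <cassert>
    #include <vector>

    // Gather-reorder: dst[ip] = src[indices[ip]].
    template <typename T>
    void copyAndReorder (std::vector<T> const& src, std::vector<T>& dst,
                         std::vector<int> const& indices)
    {
        dst.resize(indices.size());
        for (std::size_t ip = 0; ip < indices.size(); ++ip) {
            dst[ip] = src[indices[ip]];
        }
    }

    int main ()
    {
        std::vector<double> src{10., 20., 30.};
        std::vector<double> dst;
        copyAndReorder(src, dst, {2, 0, 1});
        assert(dst[0] == 30. && dst[1] == 10. && dst[2] == 20.);
        return 0;
    }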
- * It derives from amrex::ParticleContainer<0,0,PIdx::nattribs>, where the - * template arguments stand for the number of int and amrex::Real SoA and AoS - * data in amrex::Particle. - * - AoS amrex::Real: x, y, z (default), 0 additional (first template - * parameter) - * - AoS int: id, cpu (default), 0 additional (second template parameter) - * - SoA amrex::Real: PIdx::nattribs (third template parameter), see PIdx for + * It derives from amrex::ParticleContainerPureSoA, where the + * template arguments stand for the number of int and amrex::Real SoA + * data in amrex::SoAParticle. + * - SoA amrex::Real: positions x, y, z, momentum ux, uy, uz, ... see PIdx for details; + * more can be added at runtime + * - SoA int: 0 attributes by default, but can be added at runtime + * - SoA uint64_t: idcpu, a global 64bit index, with a 40bit local id and a 24bit cpu id + * (both set at creation) * the list. * * WarpXParticleContainer contains the main functions for initialization, @@ -164,11 +165,9 @@ public: * class. */ virtual void DefaultInitializeRuntimeAttributes ( - amrex::ParticleTile, - NArrayReal, NArrayInt, - amrex::PinnedArenaAllocator>& pinned_tile, - int n_external_attr_real, - int n_external_attr_int) = 0; + typename ContainerLike::ParticleTileType& pinned_tile, + int n_external_attr_real, + int n_external_attr_int) = 0; /// /// This pushes the particle positions by one half time step. diff --git a/Source/Particles/WarpXParticleContainer.cpp b/Source/Particles/WarpXParticleContainer.cpp index a395198e361..0d565c039e6 100644 --- a/Source/Particles/WarpXParticleContainer.cpp +++ b/Source/Particles/WarpXParticleContainer.cpp @@ -75,13 +75,13 @@ using namespace amrex; WarpXParIter::WarpXParIter (ContainerType& pc, int level) - : amrex::ParIter<0,0,PIdx::nattribs>(pc, level, + : amrex::ParIterSoA(pc, level, MFItInfo().SetDynamic(WarpX::do_dynamic_scheduling)) { } WarpXParIter::WarpXParIter (ContainerType& pc, int level, MFItInfo& info) - : amrex::ParIter<0,0,PIdx::nattribs>(pc, level, + : amrex::ParIterSoA(pc, level, info.SetDynamic(WarpX::do_dynamic_scheduling)) { } @@ -198,52 +198,53 @@ WarpXParticleContainer::AddNParticles (int /*lev*/, long n, // Redistribute() will move them to proper places. 
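In the pure-SoA container a runtime attribute is just one more array in the tile, sized alongside the compile-time components; pinned_tile.define(NumRuntimeRealComps(), NumRuntimeIntComps()) above reserves those extra arrays up front. A simplified analogue of adding a named runtime real component; the types are stand-ins:

    #include <cassert>
    #include <string>
    #include <vector>

    // A runtime component in a pure-SoA tile is simply an extra array,
    // added on demand and addressed by name.
    struct SoATile
    {
        std::vector<std::vector<double>> rdata;  // compile-time + runtime reals
        std::vector<std::string> names;

        int addRealComp (std::string const& name)
        {
            names.push_back(name);
            // New component starts with one slot per existing particle.
            rdata.emplace_back(rdata.empty() ? std::size_t(0) : rdata.front().size());
            return static_cast<int>(rdata.size()) - 1;
        }
    };

    int main ()
    {
        SoATile tile;
        tile.addRealComp("w");                                 // compile-time-like
        int const idx = tile.addRealComp("opticalDepthQSR");   // runtime component
        assert(idx == 1 && tile.rdata.size() == 2);
        return 0;
    }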
auto& particle_tile = DefineAndReturnParticleTile(0, 0, 0); - using PinnedTile = amrex::ParticleTile, - NArrayReal, NArrayInt, - amrex::PinnedArenaAllocator>; + using PinnedTile = typename ContainerLike::ParticleTileType; PinnedTile pinned_tile; pinned_tile.define(NumRuntimeRealComps(), NumRuntimeIntComps()); const std::size_t np = iend-ibegin; #ifdef WARPX_DIM_RZ + amrex::Vector r(np); amrex::Vector theta(np); #endif for (auto i = ibegin; i < iend; ++i) { - ParticleType p; - if (id==-1) - { - p.id() = ParticleType::NextID(); - } else { - p.id() = id; + auto & idcpu_data = pinned_tile.GetStructOfArrays().GetIdCPUData(); + + amrex::Long current_id = id; // copy input + if (id == -1) { + current_id = ParticleType::NextID(); } - p.cpu() = amrex::ParallelDescriptor::MyProc(); + idcpu_data.push_back(amrex::SetParticleIDandCPU(current_id, ParallelDescriptor::MyProc())); + +#ifdef WARPX_DIM_RZ + r[i-ibegin] = std::sqrt(x[i]*x[i] + y[i]*y[i]); + theta[i-ibegin] = std::atan2(y[i], x[i]); +#endif + } + + if (np > 0) + { #if defined(WARPX_DIM_3D) - p.pos(0) = x[i]; - p.pos(1) = y[i]; - p.pos(2) = z[i]; + pinned_tile.push_back_real(PIdx::x, x.data() + ibegin, x.data() + iend); + pinned_tile.push_back_real(PIdx::y, y.data() + ibegin, y.data() + iend); + pinned_tile.push_back_real(PIdx::z, z.data() + ibegin, z.data() + iend); #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) amrex::ignore_unused(y); #ifdef WARPX_DIM_RZ - theta[i-ibegin] = std::atan2(y[i], x[i]); - p.pos(0) = std::sqrt(x[i]*x[i] + y[i]*y[i]); + pinned_tile.push_back_real(PIdx::x, r.data(), r.data() + np); #else - p.pos(0) = x[i]; + pinned_tile.push_back_real(PIdx::x, x.data() + ibegin, x.data() + iend); #endif - p.pos(1) = z[i]; + pinned_tile.push_back_real(PIdx::z, z.data() + ibegin, z.data() + iend); #else //AMREX_SPACEDIM == 1 amrex::ignore_unused(x,y); - p.pos(0) = z[i]; + pinned_tile.push_back_real(PIdx::z, z.data() + ibegin, z.data() + iend); #endif - pinned_tile.push_back(p); - } - - if (np > 0) - { - pinned_tile.push_back_real(PIdx::w , attr_real[0].data() + ibegin, attr_real[0].data() + iend); + pinned_tile.push_back_real(PIdx::w, attr_real[0].data() + ibegin, attr_real[0].data() + iend); pinned_tile.push_back_real(PIdx::ux, ux.data() + ibegin, ux.data() + iend); pinned_tile.push_back_real(PIdx::uy, uy.data() + ibegin, uy.data() + iend); pinned_tile.push_back_real(PIdx::uz, uz.data() + ibegin, uz.data() + iend); @@ -476,15 +477,14 @@ WarpXParticleContainer::DepositCurrent (WarpXParIter& pti, //sort particles by bin WARPX_PROFILE_VAR_START(blp_sort); - amrex::DenseBins bins; + amrex::DenseBins bins; { auto& ptile = ParticlesAt(lev, pti); - auto& aos = ptile.GetArrayOfStructs(); - auto *pstruct_ptr = aos().dataPtr(); + auto ptd = ptile.getParticleTileData(); const int ntiles = numTilesInBox(box, true, bin_size); - bins.build(ptile.numParticles(), pstruct_ptr, ntiles, + bins.build(ptile.numParticles(), ptd, ntiles, [=] AMREX_GPU_HOST_DEVICE (const ParticleType& p) -> unsigned int { Box tbox; @@ -947,7 +947,7 @@ WarpXParticleContainer::DepositCharge (WarpXParIter& pti, RealVector const& wp, // HACK - sort particles by bin here. 
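AddNParticles now fills the pinned tile column by column with push_back_real ranges instead of pushing whole AoS particles one at a time. A sketch of the column-wise append on plain vectors:

    #include <cassert>
    #include <vector>

    // Column-wise append: copy a [beg, end) range onto the end of one
    // component array; repeated once per component instead of per particle.
    void push_back_real (std::vector<double>& comp, const double* beg, const double* end)
    {
        comp.insert(comp.end(), beg, end);
    }

    int main ()
    {
        std::vector<double> x, z, w;                 // SoA component arrays
        const double xs[] = {0.1, 0.2}, zs[] = {1.0, 2.0}, ws[] = {1e10, 1e10};
        push_back_real(x, xs, xs + 2);
        push_back_real(z, zs, zs + 2);
        push_back_real(w, ws, ws + 2);
        assert(x.size() == 2 && z.size() == 2 && w.size() == 2);
        return 0;
    }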
WARPX_PROFILE_VAR_START(blp_sort); - amrex::DenseBins bins; + amrex::DenseBins bins; { const Geometry& geom = Geom(lev); const auto dxi = geom.InvCellSizeArray(); @@ -955,16 +955,15 @@ WarpXParticleContainer::DepositCharge (WarpXParIter& pti, RealVector const& wp, const auto domain = geom.Domain(); auto& ptile = ParticlesAt(lev, pti); - auto& aos = ptile.GetArrayOfStructs(); - auto *pstruct_ptr = aos().dataPtr(); + auto ptd = ptile.getParticleTileData(); Box box = pti.validbox(); box.grow(ng_rho); const amrex::IntVect bin_size = WarpX::shared_tilesize; const int ntiles = numTilesInBox(box, true, bin_size); - bins.build(ptile.numParticles(), pstruct_ptr, ntiles, - [=] AMREX_GPU_HOST_DEVICE (const ParticleType& p) -> unsigned int + bins.build(ptile.numParticles(), ptd, ntiles, + [=] AMREX_GPU_HOST_DEVICE (ParticleType const & p) -> unsigned int { Box tbx; auto iv = getParticleCell(p, plo, dxi, domain); @@ -984,8 +983,7 @@ WarpXParticleContainer::DepositCharge (WarpXParIter& pti, RealVector const& wp, const auto domain = geom.Domain(); auto& ptile = ParticlesAt(lev, pti); - auto& aos = ptile.GetArrayOfStructs(); - auto *pstruct_ptr = aos().dataPtr(); + auto ptd = ptile.getParticleTileData(); Box box = pti.validbox(); box.grow(ng_rho); @@ -999,9 +997,10 @@ WarpXParticleContainer::DepositCharge (WarpXParIter& pti, RealVector const& wp, const auto bin_start = offsets_ptr[ibin]; const auto bin_stop = offsets_ptr[ibin+1]; if (bin_start < bin_stop) { - auto p = pstruct_ptr[permutation[bin_start]]; + // static_cast until https://github.com/AMReX-Codes/amrex/pull/3684 + auto const i = static_cast(permutation[bin_start]); Box tbx; - auto iv = getParticleCell(p, plo, dxi, domain); + auto iv = getParticleCell(ptd, i, plo, dxi, domain); AMREX_ASSERT(box.contains(iv)); [[maybe_unused]] auto tid = getTileIndex(iv, box, true, bin_size, tbx); AMREX_ASSERT(tid == ibin); @@ -1490,10 +1489,10 @@ WarpXParticleContainer::particlePostLocate(ParticleType& p, // Tag particle if goes to higher level. 
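The bins built above group particle indices by cell in the usual counting-sort fashion: count the particles per bin, take a prefix sum to get offsets, then scatter indices into a permutation array that the deposition loop walks bin by bin. A compact serial sketch of that construction, under the assumption of this standard offsets-plus-permutation layout:

    #include <cassert>
    #include <vector>

    struct Bins
    {
        std::vector<int> offsets;      // bin b occupies [offsets[b], offsets[b+1])
        std::vector<int> permutation;  // particle indices grouped by bin
    };

    Bins build (std::vector<int> const& bin_of, int nbins)
    {
        Bins b;
        b.offsets.assign(nbins + 1, 0);
        for (int cell : bin_of) { ++b.offsets[cell + 1]; }            // histogram
        for (int i = 0; i < nbins; ++i) { b.offsets[i + 1] += b.offsets[i]; }
        b.permutation.resize(bin_of.size());
        std::vector<int> cursor(b.offsets.begin(), b.offsets.end() - 1);
        for (int i = 0; i < static_cast<int>(bin_of.size()); ++i) {
            b.permutation[cursor[bin_of[i]]++] = i;                   // scatter
        }
        return b;
    }

    int main ()
    {
        auto b = build({1, 0, 1, 0}, 2);   // items 1,3 in bin 0; items 0,2 in bin 1
        assert(b.offsets[1] == 2 && b.permutation[0] == 1 && b.permutation[2] == 0);
        return 0;
    }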
// It will be split later in the loop if (pld.m_lev == lev+1 - and p.id() != NoSplitParticleID + and p.id() != amrex::LongParticleIds::NoSplitParticleID and p.id() >= 0) { - p.id() = DoSplitParticleID; + p.id() = amrex::LongParticleIds::DoSplitParticleID; } if (pld.m_lev == lev-1){ @@ -1532,9 +1531,9 @@ WarpXParticleContainer::ApplyBoundaryConditions (){ const Real zmax = Geom(lev).ProbHi(WARPX_ZINDEX); ParticleTileType& ptile = ParticlesAt(lev, pti); - ParticleType * const pp = ptile.GetArrayOfStructs()().data(); auto& soa = ptile.GetStructOfArrays(); + uint64_t * const AMREX_RESTRICT idcpu = soa.GetIdCPUData().data(); amrex::ParticleReal * const AMREX_RESTRICT ux = soa.GetRealData(PIdx::ux).data(); amrex::ParticleReal * const AMREX_RESTRICT uy = soa.GetRealData(PIdx::uy).data(); amrex::ParticleReal * const AMREX_RESTRICT uz = soa.GetRealData(PIdx::uz).data(); @@ -1543,10 +1542,9 @@ WarpXParticleContainer::ApplyBoundaryConditions (){ amrex::ParallelForRNG( pti.numParticles(), [=] AMREX_GPU_DEVICE (long i, amrex::RandomEngine const& engine) { - ParticleType& p = pp[i]; - // skip particles that are already flagged for removal - if (p.id() < 0) { return; } + auto pidw = amrex::ParticleIDWrapper{idcpu[i]}; + if (!pidw.is_valid()) { return; } ParticleReal x, y, z; GetPosition.AsStored(i, x, y, z); @@ -1568,7 +1566,7 @@ WarpXParticleContainer::ApplyBoundaryConditions (){ boundary_conditions, engine); if (particle_lost) { - p.id() = -p.id(); + pidw.make_invalid(); } else { SetPosition.AsStored(i, x, y, z); } diff --git a/Source/Python/Particles/ParticleBoundaryBuffer.cpp b/Source/Python/Particles/ParticleBoundaryBuffer.cpp index 2a35faece9b..b04ac75e600 100644 --- a/Source/Python/Particles/ParticleBoundaryBuffer.cpp +++ b/Source/Python/Particles/ParticleBoundaryBuffer.cpp @@ -10,13 +10,13 @@ namespace warpx { class BoundaryBufferParIter - : public amrex::ParIter<0,0,PIdx::nattribs,0,amrex::PinnedArenaAllocator> + : public amrex::ParIterSoA { public: - using amrex::ParIter<0,0,PIdx::nattribs,0,amrex::PinnedArenaAllocator>::ParIter; + using amrex::ParIterSoA::ParIterSoA; BoundaryBufferParIter(ContainerType& pc, int level) : - amrex::ParIter<0,0,PIdx::nattribs,0,amrex::PinnedArenaAllocator>(pc, level) {} + amrex::ParIterSoA(pc, level) {} }; } @@ -24,9 +24,9 @@ void init_BoundaryBufferParIter (py::module& m) { py::class_< warpx::BoundaryBufferParIter, - amrex::ParIter<0,0,PIdx::nattribs,0,amrex::PinnedArenaAllocator> + amrex::ParIterSoA >(m, "BoundaryBufferParIter") - .def(py::init::ContainerType&, int>(), + .def(py::init::ContainerType&, int>(), py::arg("particle_container"), py::arg("level") ) ; diff --git a/Source/Python/Particles/PinnedMemoryParticleContainer.cpp b/Source/Python/Particles/PinnedMemoryParticleContainer.cpp index 600d56a62c9..d4f6a422dbe 100644 --- a/Source/Python/Particles/PinnedMemoryParticleContainer.cpp +++ b/Source/Python/Particles/PinnedMemoryParticleContainer.cpp @@ -13,6 +13,6 @@ void init_PinnedMemoryParticleContainer (py::module& m) { py::class_< PinnedMemoryParticleContainer, - amrex::ParticleContainer<0,0,PIdx::nattribs,0,amrex::PinnedArenaAllocator> + amrex::ParticleContainerPureSoA > pmpc (m, "PinnedMemoryParticleContainer"); } diff --git a/Source/Python/Particles/WarpXParticleContainer.cpp b/Source/Python/Particles/WarpXParticleContainer.cpp index 1473a750941..07793a373f3 100644 --- a/Source/Python/Particles/WarpXParticleContainer.cpp +++ b/Source/Python/Particles/WarpXParticleContainer.cpp @@ -12,11 +12,11 @@ void init_WarpXParIter (py::module& m) { py::class_< - 
WarpXParIter, amrex::ParIter<0,0,PIdx::nattribs> + WarpXParIter, amrex::ParIterSoA >(m, "WarpXParIter") - .def(py::init::ContainerType&, int>(), + .def(py::init::ContainerType&, int>(), py::arg("particle_container"), py::arg("level")) - .def(py::init::ContainerType&, int, amrex::MFItInfo&>(), + .def(py::init::ContainerType&, int, amrex::MFItInfo&>(), py::arg("particle_container"), py::arg("level"), py::arg("info")) ; @@ -26,11 +26,11 @@ void init_WarpXParticleContainer (py::module& m) { py::class_< WarpXParticleContainer, - amrex::ParticleContainer<0, 0, PIdx::nattribs, 0> + amrex::ParticleContainerPureSoA > wpc (m, "WarpXParticleContainer"); wpc .def("add_real_comp", - [](WarpXParticleContainer& pc, const std::string& name, bool const comm) { pc.AddRealComp(name, comm); }, + [](WarpXParticleContainer& pc, const std::string& name, bool comm) { pc.AddRealComp(name, comm); }, py::arg("name"), py::arg("comm") ) .def("add_n_particles", @@ -93,6 +93,14 @@ void init_WarpXParticleContainer (py::module& m) }, py::arg("comp_name") ) + .def("get_icomp_index", + [](WarpXParticleContainer& pc, std::string comp_name) + { + auto particle_comps = pc.getParticleiComps(); + return particle_comps.at(comp_name); + }, + py::arg("comp_name") + ) .def("num_local_tiles_at_level", &WarpXParticleContainer::numLocalTilesAtLevel, py::arg("level") diff --git a/Source/Utils/ParticleUtils.H b/Source/Utils/ParticleUtils.H index b04176d4d83..7e3c89228ea 100644 --- a/Source/Utils/ParticleUtils.H +++ b/Source/Utils/ParticleUtils.H @@ -28,9 +28,10 @@ namespace ParticleUtils { * @param[in] mfi the MultiFAB iterator. * @param[in] ptile the particle tile. */ - amrex::DenseBins - findParticlesInEachCell(int lev, amrex::MFIter const& mfi, - WarpXParticleContainer::ParticleTileType const& ptile); + amrex::DenseBins + findParticlesInEachCell (int lev, + amrex::MFIter const & mfi, + WarpXParticleContainer::ParticleTileType & ptile); /** * \brief Return (relativistic) particle energy given velocity and mass. diff --git a/Source/Utils/ParticleUtils.cpp b/Source/Utils/ParticleUtils.cpp index 60e04f12b86..b8207b61fa0 100644 --- a/Source/Utils/ParticleUtils.cpp +++ b/Source/Utils/ParticleUtils.cpp @@ -22,24 +22,28 @@ #include #include -namespace ParticleUtils { +namespace ParticleUtils +{ using namespace amrex; + // Define shortcuts for frequently-used type names - using ParticleType = WarpXParticleContainer::ParticleType; - using ParticleTileType = WarpXParticleContainer::ParticleTileType; - using ParticleBins = DenseBins; - using index_type = ParticleBins::index_type; + using ParticleType = typename WarpXParticleContainer::ParticleType; + using ParticleTileType = typename WarpXParticleContainer::ParticleTileType; + using ParticleTileDataType = typename ParticleTileType::ParticleTileDataType; + using ParticleBins = DenseBins; + using index_type = typename ParticleBins::index_type; /* Find the particles and count the particles that are in each cell. 
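The binning lambda in findParticlesInEachCell (next hunk) maps a position to its cell index: subtract the domain lower corner, multiply by the inverse cell size, and shift by the tile box lower bound. The same arithmetic in isolation, with plain doubles standing in for the AMReX types:

    #include <cassert>

    // pos -> cell index: (pos - plo) * dxi gives the cell in domain
    // coordinates; subtracting the box lower corner makes it tile-local.
    int cellIndex (double pos, double plo, double dxi, int lo)
    {
        return static_cast<int>((pos - plo) * dxi - lo);
    }

    int main ()
    {
        // Domain starts at 0.0, cell size 0.5 (so dxi = 2.0), box starts at cell 0.
        assert(cellIndex(1.2, 0.0, 2.0, 0) == 2);   // 1.2 lies in cell [1.0, 1.5)
        return 0;
    }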
Note that this does *not* rearrange particle arrays */ ParticleBins - findParticlesInEachCell( int const lev, MFIter const& mfi, - ParticleTileType const& ptile) { + findParticlesInEachCell (int lev, + MFIter const & mfi, + ParticleTileType & ptile) { // Extract particle structures for this tile int const np = ptile.numParticles(); - ParticleType const* particle_ptr = ptile.GetArrayOfStructs()().data(); + auto ptd = ptile.getParticleTileData(); // Extract box properties Geometry const& geom = WarpX::GetInstance().Geom(lev); @@ -51,9 +55,9 @@ namespace ParticleUtils { // Find particles that are in each cell; // results are stored in the object `bins`. ParticleBins bins; - bins.build(np, particle_ptr, cbx, + bins.build(np, ptd, cbx, // Pass lambda function that returns the cell index - [=] AMREX_GPU_DEVICE (const ParticleType& p) noexcept + [=] AMREX_GPU_DEVICE (ParticleType const & p) noexcept -> amrex::IntVect { return IntVect{AMREX_D_DECL( static_cast((p.pos(0)-plo[0])*dxi[0] - lo.x), @@ -64,4 +68,4 @@ namespace ParticleUtils { return bins; } -} +} // namespace ParticleUtils diff --git a/Source/ablastr/particles/IndexHandling.H b/Source/ablastr/particles/IndexHandling.H deleted file mode 100644 index 0ad5ca60446..00000000000 --- a/Source/ablastr/particles/IndexHandling.H +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright 2019-2022 Axel Huebl - * - * This file is part of WarpX. - * - * License: BSD-3-Clause-LBNL - */ -#ifndef ABLASTR_INDEX_HANDLING_H -#define ABLASTR_INDEX_HANDLING_H - -#include - - -namespace ablastr::particles { - - /** A helper function to derive a globally unique particle ID - * - * @param[in] id AMReX particle ID (on local cpu/rank), AoS .id - * @param[in] cpu AMReX particle CPU (rank) at creation of the particle, AoS .cpu - * @return global particle ID that is unique and permanent in the whole simulation - */ - constexpr uint64_t - localIDtoGlobal (int const id, int const cpu) - { - static_assert(sizeof(int) * 2u <= sizeof(uint64_t), - "int size might cause collisions in global IDs"); - // implementation: - // - we cast both 32-bit (or smaller) ints to a 64bit unsigned int - // - this will leave half of the "upper range" bits in the 64bit unsigned int zeroed out - // because the corresponding (extended) value range was not part of the value range in - // the int representation - // - we bit-shift the cpu into the upper half of zero bits in the 64 bit unsigned int - // (imagine this step as "adding a per-cpu/rank offset to the local integers") - // - then we add this offset - // note: the add is expressed as bitwise OR (|) since this saves us from writing - // brackets for operator precedence between + and << - return uint64_t(id) | uint64_t(cpu) << 32u; - } - -} // namespace ablastr::particles - -#endif // ABLASTR_INDEX_HANDLING_H diff --git a/Source/ablastr/particles/ParticleMoments.H b/Source/ablastr/particles/ParticleMoments.H index e45fb574cce..b648ccb28aa 100644 --- a/Source/ablastr/particles/ParticleMoments.H +++ b/Source/ablastr/particles/ParticleMoments.H @@ -35,7 +35,7 @@ namespace particles { amrex::ParticleReal, amrex::ParticleReal> MinAndMaxPositions (T_PC const & pc) { - using PType = typename T_PC::SuperParticleType; + using ConstParticleTileDataType = typename T_PC::ParticleTileType::ConstParticleTileDataType; // Get min and max for the local rank amrex::ReduceOps< @@ -46,11 +46,11 @@ namespace particles { amrex::ParticleReal, amrex::ParticleReal, amrex::ParticleReal> >( pc, - [=] AMREX_GPU_DEVICE(PType const & p) noexcept + [=] AMREX_GPU_DEVICE(const 
ConstParticleTileDataType& ptd, const int i) noexcept { - amrex::ParticleReal const x = p.pos(0); - amrex::ParticleReal const y = p.pos(1); - amrex::ParticleReal const z = p.pos(2); + const amrex::ParticleReal x = ptd.rdata(0)[i]; + const amrex::ParticleReal y = ptd.rdata(1)[i]; + const amrex::ParticleReal z = ptd.rdata(2)[i]; return amrex::makeTuple(x, y, z, x, y, z); }, @@ -90,7 +90,8 @@ namespace particles { amrex::ParticleReal, amrex::ParticleReal> MeanAndStdPositions (T_PC const & pc) { - using PType = typename T_PC::SuperParticleType; + + using ConstParticleTileDataType = typename T_PC::ParticleTileType::ConstParticleTileDataType; amrex::ReduceOps< amrex::ReduceOpSum, amrex::ReduceOpSum, amrex::ReduceOpSum, @@ -103,12 +104,14 @@ namespace particles { amrex::ParticleReal> >( pc, - [=] AMREX_GPU_DEVICE(const PType& p) noexcept + [=] AMREX_GPU_DEVICE(const ConstParticleTileDataType& ptd, const int i) noexcept { - amrex::ParticleReal const x = p.pos(0); - amrex::ParticleReal const y = p.pos(1); - amrex::ParticleReal const z = p.pos(2); - amrex::ParticleReal const w = p.rdata(T_RealSoAWeight); + + const amrex::ParticleReal x = ptd.rdata(0)[i]; + const amrex::ParticleReal y = ptd.rdata(1)[i]; + const amrex::ParticleReal z = ptd.rdata(2)[i]; + + const amrex::ParticleReal w = ptd.rdata(T_RealSoAWeight)[i]; return amrex::makeTuple(x, x*x, y, y*y, z, z*z, w); }, diff --git a/cmake/dependencies/AMReX.cmake b/cmake/dependencies/AMReX.cmake index 0f6a15a5ff4..81f5a533a76 100644 --- a/cmake/dependencies/AMReX.cmake +++ b/cmake/dependencies/AMReX.cmake @@ -269,7 +269,7 @@ set(WarpX_amrex_src "" set(WarpX_amrex_repo "https://github.com/AMReX-Codes/amrex.git" CACHE STRING "Repository URI to pull and build AMReX from if(WarpX_amrex_internal)") -set(WarpX_amrex_branch "24.02" +set(WarpX_amrex_branch "296ed40e16ae1877640f5b78e9162dbd4ba1c279" CACHE STRING "Repository branch for WarpX_amrex_repo if(WarpX_amrex_internal)") diff --git a/cmake/dependencies/pyAMReX.cmake b/cmake/dependencies/pyAMReX.cmake index b4cf9f3f9c1..8a9e35c6579 100644 --- a/cmake/dependencies/pyAMReX.cmake +++ b/cmake/dependencies/pyAMReX.cmake @@ -79,7 +79,7 @@ option(WarpX_pyamrex_internal "Download & build pyAMReX" ON) set(WarpX_pyamrex_repo "https://github.com/AMReX-Codes/pyamrex.git" CACHE STRING "Repository URI to pull and build pyamrex from if(WarpX_pyamrex_internal)") -set(WarpX_pyamrex_branch "24.02" +set(WarpX_pyamrex_branch "defb663d74ef9f50183b31c5dc9731cf6adb447c" CACHE STRING "Repository branch for WarpX_pyamrex_repo if(WarpX_pyamrex_internal)") diff --git a/run_test.sh b/run_test.sh index e1b45ab7c28..6d8a1ddb014 100755 --- a/run_test.sh +++ b/run_test.sh @@ -68,7 +68,7 @@ python3 -m pip install --upgrade -r warpx/Regression/requirements.txt # Clone AMReX and warpx-data git clone https://github.com/AMReX-Codes/amrex.git -cd amrex && git checkout --detach 24.02 && cd - +cd amrex && git checkout --detach 296ed40e16ae1877640f5b78e9162dbd4ba1c279 && cd - # warpx-data contains various required data sets git clone --depth 1 https://github.com/ECP-WarpX/warpx-data.git # openPMD-example-datasets contains various required data sets From a9d8126b500e1c7197eb0ed1e52fd50bb09cbdf4 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Mon, 5 Feb 2024 04:15:34 -0800 Subject: [PATCH 10/13] Fix: Pre-Installed AMReX w/ CUDA (#4668) Fix CMake language activation with pre-installed AMReX using the CUDA backend. 
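As a reproduction sketch for context: the code path below is only reached when
an external (pre-installed) AMReX is consumed via find_package() instead of the
internal superbuild. The source/install paths and -j values are placeholders,
not part of this patch:

    # build and install AMReX with the CUDA backend
    cmake -S amrex -B amrex/build -DAMReX_GPU_BACKEND=CUDA -DCMAKE_INSTALL_PREFIX=$HOME/sw/amrex
    cmake --build amrex/build --target install -j 4

    # point WarpX at the pre-installed AMReX; before this fix, configuring
    # with WarpX_COMPUTE=CUDA this way could fail because the consuming CMake
    # project never ran enable_language(CUDA)
    cmake -S WarpX -B WarpX/build -DWarpX_COMPUTE=CUDA \
          -DWarpX_amrex_internal=OFF -DAMReX_ROOT=$HOME/sw/amrex
    cmake --build WarpX/build -j 4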
---
 cmake/dependencies/AMReX.cmake | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cmake/dependencies/AMReX.cmake b/cmake/dependencies/AMReX.cmake
index 81f5a533a76..6c7c9466dfd 100644
--- a/cmake/dependencies/AMReX.cmake
+++ b/cmake/dependencies/AMReX.cmake
@@ -257,6 +257,10 @@ macro(find_amrex)
         list(APPEND CMAKE_MODULE_PATH "${AMReX_DIR}/AMReXCMakeModules")

         message(STATUS "AMReX: Found version '${AMReX_VERSION}'")
+
+        if(WarpX_COMPUTE STREQUAL CUDA)
+            enable_language(CUDA)
+        endif()
     endif()
 endmacro()

From 7e368134be037599d8ed1983a04f400765dd719b Mon Sep 17 00:00:00 2001
From: "S. Eric Clark" <25495882+clarkse@users.noreply.github.com>
Date: Mon, 5 Feb 2024 16:26:55 -0800
Subject: [PATCH 11/13] Add hybrid resistivity current term (#4661)

* Adding total current magnitude dependence in hybrid resistivity.

* Removed dead line of code.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Adding logic to only do current interpolation when the resistivity has J dependence.

* Fixing staggering bug and changing how squares are computed.

* Changing to using std::sqrt and adding _rt to value initialization for jtot_val.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 Docs/source/usage/parameters.rst      |  2 +-
 Python/pywarpx/picmi.py               |  2 +-
 .../HybridPICModel/HybridPICModel.H   |  3 +-
 .../HybridPICModel/HybridPICModel.cpp |  8 ++-
 .../HybridPICSolveE.cpp               | 68 +++++++++++++++++--
 5 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/Docs/source/usage/parameters.rst b/Docs/source/usage/parameters.rst
index 493e8307037..d26c22e6dea 100644
--- a/Docs/source/usage/parameters.rst
+++ b/Docs/source/usage/parameters.rst
@@ -2247,7 +2247,7 @@ Maxwell solver: kinetic-fluid hybrid
     If ``algo.maxwell_solver`` is set to ``hybrid``, this sets the exponent used to calculate
     the electron pressure (see :ref:`here <theory-kinetic-fluid-hybrid-model>`).

-* ``hybrid_pic_model.plasma_resistivity(rho)`` (`float` or `str`) optional (default ``0``)
+* ``hybrid_pic_model.plasma_resistivity(rho,J)`` (`float` or `str`) optional (default ``0``)
     If ``algo.maxwell_solver`` is set to ``hybrid``, this sets the plasma resistivity in :math:`\Omega m`.

 * ``hybrid_pic_model.J[x/y/z]_external_grid_function(x, y, z, t)`` (`float` or `str`) optional (default ``0``)
diff --git a/Python/pywarpx/picmi.py b/Python/pywarpx/picmi.py
index 89bd5af2eab..f11ecb379f2 100644
--- a/Python/pywarpx/picmi.py
+++ b/Python/pywarpx/picmi.py
@@ -1184,7 +1184,7 @@ def solver_initialize_inputs(self):
         pywarpx.hybridpicmodel.gamma = self.gamma
         pywarpx.hybridpicmodel.n_floor = self.n_floor
         pywarpx.hybridpicmodel.__setattr__(
-            'plasma_resistivity(rho)',
+            'plasma_resistivity(rho,J)',
             pywarpx.my_constants.mangle_expression(self.plasma_resistivity, self.mangle_dict)
         )
         pywarpx.hybridpicmodel.substeps = self.substeps
diff --git a/Source/FieldSolver/FiniteDifferenceSolver/HybridPICModel/HybridPICModel.H b/Source/FieldSolver/FiniteDifferenceSolver/HybridPICModel/HybridPICModel.H
index d1931a71765..23ef49b58cb 100644
--- a/Source/FieldSolver/FiniteDifferenceSolver/HybridPICModel/HybridPICModel.H
+++ b/Source/FieldSolver/FiniteDifferenceSolver/HybridPICModel/HybridPICModel.H
@@ -172,7 +172,8 @@ public:
     /** Plasma resistivity */
     std::string m_eta_expression = "0.0";
     std::unique_ptr<amrex::Parser> m_resistivity_parser;
-    amrex::ParserExecutor<1> m_eta;
+    amrex::ParserExecutor<2> m_eta;
+    bool m_resistivity_has_J_dependence = false;

     /** External current */
     std::string m_Jx_ext_grid_function = "0.0";
diff --git a/Source/FieldSolver/FiniteDifferenceSolver/HybridPICModel/HybridPICModel.cpp b/Source/FieldSolver/FiniteDifferenceSolver/HybridPICModel/HybridPICModel.cpp
index fb7e90f21a1..034bb71efbc 100644
--- a/Source/FieldSolver/FiniteDifferenceSolver/HybridPICModel/HybridPICModel.cpp
+++ b/Source/FieldSolver/FiniteDifferenceSolver/HybridPICModel/HybridPICModel.cpp
@@ -37,7 +37,7 @@ void HybridPICModel::ReadParameters ()
         Abort("hybrid_pic_model.n0_ref should be specified if hybrid_pic_model.gamma != 1");
     }

-    pp_hybrid.query("plasma_resistivity(rho)", m_eta_expression);
+    pp_hybrid.query("plasma_resistivity(rho,J)", m_eta_expression);
     utils::parser::queryWithParser(pp_hybrid, "n_floor", m_n_floor);

     // convert electron temperature from eV to J
@@ -123,8 +123,10 @@ void HybridPICModel::ClearLevel (int lev)
 void HybridPICModel::InitData ()
 {
     m_resistivity_parser = std::make_unique<amrex::Parser>(
-        utils::parser::makeParser(m_eta_expression, {"rho"}));
-    m_eta = m_resistivity_parser->compile<1>();
+        utils::parser::makeParser(m_eta_expression, {"rho","J"}));
+    m_eta = m_resistivity_parser->compile<2>();
+    const std::set<std::string> resistivity_symbols = m_resistivity_parser->symbols();
+    m_resistivity_has_J_dependence += resistivity_symbols.count("J");

     m_J_external_parser[0] = std::make_unique<amrex::Parser>(
         utils::parser::makeParser(m_Jx_ext_grid_function,{"x","y","z","t"}));
diff --git a/Source/FieldSolver/FiniteDifferenceSolver/HybridPICSolveE.cpp b/Source/FieldSolver/FiniteDifferenceSolver/HybridPICSolveE.cpp
index 1a72fee53c2..5100eed0df3 100644
--- a/Source/FieldSolver/FiniteDifferenceSolver/HybridPICSolveE.cpp
+++ b/Source/FieldSolver/FiniteDifferenceSolver/HybridPICSolveE.cpp
@@ -433,6 +433,7 @@ void FiniteDifferenceSolver::HybridPICSolveECylindrical (
     // get hybrid model parameters
     const auto eta = hybrid_model->m_eta;
     const auto rho_floor = hybrid_model->m_n_floor * PhysConst::q_e;
+    const auto resistivity_has_J_dependence = hybrid_model->m_resistivity_has_J_dependence;

     // Index type required for interpolating fields from their respective
     // staggering to the Ex, Ey, Ez locations
@@ -589,6 +590,15 @@ void FiniteDifferenceSolver::HybridPICSolveECylindrical (
             // Interpolate to get the appropriate charge density in space
             Real rho_val = Interp(rho, nodal, Er_stag, coarsen, i, j, 0, 0);

+            // Interpolate current to appropriate staggering to match E field
+            Real jtot_val = 0._rt;
+            if (include_resistivity_term && resistivity_has_J_dependence) {
+                Real jr_val = Interp(Jr, Jr_stag, Er_stag, coarsen, i, j, 0, 0);
+                Real jt_val = Interp(Jt, Jt_stag, Er_stag, coarsen, i, j, 0, 0);
+                Real jz_val = Interp(Jz, Jz_stag, Er_stag, coarsen, i, j, 0, 0);
+                jtot_val = std::sqrt(jr_val*jr_val + jt_val*jt_val + jz_val*jz_val);
+            }
+
             // safety condition since we divide by rho_val later
             if (rho_val < rho_floor) { rho_val = rho_floor; }
@@ -601,7 +611,7 @@
             Er(i, j, 0) = (enE_r - grad_Pe) / rho_val;

             // Add resistivity only if E field value is used to update B
-            if (include_resistivity_term) { Er(i, j, 0) += eta(rho_val) * Jr(i, j, 0); }
+            if (include_resistivity_term) { Er(i, j, 0) += eta(rho_val, jtot_val) * Jr(i, j, 0); }
         },

         // Et calculation
@@ -622,6 +632,15 @@
             // Interpolate to get the appropriate charge density in space
             Real rho_val = Interp(rho, nodal, Er_stag, coarsen, i, j, 0, 0);

+            // Interpolate current to appropriate staggering to match E field
+            Real jtot_val = 0._rt;
+            if (include_resistivity_term && resistivity_has_J_dependence) {
+                Real jr_val = Interp(Jr, Jr_stag, Et_stag, coarsen, i, j, 0, 0);
+                Real jt_val = Interp(Jt, Jt_stag, Et_stag, coarsen, i, j, 0, 0);
+                Real jz_val = Interp(Jz, Jz_stag, Et_stag, coarsen, i, j, 0, 0);
+                jtot_val = std::sqrt(jr_val*jr_val + jt_val*jt_val + jz_val*jz_val);
+            }
+
             // safety condition since we divide by rho_val later
             if (rho_val < rho_floor) { rho_val = rho_floor; }
@@ -635,7 +654,7 @@
             Et(i, j, 0) = (enE_t - grad_Pe) / rho_val;

             // Add resistivity only if E field value is used to update B
-            if (include_resistivity_term) { Et(i, j, 0) += eta(rho_val) * Jt(i, j, 0); }
+            if (include_resistivity_term) { Et(i, j, 0) += eta(rho_val, jtot_val) * Jt(i, j, 0); }
         },

         // Ez calculation
@@ -647,6 +666,15 @@
             // Interpolate to get the appropriate charge density in space
             Real rho_val = Interp(rho, nodal, Ez_stag, coarsen, i, j, k, 0);

+            // Interpolate current to appropriate staggering to match E field
+            Real jtot_val = 0._rt;
+            if (include_resistivity_term && resistivity_has_J_dependence) {
+                Real jr_val = Interp(Jr, Jr_stag, Ez_stag, coarsen, i, j, 0, 0);
+                Real jt_val = Interp(Jt, Jt_stag, Ez_stag, coarsen, i, j, 0, 0);
+                Real jz_val = Interp(Jz, Jz_stag, Ez_stag, coarsen, i, j, 0, 0);
+                jtot_val = std::sqrt(jr_val*jr_val + jt_val*jt_val + jz_val*jz_val);
+            }
+
             // safety condition since we divide by rho_val later
             if (rho_val < rho_floor) { rho_val = rho_floor; }
@@ -659,7 +687,7 @@
             Ez(i, j, k) = (enE_z - grad_Pe) / rho_val;

             // Add resistivity only if E field value is used to update B
-            if (include_resistivity_term) { Ez(i, j, k) += eta(rho_val) * Jz(i, j, k); }
+            if (include_resistivity_term) { Ez(i, j, k) += eta(rho_val, jtot_val) * Jz(i, j, k); }
         }
     );
@@ -699,6 +727,7 @@ void FiniteDifferenceSolver::HybridPICSolveECartesian (
     // get hybrid model parameters
     const auto eta = hybrid_model->m_eta;
     const auto rho_floor = hybrid_model->m_n_floor * PhysConst::q_e;
+    const auto resistivity_has_J_dependence = hybrid_model->m_resistivity_has_J_dependence;

     // Index type required for interpolating fields from their respective
     // staggering to the Ex, Ey, Ez locations
@@ -853,6 +882,15 @@ void FiniteDifferenceSolver::HybridPICSolveECartesian (
             // Interpolate to get the appropriate charge density in space
             Real rho_val = Interp(rho, nodal, Ex_stag, coarsen, i, j, k, 0);

+            // Interpolate current to appropriate staggering to match E field
+            Real jtot_val = 0._rt;
+            if (include_resistivity_term && resistivity_has_J_dependence) {
+                Real jx_val = Interp(Jx, Jx_stag, Ex_stag, coarsen, i, j, k, 0);
+                Real jy_val = Interp(Jy, Jy_stag, Ex_stag, coarsen, i, j, k, 0);
+                Real jz_val = Interp(Jz, Jz_stag, Ex_stag, coarsen, i, j, k, 0);
+                jtot_val = std::sqrt(jx_val*jx_val + jy_val*jy_val + jz_val*jz_val);
+            }
+
             // safety condition since we divide by rho_val later
             if (rho_val < rho_floor) { rho_val = rho_floor; }
@@ -865,7 +903,7 @@
             Ex(i, j, k) = (enE_x - grad_Pe) / rho_val;

             // Add resistivity only if E field value is used to update B
-            if (include_resistivity_term) { Ex(i, j, k) += eta(rho_val) * Jx(i, j, k); }
+            if (include_resistivity_term) { Ex(i, j, k) += eta(rho_val, jtot_val) * Jx(i, j, k); }
         },

         // Ey calculation
@@ -883,6 +921,15 @@
             // Interpolate to get the appropriate charge density in space
             Real rho_val = Interp(rho, nodal, Ey_stag, coarsen, i, j, k, 0);

+            // Interpolate current to appropriate staggering to match E field
+            Real jtot_val = 0._rt;
+            if (include_resistivity_term && resistivity_has_J_dependence) {
+                Real jx_val = Interp(Jx, Jx_stag, Ey_stag, coarsen, i, j, k, 0);
+                Real jy_val = Interp(Jy, Jy_stag, Ey_stag, coarsen, i, j, k, 0);
+                Real jz_val = Interp(Jz, Jz_stag, Ey_stag, coarsen, i, j, k, 0);
+                jtot_val = std::sqrt(jx_val*jx_val + jy_val*jy_val + jz_val*jz_val);
+            }
+
             // safety condition since we divide by rho_val later
             if (rho_val < rho_floor) { rho_val = rho_floor; }
@@ -895,7 +942,7 @@
             Ey(i, j, k) = (enE_y - grad_Pe) / rho_val;

             // Add resistivity only if E field value is used to update B
-            if (include_resistivity_term) { Ey(i, j, k) += eta(rho_val) * Jy(i, j, k); }
+            if (include_resistivity_term) { Ey(i, j, k) += eta(rho_val, jtot_val) * Jy(i, j, k); }
         },

         // Ez calculation
@@ -907,6 +954,15 @@
             // Interpolate to get the appropriate charge density in space
             Real rho_val = Interp(rho, nodal, Ez_stag, coarsen, i, j, k, 0);

+            // Interpolate current to appropriate staggering to match E field
+            Real jtot_val = 0._rt;
+            if (include_resistivity_term && resistivity_has_J_dependence) {
+                Real jx_val = Interp(Jx, Jx_stag, Ez_stag, coarsen, i, j, k, 0);
+                Real jy_val = Interp(Jy, Jy_stag, Ez_stag, coarsen, i, j, k, 0);
+                Real jz_val = Interp(Jz, Jz_stag, Ez_stag, coarsen, i, j, k, 0);
+                jtot_val = std::sqrt(jx_val*jx_val + jy_val*jy_val + jz_val*jz_val);
+            }
+
             // safety condition since we divide by rho_val later
             if (rho_val < rho_floor) { rho_val = rho_floor; }
@@ -919,7 +975,7 @@
             Ez(i, j, k) = (enE_z - grad_Pe) / rho_val;

             // Add resistivity only if E field value is used to update B
-            if (include_resistivity_term) { Ez(i, j, k) += eta(rho_val) * Jz(i, j, k); }
+            if (include_resistivity_term) { Ez(i, j, k) += eta(rho_val, jtot_val) * Jz(i, j, k); }
         }
     );

From 83e16ee9a6875d4434bf9d9aff9bc51d1923c6e4 Mon Sep 17 00:00:00 2001
From: Axel Huebl <axel.huebl@plasma.ninja>
Date: Mon, 5 Feb 2024 18:24:51 -0800
Subject: [PATCH 12/13] CI: 4 Cores Linux/Win, 3 Cores
 macOS (#4673)

Increase build and test parallelism according to new increased core
limits on public GH hosted runners.
---
 .github/workflows/clang_tidy.yml |  6 +++---
 .github/workflows/codeql.yml     |  4 ++--
 .github/workflows/cuda.yml       |  6 +++---
 .github/workflows/hip.yml        |  4 ++--
 .github/workflows/insitu.yml     |  4 ++--
 .github/workflows/intel.yml      |  8 ++++----
 .github/workflows/ubuntu.yml     | 10 +++++-----
 .github/workflows/windows.yml    |  4 ++--
 8 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/clang_tidy.yml b/.github/workflows/clang_tidy.yml
index b8b0053adaa..2a30696fb8c 100644
--- a/.github/workflows/clang_tidy.yml
+++ b/.github/workflows/clang_tidy.yml
@@ -37,7 +37,7 @@ jobs:
           cmake -S . -B build_clang_tidy \
             -DCMAKE_VERBOSE_MAKEFILE=ON  \
-            -DWarpX_DIMS="1;2;3;RZ"      \
+            -DWarpX_DIMS="1;2;RZ;3"      \
             -DWarpX_MPI=ON               \
             -DWarpX_COMPUTE=OMP          \
             -DWarpX_PSATD=ON             \
@@ -47,10 +47,10 @@ jobs:
             -DWarpX_PRECISION=SINGLE     \
             -DCMAKE_CXX_COMPILER_LAUNCHER=ccache

-          cmake --build build_clang_tidy -j 2
+          cmake --build build_clang_tidy -j 4

           ${{github.workspace}}/.github/workflows/source/makeMakefileForClangTidy.py --input ${{github.workspace}}/ccache.log.txt
-          make -j2 --keep-going -f clang-tidy-ccache-misses.mak \
+          make -j4 --keep-going -f clang-tidy-ccache-misses.mak \
             CLANG_TIDY=clang-tidy \
             CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*"
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 436df798d3b..008d82af239 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -75,7 +75,7 @@ jobs:
           export CCACHE_MAXSIZE=100M
           ccache -z

-          $CMAKE --build build -j 2
+          $CMAKE --build build -j 4

           ccache -s
           du -hs ~/.cache/ccache
@@ -83,7 +83,7 @@ jobs:
           # Make sure CodeQL has something to do
           touch Source/Utils/WarpXVersion.cpp
           export CCACHE_DISABLE=1
-          $CMAKE --build build -j 2
+          $CMAKE --build build -j 4

       - name: Perform CodeQL Analysis
         uses: github/codeql-action/analyze@v2
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 5e9f43f639d..ed8b315c4e9 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -73,7 +73,7 @@ jobs:
             -DWarpX_PSATD=ON                                 \
             -DAMReX_CUDA_ERROR_CROSS_EXECUTION_SPACE_CALL=ON \
             -DAMReX_CUDA_ERROR_CAPTURE_THIS=ON
-          cmake --build build_sp -j 2
+          cmake --build build_sp -j 4

           python3 -m pip install --upgrade pip
           python3 -m pip install --upgrade build packaging setuptools wheel
@@ -116,7 +116,7 @@ jobs:
           git clone https://github.com/AMReX-Codes/amrex.git ../amrex
           cd ../amrex && git checkout --detach 296ed40e16ae1877640f5b78e9162dbd4ba1c279 && cd -
-          make COMP=gcc QED=FALSE USE_MPI=TRUE USE_GPU=TRUE USE_OMP=FALSE USE_PSATD=TRUE USE_CCACHE=TRUE -j 2
+          make COMP=gcc QED=FALSE USE_MPI=TRUE USE_GPU=TRUE USE_OMP=FALSE USE_PSATD=TRUE USE_CCACHE=TRUE -j 4

           ccache -s
           du -hs ~/.cache/ccache
@@ -171,7 +171,7 @@ jobs:
             -DWarpX_PSATD=ON                                 \
             -DAMReX_CUDA_ERROR_CROSS_EXECUTION_SPACE_CALL=ON \
             -DAMReX_CUDA_ERROR_CAPTURE_THIS=ON
-          cmake --build build -j 2
+          cmake --build build -j 4

           # work-around for mpi4py 3.1.1 build system issue with using
           # a GNU-built Python executable with non-GNU Python modules
diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml
index f7378bfa775..51cadc89604 100644
--- a/.github/workflows/hip.yml
+++ b/.github/workflows/hip.yml
@@ -56,7 +56,7 @@ jobs:
             -DWarpX_OPENPMD=ON       \
             -DWarpX_PRECISION=SINGLE \
             -DWarpX_PSATD=ON
-          cmake --build build_sp -j 2
+          cmake --build build_sp -j 4

           export WARPX_MPI=OFF
           export PYWARPX_LIB_DIR=$PWD/build_sp/lib/site-packages/pywarpx/
@@ -116,7 +116,7 @@ jobs:
             -DWarpX_OPENPMD=ON       \
             -DWarpX_PRECISION=DOUBLE \
             -DWarpX_PSATD=ON
-          cmake --build build_2d -j 2
+          cmake --build build_2d -j 4

           export WARPX_MPI=OFF
           export PYWARPX_LIB_DIR=$PWD/build_2d/lib/site-packages/pywarpx/
diff --git a/.github/workflows/insitu.yml b/.github/workflows/insitu.yml
index 57a25ce7629..6006c3e5c5b 100644
--- a/.github/workflows/insitu.yml
+++ b/.github/workflows/insitu.yml
@@ -28,7 +28,7 @@ jobs:
           -DWarpX_COMPUTE=NOACC
     - name: Build
       run: |
-        cmake --build build -j 2
+        cmake --build build -j 4

   ascent:
     name: Ascent
@@ -51,7 +51,7 @@ jobs:
     - name: Build
       run: |
         . /ascent_docker_setup_env.sh
-        cmake --build build -j 2
+        cmake --build build -j 4
     - name: Test
       run: |
        cp Examples/Physics_applications/laser_acceleration/inputs_3d .
diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml
index 9124715fe18..1731f6e3723 100644
--- a/.github/workflows/intel.yml
+++ b/.github/workflows/intel.yml
@@ -53,7 +53,7 @@ jobs:
             -DWarpX_MPI=OFF              \
             -DWarpX_OPENPMD=ON           \
             -DWarpX_openpmd_internal=OFF
-          cmake --build build_dp -j 2
+          cmake --build build_dp -j 4

           cmake -S . -B build_sp         \
             -DCMAKE_VERBOSE_MAKEFILE=ON  \
@@ -64,7 +64,7 @@ jobs:
             -DWarpX_OPENPMD=ON           \
             -DWarpX_openpmd_internal=OFF \
             -DWarpX_PRECISION=SINGLE
-          cmake --build build_sp -j 2
+          cmake --build build_sp -j 4
           cmake --build build_sp --target pip_install

           ccache -s
@@ -120,7 +120,7 @@ jobs:
             -DWarpX_MPI=OFF     \
             -DWarpX_OPENPMD=ON  \
             -DWarpX_PRECISION=SINGLE
-          cmake --build build_sp -j 2
+          cmake --build build_sp -j 4
           cmake --build build_sp --target pip_install

           ccache -s
@@ -184,7 +184,7 @@ jobs:
             -DWarpX_MPI=OFF     \
             -DWarpX_OPENPMD=ON  \
             -DWarpX_PRECISION=SINGLE
-          cmake --build build_sp -j 2
+          cmake --build build_sp -j 4

           ccache -s
           du -hs ~/.cache/ccache
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
index 6b8e26111b8..239da17be64 100644
--- a/.github/workflows/ubuntu.yml
+++ b/.github/workflows/ubuntu.yml
@@ -38,7 +38,7 @@ jobs:
             -DWarpX_EB=OFF  \
             -DWarpX_MPI=OFF \
             -DWarpX_QED=OFF
-          cmake --build build -j 2
+          cmake --build build -j 4

           ./build/bin/warpx.3d Examples/Physics_applications/laser_acceleration/inputs_3d
           ./build/bin/warpx.rz Examples/Physics_applications/laser_acceleration/inputs_rz
@@ -79,7 +79,7 @@ jobs:
             -DWarpX_EB=OFF   \
             -DWarpX_PSATD=ON \
             -DWarpX_QED_TABLE_GEN=ON
-          cmake --build build -j 2
+          cmake --build build -j 4

           ./build/bin/warpx.1d Examples/Physics_applications/laser_acceleration/inputs_1d
           ./build/bin/warpx.2d Examples/Physics_applications/laser_acceleration/inputs_2d
@@ -126,7 +126,7 @@ jobs:
             -DWarpX_PARTICLE_PRECISION=SINGLE \
             -DWarpX_QED_TABLE_GEN=ON

-          cmake --build build -j 2
+          cmake --build build -j 4

           ./build/bin/warpx.3d Examples/Physics_applications/laser_acceleration/inputs_3d
           ./build/bin/warpx.rz Examples/Physics_applications/laser_acceleration/inputs_rz
@@ -164,7 +164,7 @@ jobs:
             -DCMAKE_VERBOSE_MAKEFILE=ON \
             -DWarpX_APP=OFF             \
             -DWarpX_LIB=OFF
-          cmake --build build -j 2
+          cmake --build build -j 4

           ccache -s
           du -hs ~/.cache/ccache
@@ -208,7 +208,7 @@ jobs:
             -DWarpX_PSATD=ON  \
             -DWarpX_PYTHON=ON \
             -DWarpX_QED_TABLE_GEN=ON
-          cmake --build build -j 2 --target pip_install
+          cmake --build build -j 4 --target pip_install

           ccache -s
           du -hs ~/.cache/ccache
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 8e2bb00f1db..eee40e72965 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -38,7 +38,7 @@ jobs:
           -DWarpX_MPI=OFF `
           -DWarpX_PYTHON=ON
         if(!$?) { Exit $LASTEXITCODE }
-        cmake --build build --config Debug --parallel 2
+        cmake --build build --config Debug --parallel 4
         if(!$?) { Exit $LASTEXITCODE }

         python3 -m pip install --upgrade pip
@@ -96,7 +96,7 @@ jobs:
           -DWarpX_MPI=OFF ^
           -DWarpX_OPENPMD=ON
         if errorlevel 1 exit 1
-        cmake --build build --config Release --parallel 2
+        cmake --build build --config Release --parallel 4
         if errorlevel 1 exit 1

         cmake --build build --config Release --target install

From 6e87dd52995b6faf16a2bf76bd873c6447f76548 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang
Date: Tue, 6 Feb 2024 13:01:58 -0800
Subject: [PATCH 13/13] Update GitHub Action versions (#4674)

* Bump actions/upload-artifact from 3 to 4

* Bump github/codeql-action from 2 to 3

* Bump actions/checkout from 3 to 4

* Bump actions/setup-python from 4 to 5

* Bump actions/cache from 3 to 4
---
 .github/workflows/clang_tidy.yml           |  6 +++---
 .github/workflows/cleanup-cache-postpr.yml |  2 +-
 .github/workflows/cleanup-cache.yml        |  2 +-
 .github/workflows/codeql.yml               | 14 +++++++-------
 .github/workflows/cuda.yml                 | 16 ++++++++--------
 .github/workflows/hip.yml                  | 10 +++++-----
 .github/workflows/insitu.yml               |  6 +++---
 .github/workflows/intel.yml                | 14 +++++++-------
 .github/workflows/macos.yml                |  6 +++---
 .github/workflows/post-pr.yml              |  2 +-
 .github/workflows/source.yml               |  2 +-
 .github/workflows/ubuntu.yml               | 22 +++++++++++-----------
 .github/workflows/windows.yml              | 12 ++++++------
 13 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/.github/workflows/clang_tidy.yml b/.github/workflows/clang_tidy.yml
index 2a30696fb8c..6a1172802a8 100644
--- a/.github/workflows/clang_tidy.yml
+++ b/.github/workflows/clang_tidy.yml
@@ -12,12 +12,12 @@ jobs:
     runs-on: ubuntu-22.04
     if: github.event.pull_request.draft == false
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      run: |
        .github/workflows/dependencies/clang14.sh
    - name: set up cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -66,7 +66,7 @@ jobs:
          PR_NUMBER: ${{ github.event.number }}
        run: |
          echo $PR_NUMBER > pr_number.txt
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: pr_number
          path: pr_number.txt
diff --git a/.github/workflows/cleanup-cache-postpr.yml b/.github/workflows/cleanup-cache-postpr.yml
index 978e9c28f04..9a2ffb0f61a 100644
--- a/.github/workflows/cleanup-cache-postpr.yml
+++ b/.github/workflows/cleanup-cache-postpr.yml
@@ -16,7 +16,7 @@ jobs:
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - name: Clean up ccache
        run: |
          gh extension install actions/gh-actions-cache
diff --git a/.github/workflows/cleanup-cache.yml b/.github/workflows/cleanup-cache.yml
index 6421bbf4215..bd1a518acf4 100644
--- a/.github/workflows/cleanup-cache.yml
+++ b/.github/workflows/cleanup-cache.yml
@@ -16,7 +16,7 @@ jobs:
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - name: Clean up ccache
        run: |
          gh extension install actions/gh-actions-cache
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 008d82af239..bc0bee545cc 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Install Packages (C++)
        if: ${{ matrix.language == 'cpp' }}
@@ -44,7 +44,7 @@ jobs:
      - name: Set Up Cache
        if: ${{ matrix.language == 'cpp' }}
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        with:
          path: ~/.cache/ccache
          key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -57,14 +57,14 @@ jobs:
          $CMAKE -S . -B build -DWarpX_OPENPMD=ON

      - name: Initialize CodeQL
-        uses: github/codeql-action/init@v2
+        uses: github/codeql-action/init@v3
        with:
          config-file: ./.github/codeql/warpx-codeql.yml
          languages: ${{ matrix.language }}
          queries: +security-and-quality

      - name: Build (py)
-        uses: github/codeql-action/autobuild@v2
+        uses: github/codeql-action/autobuild@v3
        if: ${{ matrix.language == 'python' }}

      - name: Build (C++)
@@ -86,7 +86,7 @@ jobs:
          $CMAKE --build build -j 4

      - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v2
+        uses: github/codeql-action/analyze@v3
        with:
          category: "/language:${{ matrix.language }}"
          upload: False
@@ -107,7 +107,7 @@ jobs:
          output: sarif-results/${{ matrix.language }}.sarif

      - name: Upload SARIF
-        uses: github/codeql-action/upload-sarif@v2
+        uses: github/codeql-action/upload-sarif@v3
        with:
          sarif_file: sarif-results/${{ matrix.language }}.sarif
@@ -120,7 +120,7 @@ jobs:
          PR_NUMBER: ${{ github.event.number }}
        run: |
          echo $PR_NUMBER > pr_number.txt
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: pr_number
          path: pr_number.txt
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index ed8b315c4e9..3546eb8e9eb 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -19,8 +19,8 @@ jobs:
      CXXFLAGS: "-Werror"
      CMAKE_GENERATOR: Ninja
    steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-python@v4
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v5
      name: Install Python
      with:
        python-version: '3.x'
@@ -28,7 +28,7 @@ jobs:
      run: |
        .github/workflows/dependencies/nvcc11-3.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -92,12 +92,12 @@ jobs:
    runs-on: ubuntu-20.04
    if: github.event.pull_request.draft == false
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      run: |
        .github/workflows/dependencies/nvcc11-8.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -129,11 +129,11 @@ jobs:
    # # For NVHPC, Ninja is slower than the default:
    # CMAKE_GENERATOR: Ninja
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: Dependencies
      run: .github/workflows/dependencies/nvhpc.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -197,7 +197,7 @@ jobs:
          PR_NUMBER: ${{ github.event.number }}
        run: |
          echo $PR_NUMBER > pr_number.txt
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: pr_number
          path: pr_number.txt
diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml
index 51cadc89604..0e311f061ef 100644
--- a/.github/workflows/hip.yml
+++ b/.github/workflows/hip.yml
@@ -15,12 +15,12 @@ jobs:
      CMAKE_GENERATOR: Ninja
    if: github.event.pull_request.draft == false
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      shell: bash
      run: .github/workflows/dependencies/hip.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -74,12 +74,12 @@ jobs:
      CMAKE_GENERATOR: Ninja
    if: github.event.pull_request.draft == false
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      shell: bash
      run: .github/workflows/dependencies/hip.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -135,7 +135,7 @@ jobs:
          PR_NUMBER: ${{ github.event.number }}
        run: |
          echo $PR_NUMBER > pr_number.txt
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: pr_number
          path: pr_number.txt
diff --git a/.github/workflows/insitu.yml b/.github/workflows/insitu.yml
index 6006c3e5c5b..42923d3df8e 100644
--- a/.github/workflows/insitu.yml
+++ b/.github/workflows/insitu.yml
@@ -20,7 +20,7 @@ jobs:
    container:
      image: senseiinsitu/ci:fedora35-amrex-20220613
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: Configure
      run: |
        cmake -S . -B build \
@@ -41,7 +41,7 @@ jobs:
    container:
      image: alpinedav/ascent:0.9.2
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: Configure
      run: |
        . /ascent_docker_setup_env.sh
@@ -61,7 +61,7 @@ jobs:
          max_step = 40 \
          diag1.intervals = 30:40:10 \
          diag1.format = ascent
-    - uses: actions/upload-artifact@v2
+    - uses: actions/upload-artifact@v4
      with:
        name: ascent-test-artifacts
        path: |
diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml
index 1731f6e3723..3b1d6b546a4 100644
--- a/.github/workflows/intel.yml
+++ b/.github/workflows/intel.yml
@@ -17,12 +17,12 @@ jobs:
    #env:
    #  CMAKE_GENERATOR: Ninja
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      run: |
        .github/workflows/dependencies/icc.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -82,13 +82,13 @@ jobs:
    #  CMAKE_GENERATOR: Ninja
    if: github.event.pull_request.draft == false
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      shell: bash
      run: |
        .github/workflows/dependencies/dpcpp.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -146,13 +146,13 @@ jobs:
    #  CMAKE_GENERATOR: Ninja
    if: github.event.pull_request.draft == false
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      shell: bash
      run: |
        .github/workflows/dependencies/dpcpp.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -204,7 +204,7 @@ jobs:
          PR_NUMBER: ${{ github.event.number }}
        run: |
          echo $PR_NUMBER > pr_number.txt
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: pr_number
          path: pr_number.txt
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index 0e8819032e3..f34f9f3534d 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -17,7 +17,7 @@ jobs:
    # For macOS, Ninja is slower than the default:
    #CMAKE_GENERATOR: Ninja
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      run: |
        set +e
@@ -45,7 +45,7 @@ jobs:
        python3 -m pip install --upgrade build packaging setuptools wheel
        python3 -m pip install --upgrade mpi4py
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: /Users/runner/Library/Caches/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -97,7 +97,7 @@ jobs:
          PR_NUMBER: ${{ github.event.number }}
        run: |
          echo $PR_NUMBER > pr_number.txt
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: pr_number
          path: pr_number.txt
diff --git a/.github/workflows/post-pr.yml b/.github/workflows/post-pr.yml
index f5b914033b7..2768ef376cc 100644
--- a/.github/workflows/post-pr.yml
+++ b/.github/workflows/post-pr.yml
@@ -13,7 +13,7 @@ jobs:
          PR_NUMBER: ${{ github.event.number }}
        run: |
          echo $PR_NUMBER > pr_number.txt
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: pr_number
          path: pr_number.txt
diff --git a/.github/workflows/source.yml b/.github/workflows/source.yml
index 08050768894..a1c29416b3e 100644
--- a/.github/workflows/source.yml
+++ b/.github/workflows/source.yml
@@ -18,7 +18,7 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: Non-ASCII characters
      run: .github/workflows/source/hasNonASCII
    - name: TABs
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
index 239da17be64..cf4b375ce00 100644
--- a/.github/workflows/ubuntu.yml
+++ b/.github/workflows/ubuntu.yml
@@ -14,12 +14,12 @@ jobs:
    env:
      CXXFLAGS: "-Werror"
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      run: |
        .github/workflows/dependencies/gcc.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -54,12 +54,12 @@ jobs:
      CXX: "g++-12"
      CC: "gcc-12"
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      run: |
        .github/workflows/dependencies/gcc12.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -94,12 +94,12 @@ jobs:
      CXX: "g++-12"
      CC: "gcc-12"
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      run: |
        .github/workflows/dependencies/gcc12_blaspp_lapackpp.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -141,13 +141,13 @@ jobs:
      CMAKE_GENERATOR: Ninja
      CXXFLAGS: "-Werror"
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      run: |
        .github/workflows/dependencies/gcc.sh
        sudo apt-get install -y libopenmpi-dev openmpi-bin
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -179,12 +179,12 @@ jobs:
    # On CI for this test, Ninja is slower than the default:
    #CMAKE_GENERATOR: Ninja
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: install dependencies
      run: |
        .github/workflows/dependencies/pyfull.sh
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/ccache
        key: ccache-${{ github.workflow }}-${{ github.job }}-git-${{ github.sha }}
@@ -227,7 +227,7 @@ jobs:
          PR_NUMBER: ${{ github.event.number }}
        run: |
          echo $PR_NUMBER > pr_number.txt
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: pr_number
          path: pr_number.txt
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index eee40e72965..2ef74cdb7f9 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -12,12 +12,12 @@ jobs:
    runs-on: windows-latest
    if: github.event.pull_request.draft == false
    steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-python@v4
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v5
      with:
        python-version: '3.x'
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      # - once stored under a key, they become immutable (even if local cache path content changes)
      # - for a refresh the key has to change, e.g., hash of a tracked file in the key
      with:
@@ -63,13 +63,13 @@ jobs:
    runs-on: windows-2019
    if: github.event.pull_request.draft == false
    steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-python@v4
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v5
      with:
        python-version: '3.8'
    - uses: seanmiddleditch/gha-setup-ninja@master
    - name: CCache Cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      # - once stored under a key, they become immutable (even if local cache path content changes)
      # - for a refresh the key has to change, e.g., hash of a tracked file in the key
      with: