Commit: Merge pull request #15 from RRZE-HPC/clusterpair_neigh_simd

Cluster Pair: SIMD-Vectorized Neighbor Build and Support for Multiple Atom Types

moebiusband73 authored Nov 12, 2024
2 parents d4c7360 + 1490875, commit 25efac7
Showing 41 changed files with 3,491 additions and 1,059 deletions.
14 changes: 10 additions & 4 deletions README.md
@@ -18,13 +18,13 @@ options are available:
SIMD other than NONE.
- **SIMD:** Instruction set (available options: NONE, SSE, AVX, AVX\_FMA, AVX2, AVX512).
- **MASK\_REGISTERS:** Use AVX512 mask registers (always true when ISA is set to AVX512).
-- **OPT\_SCHEME:** Optimization algorithm (available options: lammps, gromacs).
+- **OPT\_SCHEME:** Optimization algorithm (available options: verletlist, clusterpair).
- **ENABLE\_LIKWID:** Enable likwid to make use of HPM counters.
- **DATA\_TYPE:** Floating-point precision (available options: SP, DP).
- **DATA\_LAYOUT:** Data layout for atom vector properties (available options: AOS, SOA).
- **ASM\_SYNTAX:** Assembly syntax to use when generating assembly files (available options: ATT, INTEL).
- **DEBUG:** Toggle debug mode.
-- **EXPLICIT\_TYPES:** Explicitly store and load atom types.
+- **ONE\_ATOM\_TYPE:** Simulate only one atom type and do not perform table lookup for parameters.
- **MEM\_TRACER:** Trace memory addresses for cache simulator.
- **INDEX\_TRACER:** Trace indexes and distances for gather-md.
- **COMPUTE\_STATS:** Compute statistics.
@@ -38,7 +38,6 @@ Configurations for GROMACS MxN optimization scheme:

- **USE\_REFERENCE\_VERSION:** Use reference version (only for correction purposes).
- **XTC\_OUTPUT:** Enable XTC output.
-- **HALF\_NEIGHBOR\_LISTS\_CHECK\_CJ:** Check if j-clusters are local when decreasing the reaction force.

Configurations for CUDA:

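Aside: the DATA\_LAYOUT option above selects how atom vector properties are stored. A minimal sketch of the two layouts (the type alias and field names here are illustrative, not taken from the repository):

```c
typedef double MD_FLOAT; /* assuming a DP build; an SP build would use float */

/* AOS: one record per atom, x/y/z interleaved in memory */
typedef struct {
    MD_FLOAT x, y, z;
} AtomPosAOS;

/* SOA: one contiguous array per component, giving unit-stride SIMD loads */
typedef struct {
    MD_FLOAT *x, *y, *z;
} AtomPosSOA;
```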
@@ -83,12 +82,19 @@ TBD

## Citations

+Rafael Ravedutti Lucio Machado, Jan Eitzinger, Jan Laukemann, Georg Hager, Harald
+Köstler and Gerhard Wellein: MD-Bench: A performance-focused prototyping harness for
+state-of-the-art short-range molecular dynamics algorithms. Future Generation
+Computer Systems ([FGCS](https://www.sciencedirect.com/journal/future-generation-computer-systems)), Volume 149, 2023, Pages 25-38, ISSN 0167-739X, DOI:
+[https://doi.org/10.1016/j.future.2023.06.023](https://doi.org/10.1016/j.future.2023.06.023)
+
Rafael Ravedutti Lucio Machado, Jan Eitzinger, Harald Köstler, and Gerhard
Wellein: MD-Bench: A generic proxy-app toolbox for state-of-the-art molecular
dynamics algorithms. Accepted for [PPAM](https://ppam.edu.pl/) 2022, the 14th
International Conference on Parallel Processing and Applied Mathematics, Gdansk,
Poland, September 11-14, 2022. PPAM 2022 Best Paper Award. Preprint:
-[arXiv:2207.13094](https://arxiv.org/abs/2207.13094)
+[arXiv:2207.13094](https://arxiv.org/abs/2207.13094), DOI:
+[https://dl.acm.org/doi/10.1007/978-3-031-30442-2_24](https://dl.acm.org/doi/10.1007/978-3-031-30442-2_24)

## Credits

6 changes: 0 additions & 6 deletions TODO.md
@@ -1,9 +1,3 @@
-- Cluster Pair
-  - Use SIMD for distance calculation when building neighbor lists
-  - More than one atom type
-
-- By default, use several atom types (change EXPLICIT\_TYPES option to ONE\_ATOM\_TYPE and set it to false by default)
-
- Allow to resort atoms at a separate frequency independent of the neighboring
frequency
- Integrate and fix Super-Cluster GPU code
40 changes: 25 additions & 15 deletions config.mk
@@ -1,28 +1,28 @@
# Compiler tool chain (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
-TOOLCHAIN ?= CLANG
+TOOLCHAIN ?= ICC
# ISA of instruction code (X86/ARM)
ISA ?= X86
# Instruction set for intrinsic kernels (NONE/<X86-SIMD>/<ARM-SIMD>)
# with X86-SIMD options: NONE/SSE/AVX/AVX_FMA/AVX2/AVX512
# with ARM-SIMD options: NONE/NEON/SVE/SVE2 (SVE not width-agnostic yet!)
-SIMD ?= AVX2
+SIMD ?= AVX512
# Optimization scheme (verletlist/clusterpair)
-OPT_SCHEME ?= verletlist
+OPT_SCHEME ?= clusterpair
# Enable likwid (true or false)
ENABLE_LIKWID ?= false
# Enable OpenMP parallelization (true or false)
-ENABLE_OPENMP ?= true
+ENABLE_OPENMP ?= false
# SP or DP
-DATA_TYPE ?= DP
+DATA_TYPE ?= SP
# AOS or SOA
DATA_LAYOUT ?= AOS
# Debug
DEBUG ?= false

# Sort atoms when reneighboring (true or false)
SORT_ATOMS ?= false
-# Explicitly store and load atom types (true or false)
-EXPLICIT_TYPES ?= false
+# Simulate only for one atom type, without table lookup for parameters (true or false)
+ONE_ATOM_TYPE ?= false
# Trace memory addresses for cache simulator (true or false)
MEM_TRACER ?= false
# Trace indexes and distances for gather-md (true or false)
@@ -39,8 +39,6 @@ ENABLE_OMP_SIMD ?= false
USE_REFERENCE_VERSION ?= false
# Enable XTC output (a GROMACS file format for trajectories)
XTC_OUTPUT ?= false
-# Check if cj is local when decreasing reaction force
-HALF_NEIGHBOR_LISTS_CHECK_CJ ?= true

# Configurations for CUDA
# Use CUDA pinned memory to optimize transfers
@@ -66,11 +64,15 @@ ifeq ($(strip $(SIMD)), NONE)
else
ifeq ($(strip $(ISA)),ARM)
ifeq ($(strip $(SIMD)), NEON)
+__ISA_NEON__=true
__SIMD_WIDTH_DBL__=2
else ifeq ($(strip $(SIMD)), SVE)
+__ISA_SVE__=true
# needs further specification
__SIMD_WIDTH_DBL__=2
else ifeq ($(strip $(SIMD)), SVE2)
+__ISA_SVE__=true
+__ISA_SVE2__=true
# needs further specification
__SIMD_WIDTH_DBL__=2
endif
@@ -120,8 +122,8 @@ ifeq ($(strip $(SORT_ATOMS)),true)
DEFINES += -DSORT_ATOMS
endif

-ifeq ($(strip $(EXPLICIT_TYPES)),true)
-DEFINES += -DEXPLICIT_TYPES
+ifeq ($(strip $(ONE_ATOM_TYPE)),true)
+DEFINES += -DONE_ATOM_TYPE
endif

ifeq ($(strip $(MEM_TRACER)),true)
@@ -144,10 +146,6 @@ ifeq ($(strip $(USE_REFERENCE_VERSION)),true)
DEFINES += -DUSE_REFERENCE_VERSION
endif

-ifeq ($(strip $(HALF_NEIGHBOR_LISTS_CHECK_CJ)),true)
-DEFINES += -DHALF_NEIGHBOR_LISTS_CHECK_CJ
-endif
-
ifeq ($(strip $(DEBUG)),true)
DEFINES += -DDEBUG
endif
@@ -180,6 +178,18 @@ ifeq ($(strip $(__ISA_AVX512__)),true)
DEFINES += -D__ISA_AVX512__
endif

+ifeq ($(strip $(__ISA_NEON__)),true)
+DEFINES += -D__ISA_NEON__
+endif
+
+ifeq ($(strip $(__ISA_SVE__)),true)
+DEFINES += -D__ISA_SVE__
+endif
+
+ifeq ($(strip $(__ISA_SVE2__)),true)
+DEFINES += -D__ISA_SVE2__
+endif
+
ifeq ($(strip $(ENABLE_OMP_SIMD)),true)
DEFINES += -DENABLE_OMP_SIMD
endif
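Aside: a standalone sketch of what the new ONE_ATOM_TYPE switch trades away, namely the per-pair parameter table lookup (all names below are local to this sketch, not taken from the repository):

```c
#include <stdio.h>

#define NTYPES 2

/* Hypothetical mixing table: epsilon for each (type_i, type_j) pair. */
static const double epsilonTable[NTYPES * NTYPES] = { 1.0, 0.8, 0.8, 0.5 };

static double pairEpsilon(int typeI, int typeJ, int oneAtomType)
{
    if (oneAtomType) {
        return epsilonTable[0]; /* single fixed parameter set, no lookup */
    }
    return epsilonTable[typeI * NTYPES + typeJ]; /* per-pair table lookup */
}

int main(void)
{
    printf("mixed pair epsilon:  %.2f\n", pairEpsilon(0, 1, 0));
    printf("single-type epsilon: %.2f\n", pairEpsilon(0, 1, 1));
    return 0;
}
```

With -DONE_ATOM_TYPE the kernels can skip the type arrays and the indexed load entirely, which is what the option's comment describes.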
4 changes: 2 additions & 2 deletions src/clusterpair/atom.c
@@ -25,7 +25,7 @@ void initAtom(Atom* atom)
    atom->cl_x = NULL;
    atom->cl_v = NULL;
    atom->cl_f = NULL;
-    atom->cl_type = NULL;
+    atom->cl_t = NULL;
    atom->Natoms = 0;
    atom->Nlocal = 0;
    atom->Nghost = 0;
@@ -670,7 +670,7 @@ void growClusters(Atom* atom)
        ALIGNMENT,
        atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT),
        nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
-    atom->cl_type = (int*)reallocate(atom->cl_type,
+    atom->cl_t = (int*)reallocate(atom->cl_t,
        ALIGNMENT,
        atom->Nclusters_max * CLUSTER_M * sizeof(int),
        nold * CLUSTER_M * sizeof(int));
2 changes: 1 addition & 1 deletion src/clusterpair/atom.h
@@ -40,7 +40,7 @@ typedef struct {
    MD_FLOAT* cl_x;
    MD_FLOAT* cl_v;
    MD_FLOAT* cl_f;
-    int* cl_type;
+    int* cl_t;
    Cluster *iclusters, *jclusters;
    int* icluster_bin;
    int dummy_cj;
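Aside: growClusters (above) sizes cl_t as Nclusters_max * CLUSTER_M ints, i.e. one type slot per atom slot per cluster. A hypothetical accessor built only on that sizing, assuming the repository's atom.h and force.h are in scope (this helper is not part of the diff):

```c
/* Type of atom k within i-cluster ci; relies on cl_t being a flat
 * array of CLUSTER_M ints per cluster, as the reallocate() sizes imply. */
static inline int clusterAtomType(const Atom* atom, int ci, int k)
{
    return atom->cl_t[ci * CLUSTER_M + k];
}
```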
21 changes: 10 additions & 11 deletions src/clusterpair/force.c
@@ -17,23 +17,22 @@ void initForce(Parameter* param)
        computeForce = computeForceEam;
        break;
    case FF_LJ:
-#ifdef USE_REFERENCE_VERSION
+#if defined(CLUSTERPAIR_KERNEL_REF)
        computeForce = computeForceLJRef;
-#else
+#elif defined(CLUSTERPAIR_KERNEL_4XN)
        if (param->half_neigh) {
            computeForce = computeForceLJ4xnHalfNeigh;
        } else {
-#ifdef CUDA_TARGET
-            computeForce = computeForceLJCUDA;
-#else
-            // Simd2xNN (here used for single-precision)
-#if VECTOR_WIDTH > CLUSTER_M * 2
-            computeForce = computeForceLJ2xnnFullNeigh;
-#else // Simd4xN
            computeForce = computeForceLJ4xnFullNeigh;
-#endif
-#endif
        }
+#elif defined(CLUSTERPAIR_KERNEL_2XNN)
+        if (param->half_neigh) {
+            computeForce = computeForceLJ2xnnHalfNeigh;
+        } else {
+            computeForce = computeForceLJ2xnnFullNeigh;
+        }
+#elif defined(CLUSTERPAIR_KERNEL_CUDA)
+        computeForce = computeForceLJCUDA;
#endif
    }
}
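Aside: the rework above replaces nested #ifdef/#else blocks with a flat chain keyed on a single CLUSTERPAIR_KERNEL_* selector that force.h now defines. A standalone sketch of the pattern, with stand-in kernels rather than the repository's:

```c
#include <stdio.h>

typedef double (*ForceKernel)(void);

static double kernelRef(void)  { puts("Reference kernel"); return 0.0; }
static double kernel4xn(void)  { puts("Simd4xN kernel");   return 0.0; }
static double kernel2xnn(void) { puts("Simd2xNN kernel");  return 0.0; }

static ForceKernel computeForce;

static void initForceSketch(void)
{
#if defined(CLUSTERPAIR_KERNEL_REF)
    computeForce = kernelRef;
#elif defined(CLUSTERPAIR_KERNEL_2XNN)
    computeForce = kernel2xnn;
#else /* treat CLUSTERPAIR_KERNEL_4XN as this sketch's default */
    computeForce = kernel4xn;
#endif
}

int main(void)
{
    initForceSketch();
    computeForce(); /* bound once, then called every timestep */
    return 0;
}
```

Because exactly one selector macro is defined per build, each kernel family gets its own branch, and the half-neighbor/full-neighbor split stays local to that branch.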
41 changes: 35 additions & 6 deletions src/clusterpair/force.h
@@ -31,28 +31,57 @@ extern double computeForceEam(Parameter*, Atom*, Neighbor*, Stats*);
// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
// Cuda: M=8, N=VECTOR_WIDTH

+/* Comments from GROMACS:
+ *
+ * We need to choose if we want 2x(N+N) or 4xN kernels.
+ * This is based on the SIMD acceleration choice and CPU information
+ * detected at runtime.
+ *
+ * 4xN calculates more (zero) interactions, but has less pair-search
+ * work and much better kernel instruction scheduling.
+ *
+ * Up till now we have only seen that on Intel Sandy/Ivy Bridge,
+ * which doesn't have FMA, both the analytical and tabulated Ewald
+ * kernels have similar pair rates for 4x8 and 2x(4+4), so we choose
+ * 2x(4+4) because it results in significantly fewer pairs.
+ * For RF, the raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+ * 10% with HT, 50% without HT. As we currently don't detect the actual
+ * use of HT, use 4x8 to avoid a potential performance hit.
+ * On Intel Haswell 4x8 is always faster.
+ *
+ *
+ * The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
+ * Currently the 2xNN SIMD kernels only make sense with:
+ *   8-way SIMD: 4x4 setup, performance wise only useful on CPUs without FMA or on AMD Zen1
+ *   16-way SIMD: 4x8 setup, used in single precision with 512 bit wide SIMD
+ */
+
#ifdef CUDA_TARGET
extern double computeForceLJCUDA(Parameter*, Atom*, Neighbor*, Stats*);
#undef VECTOR_WIDTH
#define VECTOR_WIDTH 8
-#define KERNEL_NAME "CUDA"
-#define CLUSTER_M 8
-#define CLUSTER_N VECTOR_WIDTH
-#define UNROLL_J 1
+#define CLUSTERPAIR_KERNEL_CUDA
+#define KERNEL_NAME "CUDA"
+#define CLUSTER_M   8
+#define CLUSTER_N   VECTOR_WIDTH
+#define UNROLL_J    1
#else
#ifdef USE_REFERENCE_VERSION
+#define CLUSTERPAIR_KERNEL_REF
#define KERNEL_NAME "Reference"
-#define CLUSTER_M 1
-#define CLUSTER_N VECTOR_WIDTH
+#define CLUSTER_M   1
+#define CLUSTER_N   VECTOR_WIDTH
#else
#define CLUSTER_M 4
// Simd2xNN (here used for single-precision)
#if VECTOR_WIDTH > CLUSTER_M * 2
+#define CLUSTERPAIR_KERNEL_2XNN
#define KERNEL_NAME "Simd2xNN"
#define CLUSTER_N (VECTOR_WIDTH / 2)
#define UNROLL_I 4
#define UNROLL_J 2
#else // Simd4xN
+#define CLUSTERPAIR_KERNEL_4XN
#define KERNEL_NAME "Simd4xN"
#define CLUSTER_N VECTOR_WIDTH
#define UNROLL_I 4
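Aside: a standalone sketch of how the cluster-geometry macros above resolve for common register widths. It mirrors the #if VECTOR_WIDTH > CLUSTER_M * 2 logic; the ISA/width pairings in the comments are assumptions:

```c
#include <stdio.h>

/* Resolve CLUSTER_N and the kernel family for a given SIMD width,
 * mirroring the CPU (non-CUDA, non-reference) branch of force.h. */
static void resolveGeometry(int vectorWidth)
{
    const int clusterM = 4; /* CLUSTER_M for the CPU kernels */
    if (vectorWidth > clusterM * 2) {
        printf("width %2d -> Simd2xNN, %dx%d clusters (UNROLL_J = 2)\n",
               vectorWidth, clusterM, vectorWidth / 2);
    } else {
        printf("width %2d -> Simd4xN,  %dx%d clusters\n",
               vectorWidth, clusterM, vectorWidth);
    }
}

int main(void)
{
    resolveGeometry(4);  /* e.g. AVX2 with double precision   */
    resolveGeometry(8);  /* e.g. AVX512 DP or AVX2 SP         */
    resolveGeometry(16); /* e.g. AVX512 with single precision */
    return 0;
}
```

This matches the GROMACS note quoted above: 16-way SIMD lands on the 4x8 setup via the 2xNN kernels, while narrower widths stay on 4xN.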
(Diffs for the remaining 34 changed files are not shown.)
