From 6d0eec1f337018d15c614e75fb6b15ccc989dc6c Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Fri, 5 Jan 2024 08:31:57 -0500 Subject: [PATCH 01/12] Offloading reductions to Tensor Cores. --- cuda/calcMergeEneGra.cu | 54 +++++++++++++-- cuda/kernels.cu | 146 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+), 7 deletions(-) diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index 1e2f0d54..e5616050 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -643,22 +643,62 @@ __device__ void gpu_calc_energrad( } // Do a reduction over the total gradient containing prepared "gradient_intra_*" values - REDUCEFLOATSUM(torque_rot.x, pFloatAccumulator); - REDUCEFLOATSUM(torque_rot.y, pFloatAccumulator); - REDUCEFLOATSUM(torque_rot.z, pFloatAccumulator); +// REDUCEFLOATSUM(torque_rot.x, pFloatAccumulator); +// REDUCEFLOATSUM(torque_rot.y, pFloatAccumulator); +// REDUCEFLOATSUM(torque_rot.z, pFloatAccumulator); // TODO // ------------------------------------------------------- // Obtaining energy and translation-related gradients // ------------------------------------------------------- // reduction over partial energies and prepared "gradient_intra_*" values - REDUCEFLOATSUM(energy, pFloatAccumulator); +// REDUCEFLOATSUM(energy, pFloatAccumulator); + + /* Begin: Reduction using tensor units */ + + // 1. Convert data-to-be-reduced from float to half + // and place it in a shared memory array + __shared__ __align__(256) half data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; + data_to_be_reduced[4*threadIdx.x] = __float2half(torque_rot.x); + data_to_be_reduced[4*threadIdx.x + 1] = __float2half(torque_rot.y); + data_to_be_reduced[4*threadIdx.x + 2] = __float2half(torque_rot.z); + data_to_be_reduced[4*threadIdx.x + 3] = __float2half(energy); + + // 2. Perform reduction via tensor units + reduce_via_tensor_units(data_to_be_reduced); + + // 3. Retrieve results from shared memory + torque_rot.x = __half2float(data_to_be_reduced[0]); + torque_rot.y = __half2float(data_to_be_reduced[1]); + torque_rot.z = __half2float(data_to_be_reduced[2]); + energy = __half2float(data_to_be_reduced[3]); + + /* End: Reduction using tensor units */ + #if defined (DEBUG_ENERGY_KERNEL) REDUCEFLOATSUM(intraE, pFloatAccumulator); #endif - REDUCEFLOATSUM(gx, pFloatAccumulator); - REDUCEFLOATSUM(gy, pFloatAccumulator); - REDUCEFLOATSUM(gz, pFloatAccumulator); +// REDUCEFLOATSUM(gx, pFloatAccumulator); +// REDUCEFLOATSUM(gy, pFloatAccumulator); +// REDUCEFLOATSUM(gz, pFloatAccumulator); + + /* Begin: Reduction using tensor units */ + + // 1. Convert data-to-be-reduced from float to half + // and place it in a shared memory array + data_to_be_reduced[4*threadIdx.x] = __float2half(gx); + data_to_be_reduced[4*threadIdx.x + 1] = __float2half(gy); + data_to_be_reduced[4*threadIdx.x + 2] = __float2half(gz); + + // 2. Perform reduction via tensor units + reduce_via_tensor_units(data_to_be_reduced); + + // 3. 
Retrieve results from shared memory + gx = __half2float(data_to_be_reduced[0]); + gy = __half2float(data_to_be_reduced[1]); + gz = __half2float(data_to_be_reduced[2]); + + /* End: Reduction using tensor units */ global_energy = energy; #ifndef FLOAT_GRADIENTS diff --git a/cuda/kernels.cu b/cuda/kernels.cu index c98c69b5..4c477ddd 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -93,6 +93,152 @@ __device__ inline int64_t ullitolli(uint64_t u) #define ATOMICADDF32(pAccumulator, value) atomicAdd(pAccumulator, (value)) #define ATOMICSUBF32(pAccumulator, value) atomicAdd(pAccumulator, -(value)) +/* Begin: Reduction using tensor units */ +/* + * Half-precision support + * https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF__MISC.html + */ +#include + +/* + * Tensor Cores + * https://developer.nvidia.com/blog/programming-tensor-cores-cuda-9 + * + * Don't forget to compile specifying the architecture, e.g., sm_86. + * For AutoDock-GPU, this can be done via the TARGETS option. + * make DEVICE=GPU TESTLS=ad NUMWI=64 TARGETS=86 test + * https://stackoverflow.com/a/53634598/1616865 + */ +#include +using namespace nvcuda; + +#define TILE_SIZE (16 * 16) + +constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension +constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension +constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension + +// Half constants +// CUDART_ONE_FP16 was not recognized by the NVCC compiler +// So its value is indicated explicitly +// https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS.html#group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS +#define HALF_ONE __ushort_as_half((unsigned short)0x3C00U) +#define HALF_ZERO __ushort_as_half((unsigned short)0x0000U) + +__device__ void fill_Q(half *Q_data) { + + half I4[16] = { + HALF_ONE, HALF_ZERO, HALF_ZERO, HALF_ZERO, + HALF_ZERO, HALF_ONE, HALF_ZERO, HALF_ZERO, + HALF_ZERO, HALF_ZERO, HALF_ONE, HALF_ZERO, + HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ONE + }; + + /* + // Naive implementation: a single thread fills data in + if (threadIdx.x == 0) { + for (uint i = 0; i < 4; i++) { // How many rows (of 4x4 blocks) are there in matrix A? + for (uint j = 0; j < 4; j++) { // How many cols (of 4x4 blocks) are there in matrix A? + for (uint ii = 0; ii < 4; ii++) { + for (uint jj = 0; jj < 4; jj++) { + Q_data[4*i + 64*j + ii + 16*jj] = I4 [4*ii + jj]; + } + } + } + } + } + */ + + // Slightly improved multi-threaded implementation + for (uint i = threadIdx.x; i < 4; i+=blockDim.x) { // How many rows (of 4x4 blocks) are there in matrix A? + for (uint j = 0; j < 4; j++) { // How many cols (of 4x4 blocks) are there in matrix A? 
+ for (uint ii = 0; ii < 4; ii++) { + for (uint jj = 0; jj < 4; jj++) { + Q_data[4*i + 64*j + ii + 16*jj] = I4 [4*ii + jj]; + } + } + } + } + + /* + // Further improved multi-threaded implementation + // (It didn't provide significant performance improvements -> commented out) + // Fusing two outer loops into a single one + // To do that: coeffs = 4i + 64j + constexpr uint coeffs [16] = {0, 64, 128, 192, 4, 68, 132, 196, 8, 72, 136, 200, 12, 76, 140, 204}; + for (uint k = threadIdx.x; k < 16; k+=blockDim.x) { + for (uint ii = 0; ii < 4; ii++) { + for (uint jj = 0; jj < 4; jj++) { + Q_data[coeffs[k] + ii + 16*jj] = I4 [4*ii + jj]; + } + } + } + */ + + /* + // Enable this block to print matrix values + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("\nQ_data"); + for (uint i = 0; i < 16 * 16; i++) { + if ((i % 16) == 0) {printf("\n[Row %u]: ", i/16);} + printf(" %2.2f ", __half2float(Q_data[i])); + } + printf("\n"); + } + */ +} + +// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: +// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" +// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf +__device__ void reduce_via_tensor_units(half *data_to_be_reduced) { + + __syncthreads(); + + if (threadIdx.x <= 31) { // Only one warp performs reduction + __shared__ __align__ (256) half Q_data[TILE_SIZE]; + + fill_Q(Q_data); + + __shared__ __align__ (256) half tmp[TILE_SIZE]; + + // Declaring and filling fragments - Those are *not* shared + wmma::fragment frag_P; + wmma::fragment frag_V; + + wmma::fragment frag_Q; + wmma::fragment frag_W; + wmma::fragment frag_C; + + wmma::fill_fragment(frag_P, HALF_ONE); // P: only ones + wmma::fill_fragment(frag_V, HALF_ZERO); // Output: initialize to zeros + wmma::fill_fragment(frag_C, HALF_ZERO); // Final result + wmma::load_matrix_sync(frag_Q, Q_data, 16); + + // 1. Accumulate the values: V <- AP + V + for(uint i = 0; i < (4 * NUM_OF_THREADS_PER_BLOCK)/TILE_SIZE; i++){ + const unsigned int offset = i * TILE_SIZE; + wmma::fragment frag_A; + wmma::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); + wmma::mma_sync(frag_V, frag_A, frag_P, frag_V); + } + + // W <- V (required since we need V as a "wmma::matrix_b") + wmma::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); + wmma::load_matrix_sync(frag_W, tmp, 16); + + // 2. Perform line sum: C <- QW + C (zero) + wmma::mma_sync(frag_C, frag_Q, frag_W, frag_C); + + // 3. Store result in shared memory + wmma::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); + } + + __syncthreads(); +} + +/* End: Reduction using tensor units */ + #define REDUCEFLOATSUM(value, pAccumulator) \ if (threadIdx.x == 0) \ { \ From 9df1e350e09b47315984de36573ffd2b5a9d7d65 Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Tue, 9 Jan 2024 09:06:11 -0500 Subject: [PATCH 02/12] Asserting minimum number-of-threads-per-block (= 64) for the matrix-based reduction method to work correctly. --- host/src/performdocking.cpp.Cuda | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/host/src/performdocking.cpp.Cuda b/host/src/performdocking.cpp.Cuda index c35b12e4..cf65d33c 100644 --- a/host/src/performdocking.cpp.Cuda +++ b/host/src/performdocking.cpp.Cuda @@ -583,6 +583,23 @@ parameters argc and argv: */ uint32_t kernel7_gxsize = 0; uint32_t kernel7_lxsize = threadsPerBlock; + +/* Begin: Reduction using tensor units */ + +// Implementation based on M.Sc. 
thesis by Gabin Schieffer at KTH: +// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" +// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + +// threadsPerBlock should be at least 64 for the proposed matrix-based method to work. +// Reason: +// "A 256-element matrix is used to store the values to be reduced, +// and each thread holds exactly four values, +// which results in a minimum of 64 threads to fill a single matrix." + +assert(("Matrix-based reductions in ADADELTA: threadsPerBlock should be at least 64!", kernel7_lxsize >= 64)); + +/* End: Reduction using tensor units */ + uint32_t kernel8_gxsize = 0; uint32_t kernel8_lxsize = threadsPerBlock; if (cData.dockpars.lsearch_rate != 0.0f) { From 7798aa422e176a02c629ce864ef0f5c63d64d17b Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Tue, 9 Jan 2024 09:10:23 -0500 Subject: [PATCH 03/12] Minor improvements in comments. --- cuda/calcMergeEneGra.cu | 8 ++++++++ cuda/kernels.cu | 8 +++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index e5616050..54e00936 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -656,6 +656,10 @@ __device__ void gpu_calc_energrad( /* Begin: Reduction using tensor units */ + // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: + // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" + // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + // 1. Convert data-to-be-reduced from float to half // and place it in a shared memory array __shared__ __align__(256) half data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; @@ -684,6 +688,10 @@ __device__ void gpu_calc_energrad( /* Begin: Reduction using tensor units */ + // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: + // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" + // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + // 1. Convert data-to-be-reduced from float to half // and place it in a shared memory array data_to_be_reduced[4*threadIdx.x] = __float2half(gx); diff --git a/cuda/kernels.cu b/cuda/kernels.cu index 4c477ddd..894ccd62 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -94,6 +94,11 @@ __device__ inline int64_t ullitolli(uint64_t u) #define ATOMICSUBF32(pAccumulator, value) atomicAdd(pAccumulator, -(value)) /* Begin: Reduction using tensor units */ + +// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: +// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" +// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + /* * Half-precision support * https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF__MISC.html @@ -188,9 +193,6 @@ __device__ void fill_Q(half *Q_data) { */ } -// Implementation based on M.Sc. 
thesis by Gabin Schieffer at KTH: -// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" -// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { __syncthreads(); From 10b07fa6a7defde4db8557ccbec41d4cb537de6e Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Thu, 11 Jan 2024 13:48:11 -0500 Subject: [PATCH 04/12] Adding TENSOR directive to enable/disable usage of tensor cores for reductions in ADADELTA. --- Makefile.Cuda | 9 +++++--- cuda/calcMergeEneGra.cu | 36 +++++++++++++++++++------------- cuda/kernels.cu | 2 ++ host/src/performdocking.cpp.Cuda | 24 +++++++++++---------- 4 files changed, 42 insertions(+), 29 deletions(-) diff --git a/Makefile.Cuda b/Makefile.Cuda index 3356cf8d..42018021 100644 --- a/Makefile.Cuda +++ b/Makefile.Cuda @@ -54,6 +54,9 @@ ifeq ($(OVERLAP), ON) PIPELINE=-DUSE_PIPELINE -fopenmp endif +ifeq ($(TENSOR), ON) + NVTENSOR=-DUSE_NVTENSOR +endif BIN := $(wildcard $(TARGET)*) @@ -184,7 +187,7 @@ unlink-code: rm -f $(HOST_INC_DIR)/performdocking.h $(HOST_SRC_DIR)/performdocking.cpp kernels: $(KERNEL_SRC) - $(NVCC) $(NWI) $(REP) $(CUDA_FLAGS) $(IFLAGS) $(CUDA_INCLUDES) -c $(KRNL_DIR)/kernels.cu + $(NVCC) $(NWI) $(REP) $(CUDA_FLAGS) $(IFLAGS) $(CUDA_INCLUDES) $(NVTENSOR) -c $(KRNL_DIR)/kernels.cu otool: unlink-code @echo "Building" $(TOOL_TARGET) "..." @@ -192,7 +195,7 @@ otool: unlink-code $(shell ls $(HOST_SRC_DIR)/*.cpp) \ $(TOOL_CFLAGS) \ -o$(BIN_DIR)/$(TOOL_TARGET) \ - $(PIPELINE) $(OPT) -DTOOLMODE $(REP) + $(PIPELINE) $(NVTENSOR) $(OPT) -DTOOLMODE $(REP) odock: check-env-all kernels link-code @echo "Building" $(TARGET) "..." @@ -201,7 +204,7 @@ odock: check-env-all kernels link-code $(CFLAGS) \ $(LIB_CUDA) \ -o$(BIN_DIR)/$(TARGET) \ - $(DEV) $(NWI) $(PIPELINE) $(OPT) $(DD) $(REP) $(KFLAGS) + $(DEV) $(NWI) $(PIPELINE) $(NVTENSOR) $(OPT) $(DD) $(REP) $(KFLAGS) # Example # 1ac8: for testing gradients of translation and rotation genes diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index 54e00936..c563356f 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -642,18 +642,7 @@ __device__ void gpu_calc_energrad( torque_rot.z += tr.z; } - // Do a reduction over the total gradient containing prepared "gradient_intra_*" values -// REDUCEFLOATSUM(torque_rot.x, pFloatAccumulator); -// REDUCEFLOATSUM(torque_rot.y, pFloatAccumulator); -// REDUCEFLOATSUM(torque_rot.z, pFloatAccumulator); - - // TODO - // ------------------------------------------------------- - // Obtaining energy and translation-related gradients - // ------------------------------------------------------- - // reduction over partial energies and prepared "gradient_intra_*" values -// REDUCEFLOATSUM(energy, pFloatAccumulator); - +#ifdef USE_NVTENSOR /* Begin: Reduction using tensor units */ // Implementation based on M.Sc. 
thesis by Gabin Schieffer at KTH: @@ -678,14 +667,26 @@ __device__ void gpu_calc_energrad( energy = __half2float(data_to_be_reduced[3]); /* End: Reduction using tensor units */ +#else + // Reduction over the total gradient containing prepared "gradient_intra_*" values + REDUCEFLOATSUM(torque_rot.x, pFloatAccumulator); + REDUCEFLOATSUM(torque_rot.y, pFloatAccumulator); + REDUCEFLOATSUM(torque_rot.z, pFloatAccumulator); + + // Reduction over partial energies and prepared "gradient_intra_*" values + REDUCEFLOATSUM(energy, pFloatAccumulator); +#endif + + // TODO + // ------------------------------------------------------- + // Obtaining energy and translation-related gradients + // ------------------------------------------------------- #if defined (DEBUG_ENERGY_KERNEL) REDUCEFLOATSUM(intraE, pFloatAccumulator); #endif -// REDUCEFLOATSUM(gx, pFloatAccumulator); -// REDUCEFLOATSUM(gy, pFloatAccumulator); -// REDUCEFLOATSUM(gz, pFloatAccumulator); +#ifdef USE_NVTENSOR /* Begin: Reduction using tensor units */ // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: @@ -707,6 +708,11 @@ __device__ void gpu_calc_energrad( gz = __half2float(data_to_be_reduced[2]); /* End: Reduction using tensor units */ +#else + REDUCEFLOATSUM(gx, pFloatAccumulator); + REDUCEFLOATSUM(gy, pFloatAccumulator); + REDUCEFLOATSUM(gz, pFloatAccumulator); +#endif global_energy = energy; #ifndef FLOAT_GRADIENTS diff --git a/cuda/kernels.cu b/cuda/kernels.cu index 894ccd62..ba4fccc5 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -93,6 +93,7 @@ __device__ inline int64_t ullitolli(uint64_t u) #define ATOMICADDF32(pAccumulator, value) atomicAdd(pAccumulator, (value)) #define ATOMICSUBF32(pAccumulator, value) atomicAdd(pAccumulator, -(value)) +#ifdef USE_NVTENSOR /* Begin: Reduction using tensor units */ // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: @@ -240,6 +241,7 @@ __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { } /* End: Reduction using tensor units */ +#endif #define REDUCEFLOATSUM(value, pAccumulator) \ if (threadIdx.x == 0) \ diff --git a/host/src/performdocking.cpp.Cuda b/host/src/performdocking.cpp.Cuda index cf65d33c..29e02e81 100644 --- a/host/src/performdocking.cpp.Cuda +++ b/host/src/performdocking.cpp.Cuda @@ -584,21 +584,23 @@ parameters argc and argv: uint32_t kernel7_gxsize = 0; uint32_t kernel7_lxsize = threadsPerBlock; -/* Begin: Reduction using tensor units */ + #ifdef USE_NVTENSOR + /* Begin: Reduction using tensor units */ -// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: -// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" -// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: + // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" + // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf -// threadsPerBlock should be at least 64 for the proposed matrix-based method to work. -// Reason: -// "A 256-element matrix is used to store the values to be reduced, -// and each thread holds exactly four values, -// which results in a minimum of 64 threads to fill a single matrix." + // threadsPerBlock should be at least 64 for the proposed matrix-based method to work. 
+ // Reason: + // "A 256-element matrix is used to store the values to be reduced, + // and each thread holds exactly four values, + // which results in a minimum of 64 threads to fill a single matrix." -assert(("Matrix-based reductions in ADADELTA: threadsPerBlock should be at least 64!", kernel7_lxsize >= 64)); + assert(("Matrix-based reductions in ADADELTA: threadsPerBlock should be at least 64!", kernel7_lxsize >= 64)); -/* End: Reduction using tensor units */ + /* End: Reduction using tensor units */ + #endif uint32_t kernel8_gxsize = 0; uint32_t kernel8_lxsize = threadsPerBlock; From b2ab3fe6b79e08b7c1af5f11233070ac6c42d025 Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Fri, 16 Feb 2024 14:21:54 -0500 Subject: [PATCH 05/12] Enabling usage of WMMA Extension for single precision matmul using Tensor Cores and error correction (TCEC). --- .gitmodules | 3 ++ Makefile.Cuda | 4 ++ cuda/calcMergeEneGra.cu | 28 +++++++++++++ cuda/kernels.cu | 92 +++++++++++++++++++++++++++++++++++++---- wmma_extension | 1 + 5 files changed, 120 insertions(+), 8 deletions(-) create mode 100644 .gitmodules create mode 160000 wmma_extension diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..0bdb553c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "wmma_extension"] + path = wmma_extension + url = https://github.com/wmmae/wmma_extension diff --git a/Makefile.Cuda b/Makefile.Cuda index 42018021..31b9ba7b 100644 --- a/Makefile.Cuda +++ b/Makefile.Cuda @@ -56,6 +56,10 @@ endif ifeq ($(TENSOR), ON) NVTENSOR=-DUSE_NVTENSOR + + ifeq ($(TCEC), ON) + NVTENSOR+=-I./wmma_extension/include -DUSE_TCEC + endif endif BIN := $(wildcard $(TARGET)*) diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index c563356f..94e2a10b 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -651,20 +651,35 @@ __device__ void gpu_calc_energrad( // 1. Convert data-to-be-reduced from float to half // and place it in a shared memory array + #ifdef USE_TCEC + __shared__ __align__(256) float data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; + data_to_be_reduced[4*threadIdx.x] = torque_rot.x; + data_to_be_reduced[4*threadIdx.x + 1] = torque_rot.y; + data_to_be_reduced[4*threadIdx.x + 2] = torque_rot.z; + data_to_be_reduced[4*threadIdx.x + 3] = energy; + #else __shared__ __align__(256) half data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; data_to_be_reduced[4*threadIdx.x] = __float2half(torque_rot.x); data_to_be_reduced[4*threadIdx.x + 1] = __float2half(torque_rot.y); data_to_be_reduced[4*threadIdx.x + 2] = __float2half(torque_rot.z); data_to_be_reduced[4*threadIdx.x + 3] = __float2half(energy); + #endif // 2. Perform reduction via tensor units reduce_via_tensor_units(data_to_be_reduced); // 3. Retrieve results from shared memory + #ifdef USE_TCEC + torque_rot.x = data_to_be_reduced[0]; + torque_rot.y = data_to_be_reduced[1]; + torque_rot.z = data_to_be_reduced[2]; + energy = data_to_be_reduced[3]; + #else torque_rot.x = __half2float(data_to_be_reduced[0]); torque_rot.y = __half2float(data_to_be_reduced[1]); torque_rot.z = __half2float(data_to_be_reduced[2]); energy = __half2float(data_to_be_reduced[3]); + #endif /* End: Reduction using tensor units */ #else @@ -695,17 +710,30 @@ __device__ void gpu_calc_energrad( // 1. 
Convert data-to-be-reduced from float to half // and place it in a shared memory array + + #ifdef USE_TCEC + data_to_be_reduced[4*threadIdx.x] = gx; + data_to_be_reduced[4*threadIdx.x + 1] = gy; + data_to_be_reduced[4*threadIdx.x + 2] = gz; + #else data_to_be_reduced[4*threadIdx.x] = __float2half(gx); data_to_be_reduced[4*threadIdx.x + 1] = __float2half(gy); data_to_be_reduced[4*threadIdx.x + 2] = __float2half(gz); + #endif // 2. Perform reduction via tensor units reduce_via_tensor_units(data_to_be_reduced); // 3. Retrieve results from shared memory + #ifdef USE_TCEC + gx = data_to_be_reduced[0]; + gy = data_to_be_reduced[1]; + gz = data_to_be_reduced[2]; + #else gx = __half2float(data_to_be_reduced[0]); gy = __half2float(data_to_be_reduced[1]); gz = __half2float(data_to_be_reduced[2]); + #endif /* End: Reduction using tensor units */ #else diff --git a/cuda/kernels.cu b/cuda/kernels.cu index ba4fccc5..689aa9fa 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -106,6 +106,17 @@ __device__ inline int64_t ullitolli(uint64_t u) */ #include + #ifdef USE_TCEC + /* + * WMMA Extension for single precision matmul using Tensor Cores + * and error correction technique (TCEC) + * https://github.com/wmmae/wmma_extension/blob/main/docs/mma_f32.md + */ + #include + + using tf32 = nvcuda::wmma::precision::tf32; + #endif + /* * Tensor Cores * https://developer.nvidia.com/blog/programming-tensor-cores-cuda-9 @@ -124,21 +135,36 @@ constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension -// Half constants -// CUDART_ONE_FP16 was not recognized by the NVCC compiler -// So its value is indicated explicitly -// https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS.html#group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS -#define HALF_ONE __ushort_as_half((unsigned short)0x3C00U) -#define HALF_ZERO __ushort_as_half((unsigned short)0x0000U) +#ifndef USE_TCEC + // Half constants + // CUDART_ONE_FP16 was not recognized by the NVCC compiler + // So its value is indicated explicitly + // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS.html#group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS + #define HALF_ONE __ushort_as_half((unsigned short)0x3C00U) + #define HALF_ZERO __ushort_as_half((unsigned short)0x0000U) +#endif +#ifdef USE_TCEC +__device__ void fill_Q(float *Q_data) { +#else __device__ void fill_Q(half *Q_data) { +#endif + #ifdef USE_TCEC + float I4[16] = { + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 1.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + }; + #else half I4[16] = { HALF_ONE, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ONE, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ONE, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ONE }; + #endif /* // Naive implementation: a single thread fills data in @@ -194,47 +220,97 @@ __device__ void fill_Q(half *Q_data) { */ } +#ifdef USE_TCEC +__device__ void reduce_via_tensor_units(float *data_to_be_reduced) { +#else __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { - +#endif __syncthreads(); if (threadIdx.x <= 31) { // Only one warp performs reduction + #ifdef USE_TCEC + __shared__ __align__ (256) float Q_data[TILE_SIZE]; + #else __shared__ __align__ (256) half Q_data[TILE_SIZE]; + #endif fill_Q(Q_data); + #ifdef USE_TCEC + __shared__ __align__ (256) float tmp[TILE_SIZE]; + #else 
__shared__ __align__ (256) half tmp[TILE_SIZE]; + #endif // Declaring and filling fragments - Those are *not* shared + + #ifdef USE_TCEC + mtk::wmma::tcec::fragment frag_P; + mtk::wmma::tcec::fragment frag_V; + #else wmma::fragment frag_P; wmma::fragment frag_V; + #endif + #ifdef USE_TCEC + mtk::wmma::tcec::fragment frag_Q; + mtk::wmma::tcec::fragment frag_W; + mtk::wmma::tcec::fragment frag_C; + #else wmma::fragment frag_Q; wmma::fragment frag_W; wmma::fragment frag_C; - + #endif + + #ifdef USE_TCEC + mtk::wmma::tcec::fill_fragment(frag_P, 1.0f); // P: only ones + mtk::wmma::tcec::fill_fragment(frag_V, 0.0f); // Output: initialize to zeros + mtk::wmma::tcec::fill_fragment(frag_C, 0.0f); // Final result + mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_data, 16); + #else wmma::fill_fragment(frag_P, HALF_ONE); // P: only ones wmma::fill_fragment(frag_V, HALF_ZERO); // Output: initialize to zeros wmma::fill_fragment(frag_C, HALF_ZERO); // Final result wmma::load_matrix_sync(frag_Q, Q_data, 16); + #endif // 1. Accumulate the values: V <- AP + V for(uint i = 0; i < (4 * NUM_OF_THREADS_PER_BLOCK)/TILE_SIZE; i++){ const unsigned int offset = i * TILE_SIZE; + + #ifdef USE_TCEC + mtk::wmma::tcec::fragment frag_A; + mtk::wmma::tcec::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); + mtk::wmma::tcec::mma_sync(frag_V, frag_A, frag_P, frag_V); + #else wmma::fragment frag_A; wmma::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); wmma::mma_sync(frag_V, frag_A, frag_P, frag_V); + #endif } // W <- V (required since we need V as a "wmma::matrix_b") + #ifdef USE_TCEC + mtk::wmma::tcec::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); + mtk::wmma::tcec::load_matrix_sync(frag_W, tmp, 16); + #else wmma::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); wmma::load_matrix_sync(frag_W, tmp, 16); + #endif // 2. Perform line sum: C <- QW + C (zero) + #ifdef USE_TCEC + mtk::wmma::tcec::mma_sync(frag_C, frag_Q, frag_W, frag_C); + #else wmma::mma_sync(frag_C, frag_Q, frag_W, frag_C); + #endif // 3. Store result in shared memory + #ifdef USE_TCEC + mtk::wmma::tcec::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); + #else wmma::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); + #endif } __syncthreads(); diff --git a/wmma_extension b/wmma_extension new file mode 160000 index 00000000..7175b5f4 --- /dev/null +++ b/wmma_extension @@ -0,0 +1 @@ +Subproject commit 7175b5f4feea2a16b8d5b3045a357db010583c5e From 0ec50e23bbd88c0a94bc8aa1b153b629bfaf9bc7 Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Fri, 19 Jul 2024 07:53:35 +0200 Subject: [PATCH 06/12] Starting to remove non-TCEC code. 
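Note (not part of this patch): with the half-precision fallback on its way out, the tensor-core path becomes the only block-wide reduction in gpu_calc_energrad. One way to sanity-check it during bring-up is to compare its output against a conventional shuffle-plus-atomics reduction. The sketch below is only such a validation aid; the function name, the float4 packing of (torque x/y/z, energy), and the caller-managed accumulator array are illustrative assumptions.

// Illustrative cross-check: the same four block-wide sums computed without
// tensor cores. Assumes blockDim.x is a multiple of 32 and that the caller
// zeroed the four accumulators in *out and synchronized beforehand.
__device__ void reduce_reference(float4 v, float* out)
{
    // Reduce each of the four components within the warp
    for (int offset = 16; offset > 0; offset >>= 1) {
        v.x += __shfl_down_sync(0xffffffffu, v.x, offset);
        v.y += __shfl_down_sync(0xffffffffu, v.y, offset);
        v.z += __shfl_down_sync(0xffffffffu, v.z, offset);
        v.w += __shfl_down_sync(0xffffffffu, v.w, offset);
    }
    // Lane 0 of each warp contributes its partial sums
    if ((threadIdx.x & 31) == 0) {
        atomicAdd(&out[0], v.x);
        atomicAdd(&out[1], v.y);
        atomicAdd(&out[2], v.z);
        atomicAdd(&out[3], v.w);
    }
    __syncthreads(); // out[0..3] now hold the block-wide sums
}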
--- cuda/calcMergeEneGra.cu | 19 ----------------- cuda/kernels.cu | 47 ----------------------------------------- 2 files changed, 66 deletions(-) diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index 94e2a10b..f86ea35f 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -657,12 +657,6 @@ __device__ void gpu_calc_energrad( data_to_be_reduced[4*threadIdx.x + 1] = torque_rot.y; data_to_be_reduced[4*threadIdx.x + 2] = torque_rot.z; data_to_be_reduced[4*threadIdx.x + 3] = energy; - #else - __shared__ __align__(256) half data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; - data_to_be_reduced[4*threadIdx.x] = __float2half(torque_rot.x); - data_to_be_reduced[4*threadIdx.x + 1] = __float2half(torque_rot.y); - data_to_be_reduced[4*threadIdx.x + 2] = __float2half(torque_rot.z); - data_to_be_reduced[4*threadIdx.x + 3] = __float2half(energy); #endif // 2. Perform reduction via tensor units @@ -674,11 +668,6 @@ __device__ void gpu_calc_energrad( torque_rot.y = data_to_be_reduced[1]; torque_rot.z = data_to_be_reduced[2]; energy = data_to_be_reduced[3]; - #else - torque_rot.x = __half2float(data_to_be_reduced[0]); - torque_rot.y = __half2float(data_to_be_reduced[1]); - torque_rot.z = __half2float(data_to_be_reduced[2]); - energy = __half2float(data_to_be_reduced[3]); #endif /* End: Reduction using tensor units */ @@ -715,10 +704,6 @@ __device__ void gpu_calc_energrad( data_to_be_reduced[4*threadIdx.x] = gx; data_to_be_reduced[4*threadIdx.x + 1] = gy; data_to_be_reduced[4*threadIdx.x + 2] = gz; - #else - data_to_be_reduced[4*threadIdx.x] = __float2half(gx); - data_to_be_reduced[4*threadIdx.x + 1] = __float2half(gy); - data_to_be_reduced[4*threadIdx.x + 2] = __float2half(gz); #endif // 2. Perform reduction via tensor units @@ -729,10 +714,6 @@ __device__ void gpu_calc_energrad( gx = data_to_be_reduced[0]; gy = data_to_be_reduced[1]; gz = data_to_be_reduced[2]; - #else - gx = __half2float(data_to_be_reduced[0]); - gy = __half2float(data_to_be_reduced[1]); - gz = __half2float(data_to_be_reduced[2]); #endif /* End: Reduction using tensor units */ diff --git a/cuda/kernels.cu b/cuda/kernels.cu index 689aa9fa..e8b721bb 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -135,19 +135,8 @@ constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension -#ifndef USE_TCEC - // Half constants - // CUDART_ONE_FP16 was not recognized by the NVCC compiler - // So its value is indicated explicitly - // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS.html#group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS - #define HALF_ONE __ushort_as_half((unsigned short)0x3C00U) - #define HALF_ZERO __ushort_as_half((unsigned short)0x0000U) -#endif - #ifdef USE_TCEC __device__ void fill_Q(float *Q_data) { -#else -__device__ void fill_Q(half *Q_data) { #endif #ifdef USE_TCEC @@ -157,13 +146,6 @@ __device__ void fill_Q(half *Q_data) { 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f }; - #else - half I4[16] = { - HALF_ONE, HALF_ZERO, HALF_ZERO, HALF_ZERO, - HALF_ZERO, HALF_ONE, HALF_ZERO, HALF_ZERO, - HALF_ZERO, HALF_ZERO, HALF_ONE, HALF_ZERO, - HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ONE - }; #endif /* @@ -222,24 +204,18 @@ __device__ void fill_Q(half *Q_data) { #ifdef USE_TCEC __device__ void reduce_via_tensor_units(float *data_to_be_reduced) { -#else -__device__ void reduce_via_tensor_units(half 
*data_to_be_reduced) { #endif __syncthreads(); if (threadIdx.x <= 31) { // Only one warp performs reduction #ifdef USE_TCEC __shared__ __align__ (256) float Q_data[TILE_SIZE]; - #else - __shared__ __align__ (256) half Q_data[TILE_SIZE]; #endif fill_Q(Q_data); #ifdef USE_TCEC __shared__ __align__ (256) float tmp[TILE_SIZE]; - #else - __shared__ __align__ (256) half tmp[TILE_SIZE]; #endif // Declaring and filling fragments - Those are *not* shared @@ -247,19 +223,12 @@ __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { #ifdef USE_TCEC mtk::wmma::tcec::fragment frag_P; mtk::wmma::tcec::fragment frag_V; - #else - wmma::fragment frag_P; - wmma::fragment frag_V; #endif #ifdef USE_TCEC mtk::wmma::tcec::fragment frag_Q; mtk::wmma::tcec::fragment frag_W; mtk::wmma::tcec::fragment frag_C; - #else - wmma::fragment frag_Q; - wmma::fragment frag_W; - wmma::fragment frag_C; #endif #ifdef USE_TCEC @@ -267,11 +236,6 @@ __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { mtk::wmma::tcec::fill_fragment(frag_V, 0.0f); // Output: initialize to zeros mtk::wmma::tcec::fill_fragment(frag_C, 0.0f); // Final result mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_data, 16); - #else - wmma::fill_fragment(frag_P, HALF_ONE); // P: only ones - wmma::fill_fragment(frag_V, HALF_ZERO); // Output: initialize to zeros - wmma::fill_fragment(frag_C, HALF_ZERO); // Final result - wmma::load_matrix_sync(frag_Q, Q_data, 16); #endif // 1. Accumulate the values: V <- AP + V @@ -282,10 +246,6 @@ __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { mtk::wmma::tcec::fragment frag_A; mtk::wmma::tcec::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); mtk::wmma::tcec::mma_sync(frag_V, frag_A, frag_P, frag_V); - #else - wmma::fragment frag_A; - wmma::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); - wmma::mma_sync(frag_V, frag_A, frag_P, frag_V); #endif } @@ -293,23 +253,16 @@ __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { #ifdef USE_TCEC mtk::wmma::tcec::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); mtk::wmma::tcec::load_matrix_sync(frag_W, tmp, 16); - #else - wmma::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); - wmma::load_matrix_sync(frag_W, tmp, 16); #endif // 2. Perform line sum: C <- QW + C (zero) #ifdef USE_TCEC mtk::wmma::tcec::mma_sync(frag_C, frag_Q, frag_W, frag_C); - #else - wmma::mma_sync(frag_C, frag_Q, frag_W, frag_C); #endif // 3. Store result in shared memory #ifdef USE_TCEC mtk::wmma::tcec::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); - #else - wmma::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); #endif } From 7be85ed3b2853b52552ff0d5bbb29f1f3d6ce84c Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Fri, 19 Jul 2024 08:16:47 +0200 Subject: [PATCH 07/12] Removing support for FP16. This is not needed when having TCEC. 
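Note (not part of this patch): the FP16 conversions were only needed for the plain wmma::mma_sync path, which stored and accumulated in half precision. The TCEC path keeps the operands in float: each value is split into a low-precision part that the tensor cores can consume plus a correction term, and the products of both parts are accumulated, which recovers close to FP32 accuracy. The host-side sketch below illustrates that split-and-correct idea only; it emulates TF32's 10-bit mantissa by truncation and is not how the wmma_extension library is actually implemented.

#include <cstdio>
#include <cstdint>
#include <cstring>

// Emulate rounding a float down to TF32 precision (10 explicit mantissa bits)
static float to_tf32(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof u);
    u &= 0xFFFFE000u;              // truncate the 13 least-significant mantissa bits
    std::memcpy(&x, &u, sizeof u);
    return x;
}

int main()
{
    float a = 1.000123456f, b = 0.999876543f;
    float a_hi = to_tf32(a), a_lo = to_tf32(a - a_hi);   // main value + correction term
    float b_hi = to_tf32(b), b_lo = to_tf32(b - b_hi);
    float tf32_only = a_hi * b_hi;
    float corrected = a_hi * b_hi + a_hi * b_lo + a_lo * b_hi;
    std::printf("fp32: %.9f  tf32: %.9f  corrected: %.9f\n",
                (double)(a * b), (double)tf32_only, (double)corrected);
    return 0;
}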
--- cuda/kernels.cu | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cuda/kernels.cu b/cuda/kernels.cu index e8b721bb..bc09e94e 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -100,12 +100,6 @@ __device__ inline int64_t ullitolli(uint64_t u) // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf -/* - * Half-precision support - * https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF__MISC.html - */ -#include - #ifdef USE_TCEC /* * WMMA Extension for single precision matmul using Tensor Cores @@ -195,7 +189,7 @@ __device__ void fill_Q(float *Q_data) { printf("\nQ_data"); for (uint i = 0; i < 16 * 16; i++) { if ((i % 16) == 0) {printf("\n[Row %u]: ", i/16);} - printf(" %2.2f ", __half2float(Q_data[i])); + printf(" %2.2f ", Q_data[i]); } printf("\n"); } From 0b76241d5c1faf2de0c28c0f8d74c1439ed326ed Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Fri, 19 Jul 2024 08:49:12 +0200 Subject: [PATCH 08/12] TCEC=ON is enabled by default, and thus, not needed to be specified as flag. --- Makefile.Cuda | 5 +---- cuda/calcMergeEneGra.cu | 9 --------- cuda/kernels.cu | 28 ---------------------------- 3 files changed, 1 insertion(+), 41 deletions(-) diff --git a/Makefile.Cuda b/Makefile.Cuda index 31b9ba7b..d8217ab5 100644 --- a/Makefile.Cuda +++ b/Makefile.Cuda @@ -56,10 +56,7 @@ endif ifeq ($(TENSOR), ON) NVTENSOR=-DUSE_NVTENSOR - - ifeq ($(TCEC), ON) - NVTENSOR+=-I./wmma_extension/include -DUSE_TCEC - endif + NVTENSOR+=-I./wmma_extension/include endif BIN := $(wildcard $(TARGET)*) diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index f86ea35f..7b19a894 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -651,24 +651,20 @@ __device__ void gpu_calc_energrad( // 1. Convert data-to-be-reduced from float to half // and place it in a shared memory array - #ifdef USE_TCEC __shared__ __align__(256) float data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; data_to_be_reduced[4*threadIdx.x] = torque_rot.x; data_to_be_reduced[4*threadIdx.x + 1] = torque_rot.y; data_to_be_reduced[4*threadIdx.x + 2] = torque_rot.z; data_to_be_reduced[4*threadIdx.x + 3] = energy; - #endif // 2. Perform reduction via tensor units reduce_via_tensor_units(data_to_be_reduced); // 3. Retrieve results from shared memory - #ifdef USE_TCEC torque_rot.x = data_to_be_reduced[0]; torque_rot.y = data_to_be_reduced[1]; torque_rot.z = data_to_be_reduced[2]; energy = data_to_be_reduced[3]; - #endif /* End: Reduction using tensor units */ #else @@ -699,22 +695,17 @@ __device__ void gpu_calc_energrad( // 1. Convert data-to-be-reduced from float to half // and place it in a shared memory array - - #ifdef USE_TCEC data_to_be_reduced[4*threadIdx.x] = gx; data_to_be_reduced[4*threadIdx.x + 1] = gy; data_to_be_reduced[4*threadIdx.x + 2] = gz; - #endif // 2. Perform reduction via tensor units reduce_via_tensor_units(data_to_be_reduced); // 3. 
Retrieve results from shared memory - #ifdef USE_TCEC gx = data_to_be_reduced[0]; gy = data_to_be_reduced[1]; gz = data_to_be_reduced[2]; - #endif /* End: Reduction using tensor units */ #else diff --git a/cuda/kernels.cu b/cuda/kernels.cu index bc09e94e..4be75d7c 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -100,16 +100,13 @@ __device__ inline int64_t ullitolli(uint64_t u) // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf - #ifdef USE_TCEC /* * WMMA Extension for single precision matmul using Tensor Cores * and error correction technique (TCEC) * https://github.com/wmmae/wmma_extension/blob/main/docs/mma_f32.md */ #include - using tf32 = nvcuda::wmma::precision::tf32; - #endif /* * Tensor Cores @@ -129,18 +126,14 @@ constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension -#ifdef USE_TCEC __device__ void fill_Q(float *Q_data) { -#endif - #ifdef USE_TCEC float I4[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f }; - #endif /* // Naive implementation: a single thread fills data in @@ -196,68 +189,47 @@ __device__ void fill_Q(float *Q_data) { */ } -#ifdef USE_TCEC __device__ void reduce_via_tensor_units(float *data_to_be_reduced) { -#endif __syncthreads(); if (threadIdx.x <= 31) { // Only one warp performs reduction - #ifdef USE_TCEC __shared__ __align__ (256) float Q_data[TILE_SIZE]; - #endif fill_Q(Q_data); - #ifdef USE_TCEC __shared__ __align__ (256) float tmp[TILE_SIZE]; - #endif // Declaring and filling fragments - Those are *not* shared - - #ifdef USE_TCEC mtk::wmma::tcec::fragment frag_P; mtk::wmma::tcec::fragment frag_V; - #endif - #ifdef USE_TCEC mtk::wmma::tcec::fragment frag_Q; mtk::wmma::tcec::fragment frag_W; mtk::wmma::tcec::fragment frag_C; - #endif - #ifdef USE_TCEC mtk::wmma::tcec::fill_fragment(frag_P, 1.0f); // P: only ones mtk::wmma::tcec::fill_fragment(frag_V, 0.0f); // Output: initialize to zeros mtk::wmma::tcec::fill_fragment(frag_C, 0.0f); // Final result mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_data, 16); - #endif // 1. Accumulate the values: V <- AP + V for(uint i = 0; i < (4 * NUM_OF_THREADS_PER_BLOCK)/TILE_SIZE; i++){ const unsigned int offset = i * TILE_SIZE; - #ifdef USE_TCEC mtk::wmma::tcec::fragment frag_A; mtk::wmma::tcec::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); mtk::wmma::tcec::mma_sync(frag_V, frag_A, frag_P, frag_V); - #endif } // W <- V (required since we need V as a "wmma::matrix_b") - #ifdef USE_TCEC mtk::wmma::tcec::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); mtk::wmma::tcec::load_matrix_sync(frag_W, tmp, 16); - #endif // 2. Perform line sum: C <- QW + C (zero) - #ifdef USE_TCEC mtk::wmma::tcec::mma_sync(frag_C, frag_Q, frag_W, frag_C); - #endif // 3. Store result in shared memory - #ifdef USE_TCEC mtk::wmma::tcec::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); - #endif } __syncthreads(); From 51012a86b4e6be9413304eb8b6924ef88be44e88 Mon Sep 17 00:00:00 2001 From: atillack Date: Tue, 23 Jul 2024 07:36:44 -0700 Subject: [PATCH 09/12] Temporarily undo changes in performdocking.cpp.Cuda to update commit. 
--- host/src/performdocking.cpp.Cuda | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/host/src/performdocking.cpp.Cuda b/host/src/performdocking.cpp.Cuda index 29e02e81..c35b12e4 100644 --- a/host/src/performdocking.cpp.Cuda +++ b/host/src/performdocking.cpp.Cuda @@ -583,25 +583,6 @@ parameters argc and argv: */ uint32_t kernel7_gxsize = 0; uint32_t kernel7_lxsize = threadsPerBlock; - - #ifdef USE_NVTENSOR - /* Begin: Reduction using tensor units */ - - // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: - // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" - // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf - - // threadsPerBlock should be at least 64 for the proposed matrix-based method to work. - // Reason: - // "A 256-element matrix is used to store the values to be reduced, - // and each thread holds exactly four values, - // which results in a minimum of 64 threads to fill a single matrix." - - assert(("Matrix-based reductions in ADADELTA: threadsPerBlock should be at least 64!", kernel7_lxsize >= 64)); - - /* End: Reduction using tensor units */ - #endif - uint32_t kernel8_gxsize = 0; uint32_t kernel8_lxsize = threadsPerBlock; if (cData.dockpars.lsearch_rate != 0.0f) { From a6e205492c154db15dc39fcffabccfdd404235d6 Mon Sep 17 00:00:00 2001 From: atillack Date: Tue, 23 Jul 2024 08:09:29 -0700 Subject: [PATCH 10/12] Added minimum NUMWI (64) and target architecture (80) for TENSOR code to Makefile. --- Makefile.Cuda | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Makefile.Cuda b/Makefile.Cuda index b1055039..9dbfe408 100644 --- a/Makefile.Cuda +++ b/Makefile.Cuda @@ -16,6 +16,13 @@ CPP = g++ UNAME := $(shell uname) TARGETS = 52 60 61 70 80 + +ifeq ($(TENSOR), ON) + NVTENSOR=-DUSE_NVTENSOR + NVTENSOR+=-I./wmma_extension/include + TARGETS = 80 +endif + CUDA_TARGETS=$(foreach target,$(TARGETS),-gencode arch=compute_$(target),code=sm_$(target)) ifeq ($(DEVICE), CPU) @@ -54,11 +61,6 @@ ifeq ($(OVERLAP), ON) PIPELINE=-DUSE_PIPELINE -fopenmp endif -ifeq ($(TENSOR), ON) - NVTENSOR=-DUSE_NVTENSOR - NVTENSOR+=-I./wmma_extension/include -endif - BIN := $(wildcard $(TARGET)*) # ------------------------------------------------------ @@ -67,6 +69,9 @@ BIN := $(wildcard $(TARGET)*) NUMWI= ifeq ($(NUMWI), 32) +ifeq ($(TENSOR), ON) +$(error NUMWI needs to be at least 64 with TENSOR=ON) +endif NWI=-DN32WI TARGET:=$(TARGET)_32wi else ifeq ($(NUMWI), 64) From d3b329954956d0815f7188655a944a94b9a18924 Mon Sep 17 00:00:00 2001 From: atillack Date: Thu, 25 Jul 2024 12:15:06 -0700 Subject: [PATCH 11/12] Optimization and cleanup of WMMA (tensor core) code. 
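Note (not part of this patch): the inlined fill writes the same matrix that fill_Q() used to build. With one warp of 32 threads each storing 8 consecutive elements, element 8*tid + i of the column-major 16x16 tile sits at row 8*(tid & 1) + i and column tid >> 1, and the condition ((i + 16 - (tid >> 1)) & 3) == 0 is exactly "row equals column modulo 4", i.e. the 4x4 block tiling of the identity. The host-side sketch below (illustrative only) checks that equivalence and that C = Q * (sum of A_i * P) really leaves the four per-slot block sums in the first column of C:

#include <cstdio>

int main()
{
    // Column-major 16x16 accessor: m[row + 16*col]
    auto idx = [](int r, int c) { return r + 16 * c; };

    // Q as the removed fill_Q() built it: Q[r][c] = 1 iff r = c (mod 4)
    float Q_ref[256] = {0}, Q_new[256] = {0};
    for (int r = 0; r < 16; ++r)
        for (int c = 0; c < 16; ++c)
            Q_ref[idx(r, c)] = ((r & 3) == (c & 3)) ? 1.0f : 0.0f;

    // Q as the new per-thread indexing writes it (32 threads, 8 elements each)
    for (unsigned tid = 0; tid < 32; ++tid) {
        const unsigned k  = tid << 3;
        const unsigned kk = 16 - (tid >> 1);
        for (unsigned i = 0; i < 8; ++i)
            Q_new[k + i] = ((i + kk) & 3) ? 0.0f : 1.0f;
    }
    for (int i = 0; i < 256; ++i)
        if (Q_ref[i] != Q_new[i]) { std::printf("Q mismatch at %d\n", i); return 1; }

    // One 16x16 tile A holds 4 values per thread: slot j of thread t at index 4*t + j
    float A[256], expected[4] = {0, 0, 0, 0};
    for (int t = 0; t < 64; ++t)
        for (int j = 0; j < 4; ++j) {
            A[4 * t + j] = 0.01f * t + j;          // arbitrary test data
            expected[j] += A[4 * t + j];
        }

    // V = A * P with P all ones: every column of V is the vector of row sums of A
    float V[256];
    for (int r = 0; r < 16; ++r) {
        float rowsum = 0.0f;
        for (int k = 0; k < 16; ++k) rowsum += A[idx(r, k)];
        for (int c = 0; c < 16; ++c) V[idx(r, c)] = rowsum;
    }

    // C = Q * V: row r of C adds up the rows of V that equal r modulo 4,
    // so rows 0..3 of column 0 hold the block-wide sums of value slots 0..3
    for (int r = 0; r < 4; ++r) {
        float c0 = 0.0f;
        for (int k = 0; k < 16; ++k) c0 += Q_ref[idx(r, k)] * V[idx(k, 0)];
        std::printf("slot %d: matrix %.4f  direct %.4f\n", r, c0, expected[r]);
    }
    return 0;
}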
--- cuda/kernels.cu | 81 ++++++------------------------------------------- 1 file changed, 10 insertions(+), 71 deletions(-) diff --git a/cuda/kernels.cu b/cuda/kernels.cu index 4be75d7c..f1560cbf 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -126,78 +126,11 @@ constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension -__device__ void fill_Q(float *Q_data) { - - float I4[16] = { - 1.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 1.0f - }; - - /* - // Naive implementation: a single thread fills data in - if (threadIdx.x == 0) { - for (uint i = 0; i < 4; i++) { // How many rows (of 4x4 blocks) are there in matrix A? - for (uint j = 0; j < 4; j++) { // How many cols (of 4x4 blocks) are there in matrix A? - for (uint ii = 0; ii < 4; ii++) { - for (uint jj = 0; jj < 4; jj++) { - Q_data[4*i + 64*j + ii + 16*jj] = I4 [4*ii + jj]; - } - } - } - } - } - */ - - // Slightly improved multi-threaded implementation - for (uint i = threadIdx.x; i < 4; i+=blockDim.x) { // How many rows (of 4x4 blocks) are there in matrix A? - for (uint j = 0; j < 4; j++) { // How many cols (of 4x4 blocks) are there in matrix A? - for (uint ii = 0; ii < 4; ii++) { - for (uint jj = 0; jj < 4; jj++) { - Q_data[4*i + 64*j + ii + 16*jj] = I4 [4*ii + jj]; - } - } - } - } - - /* - // Further improved multi-threaded implementation - // (It didn't provide significant performance improvements -> commented out) - // Fusing two outer loops into a single one - // To do that: coeffs = 4i + 64j - constexpr uint coeffs [16] = {0, 64, 128, 192, 4, 68, 132, 196, 8, 72, 136, 200, 12, 76, 140, 204}; - for (uint k = threadIdx.x; k < 16; k+=blockDim.x) { - for (uint ii = 0; ii < 4; ii++) { - for (uint jj = 0; jj < 4; jj++) { - Q_data[coeffs[k] + ii + 16*jj] = I4 [4*ii + jj]; - } - } - } - */ - - /* - // Enable this block to print matrix values - if (blockIdx.x == 0 && threadIdx.x == 0) { - printf("\nQ_data"); - for (uint i = 0; i < 16 * 16; i++) { - if ((i % 16) == 0) {printf("\n[Row %u]: ", i/16);} - printf(" %2.2f ", Q_data[i]); - } - printf("\n"); - } - */ -} - __device__ void reduce_via_tensor_units(float *data_to_be_reduced) { __syncthreads(); if (threadIdx.x <= 31) { // Only one warp performs reduction - __shared__ __align__ (256) float Q_data[TILE_SIZE]; - - fill_Q(Q_data); - - __shared__ __align__ (256) float tmp[TILE_SIZE]; + __shared__ __align__ (256) float Q_square[TILE_SIZE]; // storage for 16x16 matrix and 4x4 tiles of I4 matrix after // Declaring and filling fragments - Those are *not* shared mtk::wmma::tcec::fragment frag_P; @@ -210,7 +143,6 @@ __device__ void reduce_via_tensor_units(float *data_to_be_reduced) { mtk::wmma::tcec::fill_fragment(frag_P, 1.0f); // P: only ones mtk::wmma::tcec::fill_fragment(frag_V, 0.0f); // Output: initialize to zeros mtk::wmma::tcec::fill_fragment(frag_C, 0.0f); // Final result - mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_data, 16); // 1. 
Accumulate the values: V <- AP + V for(uint i = 0; i < (4 * NUM_OF_THREADS_PER_BLOCK)/TILE_SIZE; i++){ @@ -222,10 +154,17 @@ __device__ void reduce_via_tensor_units(float *data_to_be_reduced) { } // W <- V (required since we need V as a "wmma::matrix_b") - mtk::wmma::tcec::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); - mtk::wmma::tcec::load_matrix_sync(frag_W, tmp, 16); + mtk::wmma::tcec::store_matrix_sync(Q_square, frag_V, 16, wmma::mem_col_major); + mtk::wmma::tcec::load_matrix_sync(frag_W, Q_square, 16); // 2. Perform line sum: C <- QW + C (zero) + // a) create a 4x4 tiled matrix containing 4x4 identity matrix in each tile: + // - TENSOR=ON requires NUMWI to be larger than 32, so the following works and neatly gets rid of an additional function: + const unsigned int k = (threadIdx.x<<3); + const unsigned int kk = 16 - (threadIdx.x>>1); + for(uint i = 0; i < 8; i++) Q_square[k + i] = ((i + kk) & 3) ? 0.0f : 1.0f; + mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_square, 16); + // b) perform sum mtk::wmma::tcec::mma_sync(frag_C, frag_Q, frag_W, frag_C); // 3. Store result in shared memory From 0dde87bbbc36052d6c401af1dabd44140764530c Mon Sep 17 00:00:00 2001 From: atillack Date: Thu, 25 Jul 2024 12:52:12 -0700 Subject: [PATCH 12/12] Automatically set minimum tensor core compute capability 8.0 and added error message if device is below. --- Makefile | 6 +++++- host/src/performdocking.cpp | 7 +++++++ test_cuda.sh | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 3e44cdc1..3b41fa03 100755 --- a/Makefile +++ b/Makefile @@ -27,7 +27,11 @@ OVERLAP = ON ifeq ($(DEVICE), $(filter $(DEVICE),GPU CUDA)) -TARGETS_SUPPORTED := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)" "$(TARGETS)" "$(DEVICE)") +MIN_COMPUTE:=50 +ifeq ($(TENSOR), ON) +MIN_COMPUTE:=80 +endif +TARGETS_SUPPORTED := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)" "$(TARGETS)" "$(MIN_COMPUTE)") # if user specifies DEVICE=GPU the test result determines wether CUDA will be used or not ifeq ($(TARGETS_SUPPORTED),) ifeq ($(DEVICE),CUDA) diff --git a/host/src/performdocking.cpp b/host/src/performdocking.cpp index 2375386c..a6e3d661 100644 --- a/host/src/performdocking.cpp +++ b/host/src/performdocking.cpp @@ -450,6 +450,13 @@ void setup_gpu_for_docking( cudaDeviceProp props; RTERROR(cudaGetDevice(&(cData.devnum)),"ERROR in cudaGetDevice:"); RTERROR(cudaGetDeviceProperties(&props,cData.devnum),"ERROR in cudaGetDeviceProperties:"); +#ifdef USE_NVTENSOR + if(props.major < 8){ + printf("Error: Compute capability 8.0 or higher is needed for tensor core sum reductions.\n"); + printf(" Available device %s has compute capability %d.%d.\n", props.name, props.major, props.minor); + exit(-1); + } +#endif tData.device_name = (char*) malloc(strlen(props.name)+32); // make sure array is large enough to hold device number text too strcpy(tData.device_name, props.name); if(gpuCount>1) snprintf(&tData.device_name[strlen(props.name)], strlen(props.name)+32, " (#%d / %d)",cData.devnum+1,gpuCount); diff --git a/test_cuda.sh b/test_cuda.sh index 12fb3d86..e39cceb0 100755 --- a/test_cuda.sh +++ b/test_cuda.sh @@ -26,7 +26,7 @@ if [[ "$4" != "" ]]; then done TARGETS="$4" else - TARGETS=`awk -F'_' '{ if(\$2>50) print \$2 }' <<< "$TARGETS_SUPPORTED" | tr "\n" " "` + TARGETS=`awk -F'_' "{ if(\\$2>=$5) print \\$2 }" <<< "$TARGETS_SUPPORTED" | tr "\n" " "` fi printf "Compiling for targets: %s\n" "$TARGETS" >&2 cd "$script_dir"
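
Build note for the series (the exact invocation is illustrative and may need adjusting to the local setup): the wmma_extension sources are pulled in as a git submodule, TENSOR=ON selects the tensor-core reduction path, NUMWI must be at least 64, and the target device needs compute capability 8.0 or newer (TARGETS defaults to 80 when TENSOR=ON).

# Fetch the WMMA extension submodule added earlier in the series
git submodule update --init wmma_extension

# Build with tensor-core reductions enabled
make DEVICE=GPU NUMWI=64 TENSOR=ON

# Optional self-test, mirroring the example given in cuda/kernels.cu
make DEVICE=GPU NUMWI=64 TENSOR=ON TESTLS=ad test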