Merge pull request #252 from L30nardoSV/tensorcores

Speeding up sum reductions in ADADELTA by using Tensor Cores
ccsb-scripps · Nov 19, 2024 · e63e6f6 · e63e6f6
2 parents a0c8c95 + 0dde87b
commit e63e6f6
Show file tree

Hide file tree

Showing 8 changed files with 171 additions and 9 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "wmma_extension"]
+	path = wmma_extension
+	url = https://github.com/wmmae/wmma_extension
diff --git a/Makefile b/Makefile
@@ -27,7 +27,11 @@
 OVERLAP = ON
 
 ifeq ($(DEVICE), $(filter $(DEVICE),GPU CUDA))
-TARGETS_SUPPORTED := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)" "$(TARGETS)" "$(DEVICE)")
+MIN_COMPUTE:=50
+ifeq ($(TENSOR), ON)
+MIN_COMPUTE:=80
+endif
+TARGETS_SUPPORTED := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)" "$(TARGETS)" "$(MIN_COMPUTE)")
 # if user specifies DEVICE=GPU the test result determines wether CUDA will be used or not
 ifeq ($(TARGETS_SUPPORTED),)
 ifeq ($(DEVICE),CUDA)

diff --git a/Makefile.Cuda b/Makefile.Cuda
@@ -16,6 +16,13 @@ CPP = g++
 UNAME := $(shell uname)
 
 TARGETS = 52 60 61 70 80
+
+ifeq ($(TENSOR), ON)
+	NVTENSOR=-DUSE_NVTENSOR
+	NVTENSOR+=-I./wmma_extension/include
+	TARGETS = 80
+endif
+
 CUDA_TARGETS=$(foreach target,$(TARGETS),-gencode arch=compute_$(target),code=sm_$(target))
 
 ifeq ($(DEVICE), CPU)
@@ -54,7 +61,6 @@ ifeq ($(OVERLAP), ON)
 	PIPELINE=-DUSE_PIPELINE -fopenmp
 endif
 
-
 BIN := $(wildcard $(TARGET)*)
 
 # ------------------------------------------------------
@@ -63,6 +69,9 @@ BIN := $(wildcard $(TARGET)*)
 NUMWI=
 
 ifeq ($(NUMWI), 32)
+ifeq ($(TENSOR), ON)
+$(error NUMWI needs to be at least 64 with TENSOR=ON)
+endif
 	NWI=-DN32WI
 	TARGET:=$(TARGET)_32wi
 else ifeq ($(NUMWI), 64)
@@ -177,15 +186,15 @@ TOOL_CFLAGS+=-DVERSION=\"$(GIT_VERSION)\"
 # ------------------------------------------------------
 
 kernels: $(KERNEL_SRC)
-	$(NVCC) $(NWI) $(REP) $(CUDA_FLAGS) $(IFLAGS) $(CUDA_INCLUDES) -c $(KRNL_DIR)/kernels.cu
+	$(NVCC) $(NWI) $(REP) $(CUDA_FLAGS) $(IFLAGS) $(CUDA_INCLUDES) $(NVTENSOR) -c $(KRNL_DIR)/kernels.cu
 
 otool:
 	@echo "Building" $(TOOL_TARGET) "..."
 	$(CPP) \
 	$(shell ls $(HOST_SRC_DIR)/*.cpp) \
 	$(TOOL_CFLAGS) \
 	-o$(BIN_DIR)/$(TOOL_TARGET) \
-	$(PIPELINE) $(OPT) -DTOOLMODE $(REP)
+	$(PIPELINE) $(NVTENSOR) $(OPT) -DTOOLMODE $(REP)
 
 odock: check-env-all kernels
 	@echo "Building" $(TARGET) "..."
@@ -194,7 +203,7 @@ odock: check-env-all kernels
 	$(CFLAGS) \
 	$(LIB_CUDA) \
 	-o$(BIN_DIR)/$(TARGET) \
-	$(DEV) $(NWI) $(PIPELINE) $(OPT) $(DD) $(REP) $(KFLAGS)
+	$(DEV) $(NWI) $(PIPELINE) $(NVTENSOR) $(OPT) $(DD) $(REP) $(KFLAGS)
 
 # Example
 # 1ac8: for testing gradients of translation and rotation genes

diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu
@@ -642,23 +642,77 @@ __device__ void gpu_calc_energrad(
 		torque_rot.z += tr.z;
 	}
 
-	// Do a reduction over the total gradient containing prepared "gradient_intra_*" values
+#ifdef USE_NVTENSOR
+	/* Begin: Reduction using tensor units */
+
+	// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH:
+	// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx"
+	// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf
+
+	// 1. Convert data-to-be-reduced from float to half
+	// and place it in a shared memory array
+	__shared__ __align__(256) float data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK];
+	data_to_be_reduced[4*threadIdx.x] = torque_rot.x;
+	data_to_be_reduced[4*threadIdx.x + 1] = torque_rot.y;
+	data_to_be_reduced[4*threadIdx.x + 2] = torque_rot.z;
+	data_to_be_reduced[4*threadIdx.x + 3] = energy;
+
+	// 2. Perform reduction via tensor units
+	reduce_via_tensor_units(data_to_be_reduced);
+
+	// 3. Retrieve results from shared memory
+	torque_rot.x = data_to_be_reduced[0];
+	torque_rot.y = data_to_be_reduced[1];
+	torque_rot.z = data_to_be_reduced[2];
+	energy = data_to_be_reduced[3];
+
+	/* End: Reduction using tensor units */
+#else
+	// Reduction over the total gradient containing prepared "gradient_intra_*" values
 	REDUCEFLOATSUM(torque_rot.x, pFloatAccumulator);
 	REDUCEFLOATSUM(torque_rot.y, pFloatAccumulator);
 	REDUCEFLOATSUM(torque_rot.z, pFloatAccumulator);
 
+	// Reduction over partial energies and prepared "gradient_intra_*" values
+	REDUCEFLOATSUM(energy, pFloatAccumulator);
+#endif
+
 	// TODO
 	// -------------------------------------------------------
 	// Obtaining energy and translation-related gradients
 	// -------------------------------------------------------
-	// reduction over partial energies and prepared "gradient_intra_*" values
-	REDUCEFLOATSUM(energy, pFloatAccumulator);
+
 #if defined (DEBUG_ENERGY_KERNEL)
 	REDUCEFLOATSUM(intraE, pFloatAccumulator);
 #endif
+
+#ifdef USE_NVTENSOR
+	/* Begin: Reduction using tensor units */
+
+	// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH:
+	// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx"
+	// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf
+
+	// 1. Convert data-to-be-reduced from float to half
+	// and place it in a shared memory array
+	data_to_be_reduced[4*threadIdx.x] = gx;
+	data_to_be_reduced[4*threadIdx.x + 1] = gy;
+	data_to_be_reduced[4*threadIdx.x + 2] = gz;
+
+	// 2. Perform reduction via tensor units
+	reduce_via_tensor_units(data_to_be_reduced);
+
+	// 3. Retrieve results from shared memory
+	gx = data_to_be_reduced[0];
+	gy = data_to_be_reduced[1];
+	gz = data_to_be_reduced[2];
+
+	/* End: Reduction using tensor units */
+#else
 	REDUCEFLOATSUM(gx, pFloatAccumulator);
 	REDUCEFLOATSUM(gy, pFloatAccumulator);
 	REDUCEFLOATSUM(gz, pFloatAccumulator);
+#endif
 
 	global_energy = energy;
 #ifndef FLOAT_GRADIENTS

diff --git a/cuda/kernels.cu b/cuda/kernels.cu
@@ -93,6 +93,90 @@ __device__ inline int64_t ullitolli(uint64_t u)
 #define ATOMICADDF32(pAccumulator, value) atomicAdd(pAccumulator, (value))
 #define ATOMICSUBF32(pAccumulator, value) atomicAdd(pAccumulator, -(value))
 
+#ifdef USE_NVTENSOR
+/* Begin: Reduction using tensor units */
+
+// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH:
+// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx"
+// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf
+
+	/*
+	* WMMA Extension for single precision matmul using Tensor Cores
+	* and error correction technique (TCEC)
+	* https://github.com/wmmae/wmma_extension/blob/main/docs/mma_f32.md
+	*/
+	#include <wmma_extension/tcec/tcec.hpp>
+	using tf32 = nvcuda::wmma::precision::tf32;
+
+/*
+ * Tensor Cores
+ * https://developer.nvidia.com/blog/programming-tensor-cores-cuda-9
+ *
+ * Don't forget to compile specifying the architecture, e.g., sm_86.
+ * For AutoDock-GPU, this can be done via the TARGETS option.
+ * make DEVICE=GPU TESTLS=ad NUMWI=64 TARGETS=86 test
+ * https://stackoverflow.com/a/53634598/1616865
+ */
+#include <mma.h>
+using namespace nvcuda;
+
+#define TILE_SIZE (16 * 16)
+
+constexpr int rowscols_M = 16;	// Number of rows (or cols) in the M dimension
+constexpr int rowscols_N = 16;	// Number of rows (or cols) in the N dimension
+constexpr int rowscols_K = 16;	// Number of rows (or cols) in the K dimension
+
+__device__ void reduce_via_tensor_units(float *data_to_be_reduced) {
+	__syncthreads();
+
+	if (threadIdx.x <= 31) { // Only one warp performs reduction
+		__shared__ __align__ (256) float Q_square[TILE_SIZE]; // storage for 16x16 matrix and 4x4 tiles of I4 matrix after
+
+		// Declaring and filling fragments - Those are *not* shared
+		mtk::wmma::tcec::fragment<wmma::matrix_b, rowscols_M, rowscols_N, rowscols_K, tf32, wmma::col_major> frag_P;
+		mtk::wmma::tcec::fragment<wmma::accumulator, rowscols_M, rowscols_N, rowscols_K, tf32> frag_V;
+
+		mtk::wmma::tcec::fragment<wmma::matrix_a, rowscols_M, rowscols_N, rowscols_K, tf32, wmma::col_major> frag_Q;
+		mtk::wmma::tcec::fragment<wmma::matrix_b, rowscols_M, rowscols_N, rowscols_K, tf32, wmma::col_major> frag_W;
+		mtk::wmma::tcec::fragment<wmma::accumulator, rowscols_M, rowscols_N, rowscols_K, tf32> frag_C;
+
+		mtk::wmma::tcec::fill_fragment(frag_P, 1.0f); // P: only ones
+		mtk::wmma::tcec::fill_fragment(frag_V, 0.0f); // Output: initialize to zeros
+		mtk::wmma::tcec::fill_fragment(frag_C, 0.0f); // Final result
+
+		// 1. Accumulate the values: V <- AP + V
+		for(uint i = 0; i < (4 * NUM_OF_THREADS_PER_BLOCK)/TILE_SIZE; i++){
+			const unsigned int offset = i * TILE_SIZE;
+
+			mtk::wmma::tcec::fragment<wmma::matrix_a, rowscols_M, rowscols_N, rowscols_K, tf32, wmma::col_major> frag_A;
+			mtk::wmma::tcec::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16);
+			mtk::wmma::tcec::mma_sync(frag_V, frag_A, frag_P, frag_V);
+		}
+
+		// W <- V (required since we need V as a "wmma::matrix_b")
+		mtk::wmma::tcec::store_matrix_sync(Q_square, frag_V, 16, wmma::mem_col_major);
+		mtk::wmma::tcec::load_matrix_sync(frag_W, Q_square, 16);
+
+		// 2. Perform line sum: C <- QW + C (zero)
+		//    a) create a 4x4 tiled matrix containing 4x4 identity matrix in each tile:
+		//       - TENSOR=ON requires NUMWI to be larger than 32, so the following works and neatly gets rid of an additional function:
+		const unsigned int k  = (threadIdx.x<<3);
+		const unsigned int kk = 16 - (threadIdx.x>>1);
+		for(uint i = 0; i < 8; i++) Q_square[k + i] = ((i + kk) & 3) ? 0.0f : 1.0f;
+		mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_square, 16);
+		//    b) perform sum
+		mtk::wmma::tcec::mma_sync(frag_C, frag_Q, frag_W, frag_C);
+
+		// 3. Store result in shared memory
+		mtk::wmma::tcec::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major);
+	}
+
+	__syncthreads();
+}
+
+/* End: Reduction using tensor units */
+#endif
+
 #define REDUCEFLOATSUM(value, pAccumulator) \
 	if (threadIdx.x == 0) \
 	{ \

diff --git a/host/src/performdocking.cpp b/host/src/performdocking.cpp
@@ -450,6 +450,13 @@ void setup_gpu_for_docking(
 	cudaDeviceProp props;
 	RTERROR(cudaGetDevice(&(cData.devnum)),"ERROR in cudaGetDevice:");
 	RTERROR(cudaGetDeviceProperties(&props,cData.devnum),"ERROR in cudaGetDeviceProperties:");
+#ifdef USE_NVTENSOR
+	if(props.major < 8){
+		printf("Error: Compute capability 8.0 or higher is needed for tensor core sum reductions.\n");
+		printf("       Available device %s has compute capability %d.%d.\n", props.name, props.major, props.minor);
+		exit(-1);
+	}
+#endif
 	tData.device_name = (char*) malloc(strlen(props.name)+32); // make sure array is large enough to hold device number text too
 	strcpy(tData.device_name, props.name);
 	if(gpuCount>1) snprintf(&tData.device_name[strlen(props.name)], strlen(props.name)+32, " (#%d / %d)",cData.devnum+1,gpuCount);

diff --git a/test_cuda.sh b/test_cuda.sh
@@ -26,7 +26,7 @@ if [[ "$4" != "" ]]; then
 	done
 	TARGETS="$4"
 else
-	TARGETS=`awk -F'_' '{ if(\$2>50) print \$2 }' <<< "$TARGETS_SUPPORTED" | tr "\n" " "`
+	TARGETS=`awk -F'_' "{ if(\\$2>=$5) print \\$2 }" <<< "$TARGETS_SUPPORTED" | tr "\n" " "`
 fi
 printf "Compiling for targets: %s\n" "$TARGETS" >&2
 cd "$script_dir"

diff --git a/wmma_extension b/wmma_extension