From 6d0eec1f337018d15c614e75fb6b15ccc989dc6c Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Fri, 5 Jan 2024 08:31:57 -0500 Subject: [PATCH 01/12] Offloading reductions to Tensor Cores. --- cuda/calcMergeEneGra.cu | 54 +++++++++++++-- cuda/kernels.cu | 146 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+), 7 deletions(-) diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index 1e2f0d54..e5616050 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -643,22 +643,62 @@ __device__ void gpu_calc_energrad( } // Do a reduction over the total gradient containing prepared "gradient_intra_*" values - REDUCEFLOATSUM(torque_rot.x, pFloatAccumulator); - REDUCEFLOATSUM(torque_rot.y, pFloatAccumulator); - REDUCEFLOATSUM(torque_rot.z, pFloatAccumulator); +// REDUCEFLOATSUM(torque_rot.x, pFloatAccumulator); +// REDUCEFLOATSUM(torque_rot.y, pFloatAccumulator); +// REDUCEFLOATSUM(torque_rot.z, pFloatAccumulator); // TODO // ------------------------------------------------------- // Obtaining energy and translation-related gradients // ------------------------------------------------------- // reduction over partial energies and prepared "gradient_intra_*" values - REDUCEFLOATSUM(energy, pFloatAccumulator); +// REDUCEFLOATSUM(energy, pFloatAccumulator); + + /* Begin: Reduction using tensor units */ + + // 1. Convert data-to-be-reduced from float to half + // and place it in a shared memory array + __shared__ __align__(256) half data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; + data_to_be_reduced[4*threadIdx.x] = __float2half(torque_rot.x); + data_to_be_reduced[4*threadIdx.x + 1] = __float2half(torque_rot.y); + data_to_be_reduced[4*threadIdx.x + 2] = __float2half(torque_rot.z); + data_to_be_reduced[4*threadIdx.x + 3] = __float2half(energy); + + // 2. Perform reduction via tensor units + reduce_via_tensor_units(data_to_be_reduced); + + // 3. Retrieve results from shared memory + torque_rot.x = __half2float(data_to_be_reduced[0]); + torque_rot.y = __half2float(data_to_be_reduced[1]); + torque_rot.z = __half2float(data_to_be_reduced[2]); + energy = __half2float(data_to_be_reduced[3]); + + /* End: Reduction using tensor units */ + #if defined (DEBUG_ENERGY_KERNEL) REDUCEFLOATSUM(intraE, pFloatAccumulator); #endif - REDUCEFLOATSUM(gx, pFloatAccumulator); - REDUCEFLOATSUM(gy, pFloatAccumulator); - REDUCEFLOATSUM(gz, pFloatAccumulator); +// REDUCEFLOATSUM(gx, pFloatAccumulator); +// REDUCEFLOATSUM(gy, pFloatAccumulator); +// REDUCEFLOATSUM(gz, pFloatAccumulator); + + /* Begin: Reduction using tensor units */ + + // 1. Convert data-to-be-reduced from float to half + // and place it in a shared memory array + data_to_be_reduced[4*threadIdx.x] = __float2half(gx); + data_to_be_reduced[4*threadIdx.x + 1] = __float2half(gy); + data_to_be_reduced[4*threadIdx.x + 2] = __float2half(gz); + + // 2. Perform reduction via tensor units + reduce_via_tensor_units(data_to_be_reduced); + + // 3. 
Retrieve results from shared memory + gx = __half2float(data_to_be_reduced[0]); + gy = __half2float(data_to_be_reduced[1]); + gz = __half2float(data_to_be_reduced[2]); + + /* End: Reduction using tensor units */ global_energy = energy; #ifndef FLOAT_GRADIENTS diff --git a/cuda/kernels.cu b/cuda/kernels.cu index c98c69b5..4c477ddd 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -93,6 +93,152 @@ __device__ inline int64_t ullitolli(uint64_t u) #define ATOMICADDF32(pAccumulator, value) atomicAdd(pAccumulator, (value)) #define ATOMICSUBF32(pAccumulator, value) atomicAdd(pAccumulator, -(value)) +/* Begin: Reduction using tensor units */ +/* + * Half-precision support + * https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF__MISC.html + */ +#include + +/* + * Tensor Cores + * https://developer.nvidia.com/blog/programming-tensor-cores-cuda-9 + * + * Don't forget to compile specifying the architecture, e.g., sm_86. + * For AutoDock-GPU, this can be done via the TARGETS option. + * make DEVICE=GPU TESTLS=ad NUMWI=64 TARGETS=86 test + * https://stackoverflow.com/a/53634598/1616865 + */ +#include +using namespace nvcuda; + +#define TILE_SIZE (16 * 16) + +constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension +constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension +constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension + +// Half constants +// CUDART_ONE_FP16 was not recognized by the NVCC compiler +// So its value is indicated explicitly +// https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS.html#group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS +#define HALF_ONE __ushort_as_half((unsigned short)0x3C00U) +#define HALF_ZERO __ushort_as_half((unsigned short)0x0000U) + +__device__ void fill_Q(half *Q_data) { + + half I4[16] = { + HALF_ONE, HALF_ZERO, HALF_ZERO, HALF_ZERO, + HALF_ZERO, HALF_ONE, HALF_ZERO, HALF_ZERO, + HALF_ZERO, HALF_ZERO, HALF_ONE, HALF_ZERO, + HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ONE + }; + + /* + // Naive implementation: a single thread fills data in + if (threadIdx.x == 0) { + for (uint i = 0; i < 4; i++) { // How many rows (of 4x4 blocks) are there in matrix A? + for (uint j = 0; j < 4; j++) { // How many cols (of 4x4 blocks) are there in matrix A? + for (uint ii = 0; ii < 4; ii++) { + for (uint jj = 0; jj < 4; jj++) { + Q_data[4*i + 64*j + ii + 16*jj] = I4 [4*ii + jj]; + } + } + } + } + } + */ + + // Slightly improved multi-threaded implementation + for (uint i = threadIdx.x; i < 4; i+=blockDim.x) { // How many rows (of 4x4 blocks) are there in matrix A? + for (uint j = 0; j < 4; j++) { // How many cols (of 4x4 blocks) are there in matrix A? 
+ for (uint ii = 0; ii < 4; ii++) { + for (uint jj = 0; jj < 4; jj++) { + Q_data[4*i + 64*j + ii + 16*jj] = I4 [4*ii + jj]; + } + } + } + } + + /* + // Further improved multi-threaded implementation + // (It didn't provide significant performance improvements -> commented out) + // Fusing two outer loops into a single one + // To do that: coeffs = 4i + 64j + constexpr uint coeffs [16] = {0, 64, 128, 192, 4, 68, 132, 196, 8, 72, 136, 200, 12, 76, 140, 204}; + for (uint k = threadIdx.x; k < 16; k+=blockDim.x) { + for (uint ii = 0; ii < 4; ii++) { + for (uint jj = 0; jj < 4; jj++) { + Q_data[coeffs[k] + ii + 16*jj] = I4 [4*ii + jj]; + } + } + } + */ + + /* + // Enable this block to print matrix values + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("\nQ_data"); + for (uint i = 0; i < 16 * 16; i++) { + if ((i % 16) == 0) {printf("\n[Row %u]: ", i/16);} + printf(" %2.2f ", __half2float(Q_data[i])); + } + printf("\n"); + } + */ +} + +// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: +// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" +// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf +__device__ void reduce_via_tensor_units(half *data_to_be_reduced) { + + __syncthreads(); + + if (threadIdx.x <= 31) { // Only one warp performs reduction + __shared__ __align__ (256) half Q_data[TILE_SIZE]; + + fill_Q(Q_data); + + __shared__ __align__ (256) half tmp[TILE_SIZE]; + + // Declaring and filling fragments - Those are *not* shared + wmma::fragment frag_P; + wmma::fragment frag_V; + + wmma::fragment frag_Q; + wmma::fragment frag_W; + wmma::fragment frag_C; + + wmma::fill_fragment(frag_P, HALF_ONE); // P: only ones + wmma::fill_fragment(frag_V, HALF_ZERO); // Output: initialize to zeros + wmma::fill_fragment(frag_C, HALF_ZERO); // Final result + wmma::load_matrix_sync(frag_Q, Q_data, 16); + + // 1. Accumulate the values: V <- AP + V + for(uint i = 0; i < (4 * NUM_OF_THREADS_PER_BLOCK)/TILE_SIZE; i++){ + const unsigned int offset = i * TILE_SIZE; + wmma::fragment frag_A; + wmma::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); + wmma::mma_sync(frag_V, frag_A, frag_P, frag_V); + } + + // W <- V (required since we need V as a "wmma::matrix_b") + wmma::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); + wmma::load_matrix_sync(frag_W, tmp, 16); + + // 2. Perform line sum: C <- QW + C (zero) + wmma::mma_sync(frag_C, frag_Q, frag_W, frag_C); + + // 3. Store result in shared memory + wmma::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); + } + + __syncthreads(); +} + +/* End: Reduction using tensor units */ + #define REDUCEFLOATSUM(value, pAccumulator) \ if (threadIdx.x == 0) \ { \ From 9df1e350e09b47315984de36573ffd2b5a9d7d65 Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Tue, 9 Jan 2024 09:06:11 -0500 Subject: [PATCH 02/12] Asserting minimum number-of-threads-per-block (= 64) for the matrix-based reduction method to work correctly. --- host/src/performdocking.cpp.Cuda | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/host/src/performdocking.cpp.Cuda b/host/src/performdocking.cpp.Cuda index c35b12e4..cf65d33c 100644 --- a/host/src/performdocking.cpp.Cuda +++ b/host/src/performdocking.cpp.Cuda @@ -583,6 +583,23 @@ parameters argc and argv: */ uint32_t kernel7_gxsize = 0; uint32_t kernel7_lxsize = threadsPerBlock; + +/* Begin: Reduction using tensor units */ + +// Implementation based on M.Sc. 
thesis by Gabin Schieffer at KTH: +// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" +// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + +// threadsPerBlock should be at least 64 for the proposed matrix-based method to work. +// Reason: +// "A 256-element matrix is used to store the values to be reduced, +// and each thread holds exactly four values, +// which results in a minimum of 64 threads to fill a single matrix." + +assert(("Matrix-based reductions in ADADELTA: threadsPerBlock should be at least 64!", kernel7_lxsize >= 64)); + +/* End: Reduction using tensor units */ + uint32_t kernel8_gxsize = 0; uint32_t kernel8_lxsize = threadsPerBlock; if (cData.dockpars.lsearch_rate != 0.0f) { From 7798aa422e176a02c629ce864ef0f5c63d64d17b Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Tue, 9 Jan 2024 09:10:23 -0500 Subject: [PATCH 03/12] Minor improvements in comments. --- cuda/calcMergeEneGra.cu | 8 ++++++++ cuda/kernels.cu | 8 +++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index e5616050..54e00936 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -656,6 +656,10 @@ __device__ void gpu_calc_energrad( /* Begin: Reduction using tensor units */ + // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: + // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" + // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + // 1. Convert data-to-be-reduced from float to half // and place it in a shared memory array __shared__ __align__(256) half data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; @@ -684,6 +688,10 @@ __device__ void gpu_calc_energrad( /* Begin: Reduction using tensor units */ + // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: + // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" + // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + // 1. Convert data-to-be-reduced from float to half // and place it in a shared memory array data_to_be_reduced[4*threadIdx.x] = __float2half(gx); diff --git a/cuda/kernels.cu b/cuda/kernels.cu index 4c477ddd..894ccd62 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -94,6 +94,11 @@ __device__ inline int64_t ullitolli(uint64_t u) #define ATOMICSUBF32(pAccumulator, value) atomicAdd(pAccumulator, -(value)) /* Begin: Reduction using tensor units */ + +// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: +// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" +// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + /* * Half-precision support * https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF__MISC.html @@ -188,9 +193,6 @@ __device__ void fill_Q(half *Q_data) { */ } -// Implementation based on M.Sc. 
thesis by Gabin Schieffer at KTH: -// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" -// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { __syncthreads(); From 10b07fa6a7defde4db8557ccbec41d4cb537de6e Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Thu, 11 Jan 2024 13:48:11 -0500 Subject: [PATCH 04/12] Adding TENSOR directive to enable/disable usage of tensor cores for reductions in ADADELTA. --- Makefile.Cuda | 9 +++++--- cuda/calcMergeEneGra.cu | 36 +++++++++++++++++++------------- cuda/kernels.cu | 2 ++ host/src/performdocking.cpp.Cuda | 24 +++++++++++---------- 4 files changed, 42 insertions(+), 29 deletions(-) diff --git a/Makefile.Cuda b/Makefile.Cuda index 3356cf8d..42018021 100644 --- a/Makefile.Cuda +++ b/Makefile.Cuda @@ -54,6 +54,9 @@ ifeq ($(OVERLAP), ON) PIPELINE=-DUSE_PIPELINE -fopenmp endif +ifeq ($(TENSOR), ON) + NVTENSOR=-DUSE_NVTENSOR +endif BIN := $(wildcard $(TARGET)*) @@ -184,7 +187,7 @@ unlink-code: rm -f $(HOST_INC_DIR)/performdocking.h $(HOST_SRC_DIR)/performdocking.cpp kernels: $(KERNEL_SRC) - $(NVCC) $(NWI) $(REP) $(CUDA_FLAGS) $(IFLAGS) $(CUDA_INCLUDES) -c $(KRNL_DIR)/kernels.cu + $(NVCC) $(NWI) $(REP) $(CUDA_FLAGS) $(IFLAGS) $(CUDA_INCLUDES) $(NVTENSOR) -c $(KRNL_DIR)/kernels.cu otool: unlink-code @echo "Building" $(TOOL_TARGET) "..." @@ -192,7 +195,7 @@ otool: unlink-code $(shell ls $(HOST_SRC_DIR)/*.cpp) \ $(TOOL_CFLAGS) \ -o$(BIN_DIR)/$(TOOL_TARGET) \ - $(PIPELINE) $(OPT) -DTOOLMODE $(REP) + $(PIPELINE) $(NVTENSOR) $(OPT) -DTOOLMODE $(REP) odock: check-env-all kernels link-code @echo "Building" $(TARGET) "..." @@ -201,7 +204,7 @@ odock: check-env-all kernels link-code $(CFLAGS) \ $(LIB_CUDA) \ -o$(BIN_DIR)/$(TARGET) \ - $(DEV) $(NWI) $(PIPELINE) $(OPT) $(DD) $(REP) $(KFLAGS) + $(DEV) $(NWI) $(PIPELINE) $(NVTENSOR) $(OPT) $(DD) $(REP) $(KFLAGS) # Example # 1ac8: for testing gradients of translation and rotation genes diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index 54e00936..c563356f 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -642,18 +642,7 @@ __device__ void gpu_calc_energrad( torque_rot.z += tr.z; } - // Do a reduction over the total gradient containing prepared "gradient_intra_*" values -// REDUCEFLOATSUM(torque_rot.x, pFloatAccumulator); -// REDUCEFLOATSUM(torque_rot.y, pFloatAccumulator); -// REDUCEFLOATSUM(torque_rot.z, pFloatAccumulator); - - // TODO - // ------------------------------------------------------- - // Obtaining energy and translation-related gradients - // ------------------------------------------------------- - // reduction over partial energies and prepared "gradient_intra_*" values -// REDUCEFLOATSUM(energy, pFloatAccumulator); - +#ifdef USE_NVTENSOR /* Begin: Reduction using tensor units */ // Implementation based on M.Sc. 
thesis by Gabin Schieffer at KTH: @@ -678,14 +667,26 @@ __device__ void gpu_calc_energrad( energy = __half2float(data_to_be_reduced[3]); /* End: Reduction using tensor units */ +#else + // Reduction over the total gradient containing prepared "gradient_intra_*" values + REDUCEFLOATSUM(torque_rot.x, pFloatAccumulator); + REDUCEFLOATSUM(torque_rot.y, pFloatAccumulator); + REDUCEFLOATSUM(torque_rot.z, pFloatAccumulator); + + // Reduction over partial energies and prepared "gradient_intra_*" values + REDUCEFLOATSUM(energy, pFloatAccumulator); +#endif + + // TODO + // ------------------------------------------------------- + // Obtaining energy and translation-related gradients + // ------------------------------------------------------- #if defined (DEBUG_ENERGY_KERNEL) REDUCEFLOATSUM(intraE, pFloatAccumulator); #endif -// REDUCEFLOATSUM(gx, pFloatAccumulator); -// REDUCEFLOATSUM(gy, pFloatAccumulator); -// REDUCEFLOATSUM(gz, pFloatAccumulator); +#ifdef USE_NVTENSOR /* Begin: Reduction using tensor units */ // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: @@ -707,6 +708,11 @@ __device__ void gpu_calc_energrad( gz = __half2float(data_to_be_reduced[2]); /* End: Reduction using tensor units */ +#else + REDUCEFLOATSUM(gx, pFloatAccumulator); + REDUCEFLOATSUM(gy, pFloatAccumulator); + REDUCEFLOATSUM(gz, pFloatAccumulator); +#endif global_energy = energy; #ifndef FLOAT_GRADIENTS diff --git a/cuda/kernels.cu b/cuda/kernels.cu index 894ccd62..ba4fccc5 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -93,6 +93,7 @@ __device__ inline int64_t ullitolli(uint64_t u) #define ATOMICADDF32(pAccumulator, value) atomicAdd(pAccumulator, (value)) #define ATOMICSUBF32(pAccumulator, value) atomicAdd(pAccumulator, -(value)) +#ifdef USE_NVTENSOR /* Begin: Reduction using tensor units */ // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: @@ -240,6 +241,7 @@ __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { } /* End: Reduction using tensor units */ +#endif #define REDUCEFLOATSUM(value, pAccumulator) \ if (threadIdx.x == 0) \ diff --git a/host/src/performdocking.cpp.Cuda b/host/src/performdocking.cpp.Cuda index cf65d33c..29e02e81 100644 --- a/host/src/performdocking.cpp.Cuda +++ b/host/src/performdocking.cpp.Cuda @@ -584,21 +584,23 @@ parameters argc and argv: uint32_t kernel7_gxsize = 0; uint32_t kernel7_lxsize = threadsPerBlock; -/* Begin: Reduction using tensor units */ + #ifdef USE_NVTENSOR + /* Begin: Reduction using tensor units */ -// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: -// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" -// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: + // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" + // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf -// threadsPerBlock should be at least 64 for the proposed matrix-based method to work. -// Reason: -// "A 256-element matrix is used to store the values to be reduced, -// and each thread holds exactly four values, -// which results in a minimum of 64 threads to fill a single matrix." + // threadsPerBlock should be at least 64 for the proposed matrix-based method to work. 
+ // Reason: + // "A 256-element matrix is used to store the values to be reduced, + // and each thread holds exactly four values, + // which results in a minimum of 64 threads to fill a single matrix." -assert(("Matrix-based reductions in ADADELTA: threadsPerBlock should be at least 64!", kernel7_lxsize >= 64)); + assert(("Matrix-based reductions in ADADELTA: threadsPerBlock should be at least 64!", kernel7_lxsize >= 64)); -/* End: Reduction using tensor units */ + /* End: Reduction using tensor units */ + #endif uint32_t kernel8_gxsize = 0; uint32_t kernel8_lxsize = threadsPerBlock; From b2ab3fe6b79e08b7c1af5f11233070ac6c42d025 Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Fri, 16 Feb 2024 14:21:54 -0500 Subject: [PATCH 05/12] Enabling usage of WMMA Extension for single precision matmul using Tensor Cores and error correction (TCEC). --- .gitmodules | 3 ++ Makefile.Cuda | 4 ++ cuda/calcMergeEneGra.cu | 28 +++++++++++++ cuda/kernels.cu | 92 +++++++++++++++++++++++++++++++++++++---- wmma_extension | 1 + 5 files changed, 120 insertions(+), 8 deletions(-) create mode 100644 .gitmodules create mode 160000 wmma_extension diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..0bdb553c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "wmma_extension"] + path = wmma_extension + url = https://github.com/wmmae/wmma_extension diff --git a/Makefile.Cuda b/Makefile.Cuda index 42018021..31b9ba7b 100644 --- a/Makefile.Cuda +++ b/Makefile.Cuda @@ -56,6 +56,10 @@ endif ifeq ($(TENSOR), ON) NVTENSOR=-DUSE_NVTENSOR + + ifeq ($(TCEC), ON) + NVTENSOR+=-I./wmma_extension/include -DUSE_TCEC + endif endif BIN := $(wildcard $(TARGET)*) diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index c563356f..94e2a10b 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -651,20 +651,35 @@ __device__ void gpu_calc_energrad( // 1. Convert data-to-be-reduced from float to half // and place it in a shared memory array + #ifdef USE_TCEC + __shared__ __align__(256) float data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; + data_to_be_reduced[4*threadIdx.x] = torque_rot.x; + data_to_be_reduced[4*threadIdx.x + 1] = torque_rot.y; + data_to_be_reduced[4*threadIdx.x + 2] = torque_rot.z; + data_to_be_reduced[4*threadIdx.x + 3] = energy; + #else __shared__ __align__(256) half data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; data_to_be_reduced[4*threadIdx.x] = __float2half(torque_rot.x); data_to_be_reduced[4*threadIdx.x + 1] = __float2half(torque_rot.y); data_to_be_reduced[4*threadIdx.x + 2] = __float2half(torque_rot.z); data_to_be_reduced[4*threadIdx.x + 3] = __float2half(energy); + #endif // 2. Perform reduction via tensor units reduce_via_tensor_units(data_to_be_reduced); // 3. Retrieve results from shared memory + #ifdef USE_TCEC + torque_rot.x = data_to_be_reduced[0]; + torque_rot.y = data_to_be_reduced[1]; + torque_rot.z = data_to_be_reduced[2]; + energy = data_to_be_reduced[3]; + #else torque_rot.x = __half2float(data_to_be_reduced[0]); torque_rot.y = __half2float(data_to_be_reduced[1]); torque_rot.z = __half2float(data_to_be_reduced[2]); energy = __half2float(data_to_be_reduced[3]); + #endif /* End: Reduction using tensor units */ #else @@ -695,17 +710,30 @@ __device__ void gpu_calc_energrad( // 1. 
Convert data-to-be-reduced from float to half // and place it in a shared memory array + + #ifdef USE_TCEC + data_to_be_reduced[4*threadIdx.x] = gx; + data_to_be_reduced[4*threadIdx.x + 1] = gy; + data_to_be_reduced[4*threadIdx.x + 2] = gz; + #else data_to_be_reduced[4*threadIdx.x] = __float2half(gx); data_to_be_reduced[4*threadIdx.x + 1] = __float2half(gy); data_to_be_reduced[4*threadIdx.x + 2] = __float2half(gz); + #endif // 2. Perform reduction via tensor units reduce_via_tensor_units(data_to_be_reduced); // 3. Retrieve results from shared memory + #ifdef USE_TCEC + gx = data_to_be_reduced[0]; + gy = data_to_be_reduced[1]; + gz = data_to_be_reduced[2]; + #else gx = __half2float(data_to_be_reduced[0]); gy = __half2float(data_to_be_reduced[1]); gz = __half2float(data_to_be_reduced[2]); + #endif /* End: Reduction using tensor units */ #else diff --git a/cuda/kernels.cu b/cuda/kernels.cu index ba4fccc5..689aa9fa 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -106,6 +106,17 @@ __device__ inline int64_t ullitolli(uint64_t u) */ #include + #ifdef USE_TCEC + /* + * WMMA Extension for single precision matmul using Tensor Cores + * and error correction technique (TCEC) + * https://github.com/wmmae/wmma_extension/blob/main/docs/mma_f32.md + */ + #include + + using tf32 = nvcuda::wmma::precision::tf32; + #endif + /* * Tensor Cores * https://developer.nvidia.com/blog/programming-tensor-cores-cuda-9 @@ -124,21 +135,36 @@ constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension -// Half constants -// CUDART_ONE_FP16 was not recognized by the NVCC compiler -// So its value is indicated explicitly -// https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS.html#group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS -#define HALF_ONE __ushort_as_half((unsigned short)0x3C00U) -#define HALF_ZERO __ushort_as_half((unsigned short)0x0000U) +#ifndef USE_TCEC + // Half constants + // CUDART_ONE_FP16 was not recognized by the NVCC compiler + // So its value is indicated explicitly + // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS.html#group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS + #define HALF_ONE __ushort_as_half((unsigned short)0x3C00U) + #define HALF_ZERO __ushort_as_half((unsigned short)0x0000U) +#endif +#ifdef USE_TCEC +__device__ void fill_Q(float *Q_data) { +#else __device__ void fill_Q(half *Q_data) { +#endif + #ifdef USE_TCEC + float I4[16] = { + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 1.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + }; + #else half I4[16] = { HALF_ONE, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ONE, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ONE, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ONE }; + #endif /* // Naive implementation: a single thread fills data in @@ -194,47 +220,97 @@ __device__ void fill_Q(half *Q_data) { */ } +#ifdef USE_TCEC +__device__ void reduce_via_tensor_units(float *data_to_be_reduced) { +#else __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { - +#endif __syncthreads(); if (threadIdx.x <= 31) { // Only one warp performs reduction + #ifdef USE_TCEC + __shared__ __align__ (256) float Q_data[TILE_SIZE]; + #else __shared__ __align__ (256) half Q_data[TILE_SIZE]; + #endif fill_Q(Q_data); + #ifdef USE_TCEC + __shared__ __align__ (256) float tmp[TILE_SIZE]; + #else 
__shared__ __align__ (256) half tmp[TILE_SIZE]; + #endif // Declaring and filling fragments - Those are *not* shared + + #ifdef USE_TCEC + mtk::wmma::tcec::fragment frag_P; + mtk::wmma::tcec::fragment frag_V; + #else wmma::fragment frag_P; wmma::fragment frag_V; + #endif + #ifdef USE_TCEC + mtk::wmma::tcec::fragment frag_Q; + mtk::wmma::tcec::fragment frag_W; + mtk::wmma::tcec::fragment frag_C; + #else wmma::fragment frag_Q; wmma::fragment frag_W; wmma::fragment frag_C; - + #endif + + #ifdef USE_TCEC + mtk::wmma::tcec::fill_fragment(frag_P, 1.0f); // P: only ones + mtk::wmma::tcec::fill_fragment(frag_V, 0.0f); // Output: initialize to zeros + mtk::wmma::tcec::fill_fragment(frag_C, 0.0f); // Final result + mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_data, 16); + #else wmma::fill_fragment(frag_P, HALF_ONE); // P: only ones wmma::fill_fragment(frag_V, HALF_ZERO); // Output: initialize to zeros wmma::fill_fragment(frag_C, HALF_ZERO); // Final result wmma::load_matrix_sync(frag_Q, Q_data, 16); + #endif // 1. Accumulate the values: V <- AP + V for(uint i = 0; i < (4 * NUM_OF_THREADS_PER_BLOCK)/TILE_SIZE; i++){ const unsigned int offset = i * TILE_SIZE; + + #ifdef USE_TCEC + mtk::wmma::tcec::fragment frag_A; + mtk::wmma::tcec::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); + mtk::wmma::tcec::mma_sync(frag_V, frag_A, frag_P, frag_V); + #else wmma::fragment frag_A; wmma::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); wmma::mma_sync(frag_V, frag_A, frag_P, frag_V); + #endif } // W <- V (required since we need V as a "wmma::matrix_b") + #ifdef USE_TCEC + mtk::wmma::tcec::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); + mtk::wmma::tcec::load_matrix_sync(frag_W, tmp, 16); + #else wmma::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); wmma::load_matrix_sync(frag_W, tmp, 16); + #endif // 2. Perform line sum: C <- QW + C (zero) + #ifdef USE_TCEC + mtk::wmma::tcec::mma_sync(frag_C, frag_Q, frag_W, frag_C); + #else wmma::mma_sync(frag_C, frag_Q, frag_W, frag_C); + #endif // 3. Store result in shared memory + #ifdef USE_TCEC + mtk::wmma::tcec::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); + #else wmma::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); + #endif } __syncthreads(); diff --git a/wmma_extension b/wmma_extension new file mode 160000 index 00000000..7175b5f4 --- /dev/null +++ b/wmma_extension @@ -0,0 +1 @@ +Subproject commit 7175b5f4feea2a16b8d5b3045a357db010583c5e From 0ec50e23bbd88c0a94bc8aa1b153b629bfaf9bc7 Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Fri, 19 Jul 2024 07:53:35 +0200 Subject: [PATCH 06/12] Starting to remove non-TCEC code. 
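Note (not part of this patch): with the half-precision fallback on its way out, the tensor-core path becomes the only block-wide reduction in gpu_calc_energrad. One way to sanity-check it during bring-up is to compare its output against a conventional shuffle-plus-atomics reduction. The sketch below is only such a validation aid; the function name, the float4 packing of (torque x/y/z, energy), and the caller-managed accumulator array are illustrative assumptions.

// Illustrative cross-check: the same four block-wide sums computed without
// tensor cores. Assumes blockDim.x is a multiple of 32 and that the caller
// zeroed the four accumulators in *out and synchronized beforehand.
__device__ void reduce_reference(float4 v, float* out)
{
    // Reduce each of the four components within the warp
    for (int offset = 16; offset > 0; offset >>= 1) {
        v.x += __shfl_down_sync(0xffffffffu, v.x, offset);
        v.y += __shfl_down_sync(0xffffffffu, v.y, offset);
        v.z += __shfl_down_sync(0xffffffffu, v.z, offset);
        v.w += __shfl_down_sync(0xffffffffu, v.w, offset);
    }
    // Lane 0 of each warp contributes its partial sums
    if ((threadIdx.x & 31) == 0) {
        atomicAdd(&out[0], v.x);
        atomicAdd(&out[1], v.y);
        atomicAdd(&out[2], v.z);
        atomicAdd(&out[3], v.w);
    }
    __syncthreads(); // out[0..3] now hold the block-wide sums
}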
--- cuda/calcMergeEneGra.cu | 19 ----------------- cuda/kernels.cu | 47 ----------------------------------------- 2 files changed, 66 deletions(-) diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index 94e2a10b..f86ea35f 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -657,12 +657,6 @@ __device__ void gpu_calc_energrad( data_to_be_reduced[4*threadIdx.x + 1] = torque_rot.y; data_to_be_reduced[4*threadIdx.x + 2] = torque_rot.z; data_to_be_reduced[4*threadIdx.x + 3] = energy; - #else - __shared__ __align__(256) half data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; - data_to_be_reduced[4*threadIdx.x] = __float2half(torque_rot.x); - data_to_be_reduced[4*threadIdx.x + 1] = __float2half(torque_rot.y); - data_to_be_reduced[4*threadIdx.x + 2] = __float2half(torque_rot.z); - data_to_be_reduced[4*threadIdx.x + 3] = __float2half(energy); #endif // 2. Perform reduction via tensor units @@ -674,11 +668,6 @@ __device__ void gpu_calc_energrad( torque_rot.y = data_to_be_reduced[1]; torque_rot.z = data_to_be_reduced[2]; energy = data_to_be_reduced[3]; - #else - torque_rot.x = __half2float(data_to_be_reduced[0]); - torque_rot.y = __half2float(data_to_be_reduced[1]); - torque_rot.z = __half2float(data_to_be_reduced[2]); - energy = __half2float(data_to_be_reduced[3]); #endif /* End: Reduction using tensor units */ @@ -715,10 +704,6 @@ __device__ void gpu_calc_energrad( data_to_be_reduced[4*threadIdx.x] = gx; data_to_be_reduced[4*threadIdx.x + 1] = gy; data_to_be_reduced[4*threadIdx.x + 2] = gz; - #else - data_to_be_reduced[4*threadIdx.x] = __float2half(gx); - data_to_be_reduced[4*threadIdx.x + 1] = __float2half(gy); - data_to_be_reduced[4*threadIdx.x + 2] = __float2half(gz); #endif // 2. Perform reduction via tensor units @@ -729,10 +714,6 @@ __device__ void gpu_calc_energrad( gx = data_to_be_reduced[0]; gy = data_to_be_reduced[1]; gz = data_to_be_reduced[2]; - #else - gx = __half2float(data_to_be_reduced[0]); - gy = __half2float(data_to_be_reduced[1]); - gz = __half2float(data_to_be_reduced[2]); #endif /* End: Reduction using tensor units */ diff --git a/cuda/kernels.cu b/cuda/kernels.cu index 689aa9fa..e8b721bb 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -135,19 +135,8 @@ constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension -#ifndef USE_TCEC - // Half constants - // CUDART_ONE_FP16 was not recognized by the NVCC compiler - // So its value is indicated explicitly - // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS.html#group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS - #define HALF_ONE __ushort_as_half((unsigned short)0x3C00U) - #define HALF_ZERO __ushort_as_half((unsigned short)0x0000U) -#endif - #ifdef USE_TCEC __device__ void fill_Q(float *Q_data) { -#else -__device__ void fill_Q(half *Q_data) { #endif #ifdef USE_TCEC @@ -157,13 +146,6 @@ __device__ void fill_Q(half *Q_data) { 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f }; - #else - half I4[16] = { - HALF_ONE, HALF_ZERO, HALF_ZERO, HALF_ZERO, - HALF_ZERO, HALF_ONE, HALF_ZERO, HALF_ZERO, - HALF_ZERO, HALF_ZERO, HALF_ONE, HALF_ZERO, - HALF_ZERO, HALF_ZERO, HALF_ZERO, HALF_ONE - }; #endif /* @@ -222,24 +204,18 @@ __device__ void fill_Q(half *Q_data) { #ifdef USE_TCEC __device__ void reduce_via_tensor_units(float *data_to_be_reduced) { -#else -__device__ void reduce_via_tensor_units(half 
*data_to_be_reduced) { #endif __syncthreads(); if (threadIdx.x <= 31) { // Only one warp performs reduction #ifdef USE_TCEC __shared__ __align__ (256) float Q_data[TILE_SIZE]; - #else - __shared__ __align__ (256) half Q_data[TILE_SIZE]; #endif fill_Q(Q_data); #ifdef USE_TCEC __shared__ __align__ (256) float tmp[TILE_SIZE]; - #else - __shared__ __align__ (256) half tmp[TILE_SIZE]; #endif // Declaring and filling fragments - Those are *not* shared @@ -247,19 +223,12 @@ __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { #ifdef USE_TCEC mtk::wmma::tcec::fragment frag_P; mtk::wmma::tcec::fragment frag_V; - #else - wmma::fragment frag_P; - wmma::fragment frag_V; #endif #ifdef USE_TCEC mtk::wmma::tcec::fragment frag_Q; mtk::wmma::tcec::fragment frag_W; mtk::wmma::tcec::fragment frag_C; - #else - wmma::fragment frag_Q; - wmma::fragment frag_W; - wmma::fragment frag_C; #endif #ifdef USE_TCEC @@ -267,11 +236,6 @@ __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { mtk::wmma::tcec::fill_fragment(frag_V, 0.0f); // Output: initialize to zeros mtk::wmma::tcec::fill_fragment(frag_C, 0.0f); // Final result mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_data, 16); - #else - wmma::fill_fragment(frag_P, HALF_ONE); // P: only ones - wmma::fill_fragment(frag_V, HALF_ZERO); // Output: initialize to zeros - wmma::fill_fragment(frag_C, HALF_ZERO); // Final result - wmma::load_matrix_sync(frag_Q, Q_data, 16); #endif // 1. Accumulate the values: V <- AP + V @@ -282,10 +246,6 @@ __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { mtk::wmma::tcec::fragment frag_A; mtk::wmma::tcec::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); mtk::wmma::tcec::mma_sync(frag_V, frag_A, frag_P, frag_V); - #else - wmma::fragment frag_A; - wmma::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); - wmma::mma_sync(frag_V, frag_A, frag_P, frag_V); #endif } @@ -293,23 +253,16 @@ __device__ void reduce_via_tensor_units(half *data_to_be_reduced) { #ifdef USE_TCEC mtk::wmma::tcec::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); mtk::wmma::tcec::load_matrix_sync(frag_W, tmp, 16); - #else - wmma::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); - wmma::load_matrix_sync(frag_W, tmp, 16); #endif // 2. Perform line sum: C <- QW + C (zero) #ifdef USE_TCEC mtk::wmma::tcec::mma_sync(frag_C, frag_Q, frag_W, frag_C); - #else - wmma::mma_sync(frag_C, frag_Q, frag_W, frag_C); #endif // 3. Store result in shared memory #ifdef USE_TCEC mtk::wmma::tcec::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); - #else - wmma::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); #endif } From 7be85ed3b2853b52552ff0d5bbb29f1f3d6ce84c Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Fri, 19 Jul 2024 08:16:47 +0200 Subject: [PATCH 07/12] Removing support for FP16. This is not needed when having TCEC. 
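Note (not part of this patch): the FP16 conversions were only needed for the plain wmma::mma_sync path, which stored and accumulated in half precision. The TCEC path keeps the operands in float: each value is split into a low-precision part that the tensor cores can consume plus a correction term, and the products of both parts are accumulated, which recovers close to FP32 accuracy. The host-side sketch below illustrates that split-and-correct idea only; it emulates TF32's 10-bit mantissa by truncation and is not how the wmma_extension library is actually implemented.

#include <cstdio>
#include <cstdint>
#include <cstring>

// Emulate rounding a float down to TF32 precision (10 explicit mantissa bits)
static float to_tf32(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof u);
    u &= 0xFFFFE000u;              // truncate the 13 least-significant mantissa bits
    std::memcpy(&x, &u, sizeof u);
    return x;
}

int main()
{
    float a = 1.000123456f, b = 0.999876543f;
    float a_hi = to_tf32(a), a_lo = to_tf32(a - a_hi);   // main value + correction term
    float b_hi = to_tf32(b), b_lo = to_tf32(b - b_hi);
    float tf32_only = a_hi * b_hi;
    float corrected = a_hi * b_hi + a_hi * b_lo + a_lo * b_hi;
    std::printf("fp32: %.9f  tf32: %.9f  corrected: %.9f\n",
                (double)(a * b), (double)tf32_only, (double)corrected);
    return 0;
}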
--- cuda/kernels.cu | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cuda/kernels.cu b/cuda/kernels.cu index e8b721bb..bc09e94e 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -100,12 +100,6 @@ __device__ inline int64_t ullitolli(uint64_t u) // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf -/* - * Half-precision support - * https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF__MISC.html - */ -#include - #ifdef USE_TCEC /* * WMMA Extension for single precision matmul using Tensor Cores @@ -195,7 +189,7 @@ __device__ void fill_Q(float *Q_data) { printf("\nQ_data"); for (uint i = 0; i < 16 * 16; i++) { if ((i % 16) == 0) {printf("\n[Row %u]: ", i/16);} - printf(" %2.2f ", __half2float(Q_data[i])); + printf(" %2.2f ", Q_data[i]); } printf("\n"); } From 0b76241d5c1faf2de0c28c0f8d74c1439ed326ed Mon Sep 17 00:00:00 2001 From: Leonardo Solis Vasquez Date: Fri, 19 Jul 2024 08:49:12 +0200 Subject: [PATCH 08/12] TCEC=ON is enabled by default, and thus, not needed to be specified as flag. --- Makefile.Cuda | 5 +---- cuda/calcMergeEneGra.cu | 9 --------- cuda/kernels.cu | 28 ---------------------------- 3 files changed, 1 insertion(+), 41 deletions(-) diff --git a/Makefile.Cuda b/Makefile.Cuda index 31b9ba7b..d8217ab5 100644 --- a/Makefile.Cuda +++ b/Makefile.Cuda @@ -56,10 +56,7 @@ endif ifeq ($(TENSOR), ON) NVTENSOR=-DUSE_NVTENSOR - - ifeq ($(TCEC), ON) - NVTENSOR+=-I./wmma_extension/include -DUSE_TCEC - endif + NVTENSOR+=-I./wmma_extension/include endif BIN := $(wildcard $(TARGET)*) diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index f86ea35f..7b19a894 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -651,24 +651,20 @@ __device__ void gpu_calc_energrad( // 1. Convert data-to-be-reduced from float to half // and place it in a shared memory array - #ifdef USE_TCEC __shared__ __align__(256) float data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; data_to_be_reduced[4*threadIdx.x] = torque_rot.x; data_to_be_reduced[4*threadIdx.x + 1] = torque_rot.y; data_to_be_reduced[4*threadIdx.x + 2] = torque_rot.z; data_to_be_reduced[4*threadIdx.x + 3] = energy; - #endif // 2. Perform reduction via tensor units reduce_via_tensor_units(data_to_be_reduced); // 3. Retrieve results from shared memory - #ifdef USE_TCEC torque_rot.x = data_to_be_reduced[0]; torque_rot.y = data_to_be_reduced[1]; torque_rot.z = data_to_be_reduced[2]; energy = data_to_be_reduced[3]; - #endif /* End: Reduction using tensor units */ #else @@ -699,22 +695,17 @@ __device__ void gpu_calc_energrad( // 1. Convert data-to-be-reduced from float to half // and place it in a shared memory array - - #ifdef USE_TCEC data_to_be_reduced[4*threadIdx.x] = gx; data_to_be_reduced[4*threadIdx.x + 1] = gy; data_to_be_reduced[4*threadIdx.x + 2] = gz; - #endif // 2. Perform reduction via tensor units reduce_via_tensor_units(data_to_be_reduced); // 3. 
Retrieve results from shared memory - #ifdef USE_TCEC gx = data_to_be_reduced[0]; gy = data_to_be_reduced[1]; gz = data_to_be_reduced[2]; - #endif /* End: Reduction using tensor units */ #else diff --git a/cuda/kernels.cu b/cuda/kernels.cu index bc09e94e..4be75d7c 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -100,16 +100,13 @@ __device__ inline int64_t ullitolli(uint64_t u) // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf - #ifdef USE_TCEC /* * WMMA Extension for single precision matmul using Tensor Cores * and error correction technique (TCEC) * https://github.com/wmmae/wmma_extension/blob/main/docs/mma_f32.md */ #include - using tf32 = nvcuda::wmma::precision::tf32; - #endif /* * Tensor Cores @@ -129,18 +126,14 @@ constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension -#ifdef USE_TCEC __device__ void fill_Q(float *Q_data) { -#endif - #ifdef USE_TCEC float I4[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f }; - #endif /* // Naive implementation: a single thread fills data in @@ -196,68 +189,47 @@ __device__ void fill_Q(float *Q_data) { */ } -#ifdef USE_TCEC __device__ void reduce_via_tensor_units(float *data_to_be_reduced) { -#endif __syncthreads(); if (threadIdx.x <= 31) { // Only one warp performs reduction - #ifdef USE_TCEC __shared__ __align__ (256) float Q_data[TILE_SIZE]; - #endif fill_Q(Q_data); - #ifdef USE_TCEC __shared__ __align__ (256) float tmp[TILE_SIZE]; - #endif // Declaring and filling fragments - Those are *not* shared - - #ifdef USE_TCEC mtk::wmma::tcec::fragment frag_P; mtk::wmma::tcec::fragment frag_V; - #endif - #ifdef USE_TCEC mtk::wmma::tcec::fragment frag_Q; mtk::wmma::tcec::fragment frag_W; mtk::wmma::tcec::fragment frag_C; - #endif - #ifdef USE_TCEC mtk::wmma::tcec::fill_fragment(frag_P, 1.0f); // P: only ones mtk::wmma::tcec::fill_fragment(frag_V, 0.0f); // Output: initialize to zeros mtk::wmma::tcec::fill_fragment(frag_C, 0.0f); // Final result mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_data, 16); - #endif // 1. Accumulate the values: V <- AP + V for(uint i = 0; i < (4 * NUM_OF_THREADS_PER_BLOCK)/TILE_SIZE; i++){ const unsigned int offset = i * TILE_SIZE; - #ifdef USE_TCEC mtk::wmma::tcec::fragment frag_A; mtk::wmma::tcec::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); mtk::wmma::tcec::mma_sync(frag_V, frag_A, frag_P, frag_V); - #endif } // W <- V (required since we need V as a "wmma::matrix_b") - #ifdef USE_TCEC mtk::wmma::tcec::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); mtk::wmma::tcec::load_matrix_sync(frag_W, tmp, 16); - #endif // 2. Perform line sum: C <- QW + C (zero) - #ifdef USE_TCEC mtk::wmma::tcec::mma_sync(frag_C, frag_Q, frag_W, frag_C); - #endif // 3. Store result in shared memory - #ifdef USE_TCEC mtk::wmma::tcec::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); - #endif } __syncthreads(); From 51012a86b4e6be9413304eb8b6924ef88be44e88 Mon Sep 17 00:00:00 2001 From: atillack Date: Tue, 23 Jul 2024 07:36:44 -0700 Subject: [PATCH 09/12] Temporarily undo changes in performdocking.cpp.Cuda to update commit. 
--- host/src/performdocking.cpp.Cuda | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/host/src/performdocking.cpp.Cuda b/host/src/performdocking.cpp.Cuda index 29e02e81..c35b12e4 100644 --- a/host/src/performdocking.cpp.Cuda +++ b/host/src/performdocking.cpp.Cuda @@ -583,25 +583,6 @@ parameters argc and argv: */ uint32_t kernel7_gxsize = 0; uint32_t kernel7_lxsize = threadsPerBlock; - - #ifdef USE_NVTENSOR - /* Begin: Reduction using tensor units */ - - // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: - // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" - // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf - - // threadsPerBlock should be at least 64 for the proposed matrix-based method to work. - // Reason: - // "A 256-element matrix is used to store the values to be reduced, - // and each thread holds exactly four values, - // which results in a minimum of 64 threads to fill a single matrix." - - assert(("Matrix-based reductions in ADADELTA: threadsPerBlock should be at least 64!", kernel7_lxsize >= 64)); - - /* End: Reduction using tensor units */ - #endif - uint32_t kernel8_gxsize = 0; uint32_t kernel8_lxsize = threadsPerBlock; if (cData.dockpars.lsearch_rate != 0.0f) { From a6e205492c154db15dc39fcffabccfdd404235d6 Mon Sep 17 00:00:00 2001 From: atillack Date: Tue, 23 Jul 2024 08:09:29 -0700 Subject: [PATCH 10/12] Added minimum NUMWI (64) and target architecture (80) for TENSOR code to Makefile. --- Makefile.Cuda | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Makefile.Cuda b/Makefile.Cuda index b1055039..9dbfe408 100644 --- a/Makefile.Cuda +++ b/Makefile.Cuda @@ -16,6 +16,13 @@ CPP = g++ UNAME := $(shell uname) TARGETS = 52 60 61 70 80 + +ifeq ($(TENSOR), ON) + NVTENSOR=-DUSE_NVTENSOR + NVTENSOR+=-I./wmma_extension/include + TARGETS = 80 +endif + CUDA_TARGETS=$(foreach target,$(TARGETS),-gencode arch=compute_$(target),code=sm_$(target)) ifeq ($(DEVICE), CPU) @@ -54,11 +61,6 @@ ifeq ($(OVERLAP), ON) PIPELINE=-DUSE_PIPELINE -fopenmp endif -ifeq ($(TENSOR), ON) - NVTENSOR=-DUSE_NVTENSOR - NVTENSOR+=-I./wmma_extension/include -endif - BIN := $(wildcard $(TARGET)*) # ------------------------------------------------------ @@ -67,6 +69,9 @@ BIN := $(wildcard $(TARGET)*) NUMWI= ifeq ($(NUMWI), 32) +ifeq ($(TENSOR), ON) +$(error NUMWI needs to be at least 64 with TENSOR=ON) +endif NWI=-DN32WI TARGET:=$(TARGET)_32wi else ifeq ($(NUMWI), 64) From d3b329954956d0815f7188655a944a94b9a18924 Mon Sep 17 00:00:00 2001 From: atillack Date: Thu, 25 Jul 2024 12:15:06 -0700 Subject: [PATCH 11/12] Optimization and cleanup of WMMA (tensor core) code. 
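Note (not part of this patch): the inlined fill writes the same matrix that fill_Q() used to build. With one warp of 32 threads each storing 8 consecutive elements, element 8*tid + i of the column-major 16x16 tile sits at row 8*(tid & 1) + i and column tid >> 1, and the condition ((i + 16 - (tid >> 1)) & 3) == 0 is exactly "row equals column modulo 4", i.e. the 4x4 block tiling of the identity. The host-side sketch below (illustrative only) checks that equivalence and that C = Q * (sum of A_i * P) really leaves the four per-slot block sums in the first column of C:

#include <cstdio>

int main()
{
    // Column-major 16x16 accessor: m[row + 16*col]
    auto idx = [](int r, int c) { return r + 16 * c; };

    // Q as the removed fill_Q() built it: Q[r][c] = 1 iff r = c (mod 4)
    float Q_ref[256] = {0}, Q_new[256] = {0};
    for (int r = 0; r < 16; ++r)
        for (int c = 0; c < 16; ++c)
            Q_ref[idx(r, c)] = ((r & 3) == (c & 3)) ? 1.0f : 0.0f;

    // Q as the new per-thread indexing writes it (32 threads, 8 elements each)
    for (unsigned tid = 0; tid < 32; ++tid) {
        const unsigned k  = tid << 3;
        const unsigned kk = 16 - (tid >> 1);
        for (unsigned i = 0; i < 8; ++i)
            Q_new[k + i] = ((i + kk) & 3) ? 0.0f : 1.0f;
    }
    for (int i = 0; i < 256; ++i)
        if (Q_ref[i] != Q_new[i]) { std::printf("Q mismatch at %d\n", i); return 1; }

    // One 16x16 tile A holds 4 values per thread: slot j of thread t at index 4*t + j
    float A[256], expected[4] = {0, 0, 0, 0};
    for (int t = 0; t < 64; ++t)
        for (int j = 0; j < 4; ++j) {
            A[4 * t + j] = 0.01f * t + j;          // arbitrary test data
            expected[j] += A[4 * t + j];
        }

    // V = A * P with P all ones: every column of V is the vector of row sums of A
    float V[256];
    for (int r = 0; r < 16; ++r) {
        float rowsum = 0.0f;
        for (int k = 0; k < 16; ++k) rowsum += A[idx(r, k)];
        for (int c = 0; c < 16; ++c) V[idx(r, c)] = rowsum;
    }

    // C = Q * V: row r of C adds up the rows of V that equal r modulo 4,
    // so rows 0..3 of column 0 hold the block-wide sums of value slots 0..3
    for (int r = 0; r < 4; ++r) {
        float c0 = 0.0f;
        for (int k = 0; k < 16; ++k) c0 += Q_ref[idx(r, k)] * V[idx(k, 0)];
        std::printf("slot %d: matrix %.4f  direct %.4f\n", r, c0, expected[r]);
    }
    return 0;
}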
--- cuda/kernels.cu | 81 ++++++------------------------------------------- 1 file changed, 10 insertions(+), 71 deletions(-) diff --git a/cuda/kernels.cu b/cuda/kernels.cu index 4be75d7c..f1560cbf 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -126,78 +126,11 @@ constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension -__device__ void fill_Q(float *Q_data) { - - float I4[16] = { - 1.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 1.0f - }; - - /* - // Naive implementation: a single thread fills data in - if (threadIdx.x == 0) { - for (uint i = 0; i < 4; i++) { // How many rows (of 4x4 blocks) are there in matrix A? - for (uint j = 0; j < 4; j++) { // How many cols (of 4x4 blocks) are there in matrix A? - for (uint ii = 0; ii < 4; ii++) { - for (uint jj = 0; jj < 4; jj++) { - Q_data[4*i + 64*j + ii + 16*jj] = I4 [4*ii + jj]; - } - } - } - } - } - */ - - // Slightly improved multi-threaded implementation - for (uint i = threadIdx.x; i < 4; i+=blockDim.x) { // How many rows (of 4x4 blocks) are there in matrix A? - for (uint j = 0; j < 4; j++) { // How many cols (of 4x4 blocks) are there in matrix A? - for (uint ii = 0; ii < 4; ii++) { - for (uint jj = 0; jj < 4; jj++) { - Q_data[4*i + 64*j + ii + 16*jj] = I4 [4*ii + jj]; - } - } - } - } - - /* - // Further improved multi-threaded implementation - // (It didn't provide significant performance improvements -> commented out) - // Fusing two outer loops into a single one - // To do that: coeffs = 4i + 64j - constexpr uint coeffs [16] = {0, 64, 128, 192, 4, 68, 132, 196, 8, 72, 136, 200, 12, 76, 140, 204}; - for (uint k = threadIdx.x; k < 16; k+=blockDim.x) { - for (uint ii = 0; ii < 4; ii++) { - for (uint jj = 0; jj < 4; jj++) { - Q_data[coeffs[k] + ii + 16*jj] = I4 [4*ii + jj]; - } - } - } - */ - - /* - // Enable this block to print matrix values - if (blockIdx.x == 0 && threadIdx.x == 0) { - printf("\nQ_data"); - for (uint i = 0; i < 16 * 16; i++) { - if ((i % 16) == 0) {printf("\n[Row %u]: ", i/16);} - printf(" %2.2f ", Q_data[i]); - } - printf("\n"); - } - */ -} - __device__ void reduce_via_tensor_units(float *data_to_be_reduced) { __syncthreads(); if (threadIdx.x <= 31) { // Only one warp performs reduction - __shared__ __align__ (256) float Q_data[TILE_SIZE]; - - fill_Q(Q_data); - - __shared__ __align__ (256) float tmp[TILE_SIZE]; + __shared__ __align__ (256) float Q_square[TILE_SIZE]; // storage for 16x16 matrix and 4x4 tiles of I4 matrix after // Declaring and filling fragments - Those are *not* shared mtk::wmma::tcec::fragment frag_P; @@ -210,7 +143,6 @@ __device__ void reduce_via_tensor_units(float *data_to_be_reduced) { mtk::wmma::tcec::fill_fragment(frag_P, 1.0f); // P: only ones mtk::wmma::tcec::fill_fragment(frag_V, 0.0f); // Output: initialize to zeros mtk::wmma::tcec::fill_fragment(frag_C, 0.0f); // Final result - mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_data, 16); // 1. 
Accumulate the values: V <- AP + V for(uint i = 0; i < (4 * NUM_OF_THREADS_PER_BLOCK)/TILE_SIZE; i++){ @@ -222,10 +154,17 @@ __device__ void reduce_via_tensor_units(float *data_to_be_reduced) { } // W <- V (required since we need V as a "wmma::matrix_b") - mtk::wmma::tcec::store_matrix_sync(tmp, frag_V, 16, wmma::mem_col_major); - mtk::wmma::tcec::load_matrix_sync(frag_W, tmp, 16); + mtk::wmma::tcec::store_matrix_sync(Q_square, frag_V, 16, wmma::mem_col_major); + mtk::wmma::tcec::load_matrix_sync(frag_W, Q_square, 16); // 2. Perform line sum: C <- QW + C (zero) + // a) create a 4x4 tiled matrix containing 4x4 identity matrix in each tile: + // - TENSOR=ON requires NUMWI to be larger than 32, so the following works and neatly gets rid of an additional function: + const unsigned int k = (threadIdx.x<<3); + const unsigned int kk = 16 - (threadIdx.x>>1); + for(uint i = 0; i < 8; i++) Q_square[k + i] = ((i + kk) & 3) ? 0.0f : 1.0f; + mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_square, 16); + // b) perform sum mtk::wmma::tcec::mma_sync(frag_C, frag_Q, frag_W, frag_C); // 3. Store result in shared memory From 0dde87bbbc36052d6c401af1dabd44140764530c Mon Sep 17 00:00:00 2001 From: atillack Date: Thu, 25 Jul 2024 12:52:12 -0700 Subject: [PATCH 12/12] Automatically set minimum tensor core compute capability 8.0 and added error message if device is below. --- Makefile | 6 +++++- host/src/performdocking.cpp | 7 +++++++ test_cuda.sh | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 3e44cdc1..3b41fa03 100755 --- a/Makefile +++ b/Makefile @@ -27,7 +27,11 @@ OVERLAP = ON ifeq ($(DEVICE), $(filter $(DEVICE),GPU CUDA)) -TARGETS_SUPPORTED := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)" "$(TARGETS)" "$(DEVICE)") +MIN_COMPUTE:=50 +ifeq ($(TENSOR), ON) +MIN_COMPUTE:=80 +endif +TARGETS_SUPPORTED := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)" "$(TARGETS)" "$(MIN_COMPUTE)") # if user specifies DEVICE=GPU the test result determines wether CUDA will be used or not ifeq ($(TARGETS_SUPPORTED),) ifeq ($(DEVICE),CUDA) diff --git a/host/src/performdocking.cpp b/host/src/performdocking.cpp index 2375386c..a6e3d661 100644 --- a/host/src/performdocking.cpp +++ b/host/src/performdocking.cpp @@ -450,6 +450,13 @@ void setup_gpu_for_docking( cudaDeviceProp props; RTERROR(cudaGetDevice(&(cData.devnum)),"ERROR in cudaGetDevice:"); RTERROR(cudaGetDeviceProperties(&props,cData.devnum),"ERROR in cudaGetDeviceProperties:"); +#ifdef USE_NVTENSOR + if(props.major < 8){ + printf("Error: Compute capability 8.0 or higher is needed for tensor core sum reductions.\n"); + printf(" Available device %s has compute capability %d.%d.\n", props.name, props.major, props.minor); + exit(-1); + } +#endif tData.device_name = (char*) malloc(strlen(props.name)+32); // make sure array is large enough to hold device number text too strcpy(tData.device_name, props.name); if(gpuCount>1) snprintf(&tData.device_name[strlen(props.name)], strlen(props.name)+32, " (#%d / %d)",cData.devnum+1,gpuCount); diff --git a/test_cuda.sh b/test_cuda.sh index 12fb3d86..e39cceb0 100755 --- a/test_cuda.sh +++ b/test_cuda.sh @@ -26,7 +26,7 @@ if [[ "$4" != "" ]]; then done TARGETS="$4" else - TARGETS=`awk -F'_' '{ if(\$2>50) print \$2 }' <<< "$TARGETS_SUPPORTED" | tr "\n" " "` + TARGETS=`awk -F'_' "{ if(\\$2>=$5) print \\$2 }" <<< "$TARGETS_SUPPORTED" | tr "\n" " "` fi printf "Compiling for targets: %s\n" "$TARGETS" >&2 cd "$script_dir"
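
Build note for the series (the exact invocation is illustrative and may need adjusting to the local setup): the wmma_extension sources are pulled in as a git submodule, TENSOR=ON selects the tensor-core reduction path, NUMWI must be at least 64, and the target device needs compute capability 8.0 or newer (TARGETS defaults to 80 when TENSOR=ON).

# Fetch the WMMA extension submodule added earlier in the series
git submodule update --init wmma_extension

# Build with tensor-core reductions enabled
make DEVICE=GPU NUMWI=64 TENSOR=ON

# Optional self-test, mirroring the example given in cuda/kernels.cu
make DEVICE=GPU NUMWI=64 TENSOR=ON TESTLS=ad test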