Skip to content

Commit

Permalink
Merge pull request #252 from L30nardoSV/tensorcores
Browse files Browse the repository at this point in the history
Speeding up sum reductions in ADADELTA by using Tensor Cores
  • Loading branch information
atillack authored Nov 19, 2024
2 parents a0c8c95 + 0dde87b commit e63e6f6
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 9 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "wmma_extension"]
path = wmma_extension
url = https://github.com/wmmae/wmma_extension
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@
OVERLAP = ON

ifeq ($(DEVICE), $(filter $(DEVICE),GPU CUDA))
TARGETS_SUPPORTED := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)" "$(TARGETS)" "$(DEVICE)")
MIN_COMPUTE:=50
ifeq ($(TENSOR), ON)
MIN_COMPUTE:=80
endif
TARGETS_SUPPORTED := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)" "$(TARGETS)" "$(MIN_COMPUTE)")
# if user specifies DEVICE=GPU the test result determines wether CUDA will be used or not
ifeq ($(TARGETS_SUPPORTED),)
ifeq ($(DEVICE),CUDA)
Expand Down
17 changes: 13 additions & 4 deletions Makefile.Cuda
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ CPP = g++
UNAME := $(shell uname)

TARGETS = 52 60 61 70 80

ifeq ($(TENSOR), ON)
NVTENSOR=-DUSE_NVTENSOR
NVTENSOR+=-I./wmma_extension/include
TARGETS = 80
endif

CUDA_TARGETS=$(foreach target,$(TARGETS),-gencode arch=compute_$(target),code=sm_$(target))

ifeq ($(DEVICE), CPU)
Expand Down Expand Up @@ -54,7 +61,6 @@ ifeq ($(OVERLAP), ON)
PIPELINE=-DUSE_PIPELINE -fopenmp
endif


BIN := $(wildcard $(TARGET)*)

# ------------------------------------------------------
Expand All @@ -63,6 +69,9 @@ BIN := $(wildcard $(TARGET)*)
NUMWI=

ifeq ($(NUMWI), 32)
ifeq ($(TENSOR), ON)
$(error NUMWI needs to be at least 64 with TENSOR=ON)
endif
NWI=-DN32WI
TARGET:=$(TARGET)_32wi
else ifeq ($(NUMWI), 64)
Expand Down Expand Up @@ -177,15 +186,15 @@ TOOL_CFLAGS+=-DVERSION=\"$(GIT_VERSION)\"
# ------------------------------------------------------

kernels: $(KERNEL_SRC)
$(NVCC) $(NWI) $(REP) $(CUDA_FLAGS) $(IFLAGS) $(CUDA_INCLUDES) -c $(KRNL_DIR)/kernels.cu
$(NVCC) $(NWI) $(REP) $(CUDA_FLAGS) $(IFLAGS) $(CUDA_INCLUDES) $(NVTENSOR) -c $(KRNL_DIR)/kernels.cu

otool:
@echo "Building" $(TOOL_TARGET) "..."
$(CPP) \
$(shell ls $(HOST_SRC_DIR)/*.cpp) \
$(TOOL_CFLAGS) \
-o$(BIN_DIR)/$(TOOL_TARGET) \
$(PIPELINE) $(OPT) -DTOOLMODE $(REP)
$(PIPELINE) $(NVTENSOR) $(OPT) -DTOOLMODE $(REP)

odock: check-env-all kernels
@echo "Building" $(TARGET) "..."
Expand All @@ -194,7 +203,7 @@ odock: check-env-all kernels
$(CFLAGS) \
$(LIB_CUDA) \
-o$(BIN_DIR)/$(TARGET) \
$(DEV) $(NWI) $(PIPELINE) $(OPT) $(DD) $(REP) $(KFLAGS)
$(DEV) $(NWI) $(PIPELINE) $(NVTENSOR) $(OPT) $(DD) $(REP) $(KFLAGS)

# Example
# 1ac8: for testing gradients of translation and rotation genes
Expand Down
60 changes: 57 additions & 3 deletions cuda/calcMergeEneGra.cu
Original file line number Diff line number Diff line change
Expand Up @@ -642,23 +642,77 @@ __device__ void gpu_calc_energrad(
torque_rot.z += tr.z;
}

// Do a reduction over the total gradient containing prepared "gradient_intra_*" values
#ifdef USE_NVTENSOR
/* Begin: Reduction using tensor units */

// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH:
// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx"
// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf

// 1. Convert data-to-be-reduced from float to half
// and place it in a shared memory array
__shared__ __align__(256) float data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK];
data_to_be_reduced[4*threadIdx.x] = torque_rot.x;
data_to_be_reduced[4*threadIdx.x + 1] = torque_rot.y;
data_to_be_reduced[4*threadIdx.x + 2] = torque_rot.z;
data_to_be_reduced[4*threadIdx.x + 3] = energy;

// 2. Perform reduction via tensor units
reduce_via_tensor_units(data_to_be_reduced);

// 3. Retrieve results from shared memory
torque_rot.x = data_to_be_reduced[0];
torque_rot.y = data_to_be_reduced[1];
torque_rot.z = data_to_be_reduced[2];
energy = data_to_be_reduced[3];

/* End: Reduction using tensor units */
#else
// Reduction over the total gradient containing prepared "gradient_intra_*" values
REDUCEFLOATSUM(torque_rot.x, pFloatAccumulator);
REDUCEFLOATSUM(torque_rot.y, pFloatAccumulator);
REDUCEFLOATSUM(torque_rot.z, pFloatAccumulator);

// Reduction over partial energies and prepared "gradient_intra_*" values
REDUCEFLOATSUM(energy, pFloatAccumulator);
#endif

// TODO
// -------------------------------------------------------
// Obtaining energy and translation-related gradients
// -------------------------------------------------------
// reduction over partial energies and prepared "gradient_intra_*" values
REDUCEFLOATSUM(energy, pFloatAccumulator);

#if defined (DEBUG_ENERGY_KERNEL)
REDUCEFLOATSUM(intraE, pFloatAccumulator);
#endif

#ifdef USE_NVTENSOR
/* Begin: Reduction using tensor units */

// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH:
// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx"
// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf

// 1. Convert data-to-be-reduced from float to half
// and place it in a shared memory array
data_to_be_reduced[4*threadIdx.x] = gx;
data_to_be_reduced[4*threadIdx.x + 1] = gy;
data_to_be_reduced[4*threadIdx.x + 2] = gz;

// 2. Perform reduction via tensor units
reduce_via_tensor_units(data_to_be_reduced);

// 3. Retrieve results from shared memory
gx = data_to_be_reduced[0];
gy = data_to_be_reduced[1];
gz = data_to_be_reduced[2];

/* End: Reduction using tensor units */
#else
REDUCEFLOATSUM(gx, pFloatAccumulator);
REDUCEFLOATSUM(gy, pFloatAccumulator);
REDUCEFLOATSUM(gz, pFloatAccumulator);
#endif

global_energy = energy;
#ifndef FLOAT_GRADIENTS
Expand Down
84 changes: 84 additions & 0 deletions cuda/kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,90 @@ __device__ inline int64_t ullitolli(uint64_t u)
#define ATOMICADDF32(pAccumulator, value) atomicAdd(pAccumulator, (value))
#define ATOMICSUBF32(pAccumulator, value) atomicAdd(pAccumulator, -(value))

#ifdef USE_NVTENSOR
/* Begin: Reduction using tensor units */

// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH:
// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx"
// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf

/*
* WMMA Extension for single precision matmul using Tensor Cores
* and error correction technique (TCEC)
* https://github.com/wmmae/wmma_extension/blob/main/docs/mma_f32.md
*/
#include <wmma_extension/tcec/tcec.hpp>
using tf32 = nvcuda::wmma::precision::tf32;

/*
* Tensor Cores
* https://developer.nvidia.com/blog/programming-tensor-cores-cuda-9
*
* Don't forget to compile specifying the architecture, e.g., sm_86.
* For AutoDock-GPU, this can be done via the TARGETS option.
* make DEVICE=GPU TESTLS=ad NUMWI=64 TARGETS=86 test
* https://stackoverflow.com/a/53634598/1616865
*/
#include <mma.h>
using namespace nvcuda;

#define TILE_SIZE (16 * 16)

constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension
constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension
constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension

__device__ void reduce_via_tensor_units(float *data_to_be_reduced) {
__syncthreads();

if (threadIdx.x <= 31) { // Only one warp performs reduction
__shared__ __align__ (256) float Q_square[TILE_SIZE]; // storage for 16x16 matrix and 4x4 tiles of I4 matrix after

// Declaring and filling fragments - Those are *not* shared
mtk::wmma::tcec::fragment<wmma::matrix_b, rowscols_M, rowscols_N, rowscols_K, tf32, wmma::col_major> frag_P;
mtk::wmma::tcec::fragment<wmma::accumulator, rowscols_M, rowscols_N, rowscols_K, tf32> frag_V;

mtk::wmma::tcec::fragment<wmma::matrix_a, rowscols_M, rowscols_N, rowscols_K, tf32, wmma::col_major> frag_Q;
mtk::wmma::tcec::fragment<wmma::matrix_b, rowscols_M, rowscols_N, rowscols_K, tf32, wmma::col_major> frag_W;
mtk::wmma::tcec::fragment<wmma::accumulator, rowscols_M, rowscols_N, rowscols_K, tf32> frag_C;

mtk::wmma::tcec::fill_fragment(frag_P, 1.0f); // P: only ones
mtk::wmma::tcec::fill_fragment(frag_V, 0.0f); // Output: initialize to zeros
mtk::wmma::tcec::fill_fragment(frag_C, 0.0f); // Final result

// 1. Accumulate the values: V <- AP + V
for(uint i = 0; i < (4 * NUM_OF_THREADS_PER_BLOCK)/TILE_SIZE; i++){
const unsigned int offset = i * TILE_SIZE;

mtk::wmma::tcec::fragment<wmma::matrix_a, rowscols_M, rowscols_N, rowscols_K, tf32, wmma::col_major> frag_A;
mtk::wmma::tcec::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16);
mtk::wmma::tcec::mma_sync(frag_V, frag_A, frag_P, frag_V);
}

// W <- V (required since we need V as a "wmma::matrix_b")
mtk::wmma::tcec::store_matrix_sync(Q_square, frag_V, 16, wmma::mem_col_major);
mtk::wmma::tcec::load_matrix_sync(frag_W, Q_square, 16);

// 2. Perform line sum: C <- QW + C (zero)
// a) create a 4x4 tiled matrix containing 4x4 identity matrix in each tile:
// - TENSOR=ON requires NUMWI to be larger than 32, so the following works and neatly gets rid of an additional function:
const unsigned int k = (threadIdx.x<<3);
const unsigned int kk = 16 - (threadIdx.x>>1);
for(uint i = 0; i < 8; i++) Q_square[k + i] = ((i + kk) & 3) ? 0.0f : 1.0f;
mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_square, 16);
// b) perform sum
mtk::wmma::tcec::mma_sync(frag_C, frag_Q, frag_W, frag_C);

// 3. Store result in shared memory
mtk::wmma::tcec::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major);
}

__syncthreads();
}

/* End: Reduction using tensor units */
#endif

#define REDUCEFLOATSUM(value, pAccumulator) \
if (threadIdx.x == 0) \
{ \
Expand Down
7 changes: 7 additions & 0 deletions host/src/performdocking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,13 @@ void setup_gpu_for_docking(
cudaDeviceProp props;
RTERROR(cudaGetDevice(&(cData.devnum)),"ERROR in cudaGetDevice:");
RTERROR(cudaGetDeviceProperties(&props,cData.devnum),"ERROR in cudaGetDeviceProperties:");
#ifdef USE_NVTENSOR
if(props.major < 8){
printf("Error: Compute capability 8.0 or higher is needed for tensor core sum reductions.\n");
printf(" Available device %s has compute capability %d.%d.\n", props.name, props.major, props.minor);
exit(-1);
}
#endif
tData.device_name = (char*) malloc(strlen(props.name)+32); // make sure array is large enough to hold device number text too
strcpy(tData.device_name, props.name);
if(gpuCount>1) snprintf(&tData.device_name[strlen(props.name)], strlen(props.name)+32, " (#%d / %d)",cData.devnum+1,gpuCount);
Expand Down
2 changes: 1 addition & 1 deletion test_cuda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ if [[ "$4" != "" ]]; then
done
TARGETS="$4"
else
TARGETS=`awk -F'_' '{ if(\$2>50) print \$2 }' <<< "$TARGETS_SUPPORTED" | tr "\n" " "`
TARGETS=`awk -F'_' "{ if(\\$2>=$5) print \\$2 }" <<< "$TARGETS_SUPPORTED" | tr "\n" " "`
fi
printf "Compiling for targets: %s\n" "$TARGETS" >&2
cd "$script_dir"
Expand Down
1 change: 1 addition & 0 deletions wmma_extension
Submodule wmma_extension added at 7175b5

0 comments on commit e63e6f6

Please sign in to comment.