Skip to content

Commit

Permalink
Merge branch '44-cutoff-small-abs-values' into 'master'
Browse files Browse the repository at this point in the history
Add cu_cutoff support to gemm

See merge request mutsuki/CULiP!46
  • Loading branch information
enp1s0 committed Sep 16, 2022
2 parents 258ddfe + 1174ae1 commit d378490
Show file tree
Hide file tree
Showing 7 changed files with 119 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "src/cuda-exponent-distribution-statistics"]
path = src/cuda-exponent-distribution-statistics
url = https://github.com/enp1s0/cuda-exponent-distribution-statistics
[submodule "src/cuda-cutoff-small-abs-values"]
path = src/cuda-cutoff-small-abs-values
url = https://github.com/enp1s0/cuda-cutoff-small-abs-values
11 changes: 10 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,26 @@ if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda-exponent-distribution-statis
COMMAND_ERROR_IS_FATAL ANY)
endif()

if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda-cutoff-small-abs-values/Makefile)
execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive -- ${dir}
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
COMMAND_ERROR_IS_FATAL ANY)
endif()


foreach(library cublas)
set(lib_name culip_${library})
set(exp_lib_home ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda-exponent-distribution-statistics)
set(cutoff_lib_home ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda-cutoff-small-abs-values)
include_directories(${lib_name} PRIVATE ${exp_lib_home}/include ${exp_lib_home}/src/cutf/include)
include_directories(${lib_name} PRIVATE ${cutoff_lib_home}/include ${cutoff_lib_home}/src/cutf/include)
add_library(${lib_name} SHARED
${SRCDIR}/${library}.cu
${SRCDIR}/utils.cu
${SRCDIR}/utils.hpp
${SRCDIR}/params.hpp
${CMAKE_CURRENT_SOURCE_DIR}/src/cuda-exponent-distribution-statistics/src/main.cu
${exp_lib_home}/src/main.cu
${cutoff_lib_home}/src/main.cu
${HEADERS}
)

Expand Down
70 changes: 70 additions & 0 deletions src/cublas.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
#include <string.h>
#include <CULiP/cublas.hpp>
#include <cu_exp_statistics.hpp>
#include <cu_cutoff.hpp>
#include "utils.hpp"

#define CULIP_CUBLAS_LIBRARY_NAME "libcublas.so"
#define CULIP_CUBLAS_ENV_NAME "CULIP_CUBLAS_LIB_PATH"
#define CULIP_CUBLAS_DISABLE_ENV_NAME "CULIP_DISABLE_CUBLAS_PROFILING"
#define CULIP_EXP_STATS_ENABLE_ENV_NAME "CULIP_ENABLE_EXP_STATS"
#define CULIP_CUTOFF_THRESHOLD_ENV_NAME "CULIP_CUTOFF_THRESHOLD"

namespace {
mtk::cu_exp_statistics::result_t exp_stats(
Expand Down Expand Up @@ -48,6 +50,40 @@ mtk::cu_exp_statistics::result_t exp_stats(
}
return result;
}

void cutoff(
void* const ptr,
const std::size_t offset,
const std::size_t m,
const std::size_t n,
const std::size_t ld,
double threshold,
const cudaStream_t cuda_stream,
cudaDataType_t data_t
) {
switch (data_t) {
case CUDA_R_64F:
mtk::cu_cutoff::cutoff_small_abs_values(reinterpret_cast<double*>(ptr) + offset, m, n, ld, threshold, cuda_stream);
break;
case CUDA_R_32F:
mtk::cu_cutoff::cutoff_small_abs_values(reinterpret_cast<float*>(ptr) + offset, m, n, ld, threshold, cuda_stream);
break;
case CUDA_R_16F:
mtk::cu_cutoff::cutoff_small_abs_values(reinterpret_cast<half*>(ptr) + offset, m, n, ld, threshold, cuda_stream);
break;
case CUDA_C_64F:
mtk::cu_cutoff::cutoff_small_abs_values(reinterpret_cast<double2*>(ptr) + offset, m, n, ld, threshold, cuda_stream);
break;
case CUDA_C_32F:
mtk::cu_cutoff::cutoff_small_abs_values(reinterpret_cast<float2*>(ptr) + offset, m, n, ld, threshold, cuda_stream);
break;
case CUDA_C_16F:
mtk::cu_cutoff::cutoff_small_abs_values(reinterpret_cast<half2*>(ptr) + offset, m, n, ld, threshold, cuda_stream);
break;
default:
break;
}
}
} // unnamed namespace

extern "C" {
Expand Down Expand Up @@ -220,6 +256,22 @@ cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa,
void *C, cudaDataType_t Ctype, int ldc,
cublasComputeType_t computeType,
cublasGemmAlgo_t algo) {
const int cutoff_flag = (CULiP_profiling_control_array[CULiP_cublasGemmEx] == 0) && CULiP_is_profiling_enabled(CULIP_CUTOFF_THRESHOLD_ENV_NAME, false);
if (cutoff_flag) {
double threshold;
try {
const auto env_str = getenv(CULIP_CUTOFF_THRESHOLD_ENV_NAME);
threshold = std::stod(env_str);

cudaStream_t cuda_stream;
cublasGetStream(handle, &cuda_stream);
cutoff(const_cast<void*>(A), 0, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, threshold, cuda_stream, Atype);
cutoff(const_cast<void*>(B), 0, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, threshold, cuda_stream, Btype);
} catch(const std::exception& e) {
CULIBPROFILER_DEBUG_PRINT(printf("[CULiP Warning] invalid threshold (%s)\n", env_str));
}
}

const int profiling_flag = (CULiP_profiling_control_array[CULiP_cublasGemmEx] == 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME);

// Get the function pointer
Expand Down Expand Up @@ -457,6 +509,24 @@ cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle,
int batchCount,
cublasComputeType_t computeType,
cublasGemmAlgo_t algo) {
const int cutoff_flag = (CULiP_profiling_control_array[CULiP_cublasGemmStridedBatchedEx] == 0) && CULiP_is_profiling_enabled(CULIP_CUTOFF_THRESHOLD_ENV_NAME, false);
if (cutoff_flag) {
double threshold;
try {
const auto env_str = getenv(CULIP_CUTOFF_THRESHOLD_ENV_NAME);
threshold = std::stod(env_str);

cudaStream_t cuda_stream;
cublasGetStream(handle, &cuda_stream);
for (std::uint32_t i = 0; i < batchCount; i++) {
cutoff(const_cast<void*>(A), i * strideA, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, threshold, cuda_stream, Atype);
cutoff(const_cast<void*>(B), i * strideB, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, threshold, cuda_stream, Btype);
}
} catch(const std::exception& e) {
CULIBPROFILER_DEBUG_PRINT(printf("[CULiP Warning] invalid threshold (%s)\n", env_str));
}
}

const int profiling_flag = (CULiP_profiling_control_array[CULiP_cublasGemmStridedBatchedEx] == 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME);

// Get the function pointer
Expand Down
16 changes: 16 additions & 0 deletions src/cublas.gemm.template.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,22 @@ cublasStatus_t CULIP_FUNC_NAME(cublasHandle_t handle, cublasOperation_t transa,
const CULIP_TYPE *alpha, const CULIP_TYPE *A, int lda,
const CULIP_TYPE *B, int ldb, const CULIP_TYPE *beta, CULIP_TYPE *C,
int ldc) {
const int cutoff_flag = (CULiP_profiling_control_array[CULIP_FUNC_ENUM_NAME] == 0) && CULiP_is_profiling_enabled(CULIP_CUTOFF_THRESHOLD_ENV_NAME, false);
if (cutoff_flag) {
double threshold;
try {
const auto env_str = getenv(CULIP_CUTOFF_THRESHOLD_ENV_NAME);
threshold = std::stod(env_str);

cudaStream_t cuda_stream;
cublasGetStream(handle, &cuda_stream);
mtk::cu_cutoff::cutoff_small_abs_values(const_cast<CULIP_TYPE*>(A), (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, threshold, cuda_stream);
mtk::cu_cutoff::cutoff_small_abs_values(const_cast<CULIP_TYPE*>(B), (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, threshold, cuda_stream);
} catch(const std::exception& e) {
CULIBPROFILER_DEBUG_PRINT(printf("[CULiP Warning] invalid threshold (%s)\n", env_str));
}
}

const int profiling_flag = (CULiP_profiling_control_array[CULIP_FUNC_ENUM_NAME] == 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME);

// Get the function pointer
Expand Down
18 changes: 18 additions & 0 deletions src/cublas.gemm_strided_batched.template.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,24 @@ cublasStatus_t CULIP_FUNC_NAME (cublasHandle_t handle,
int ldc,
long long int strideC,
int batchCount) {
const int cutoff_flag = (CULiP_profiling_control_array[CULIP_FUNC_ENUM_NAME] == 0) && CULiP_is_profiling_enabled(CULIP_CUTOFF_THRESHOLD_ENV_NAME, false);
if (cutoff_flag) {
double threshold;
try {
const auto env_str = getenv(CULIP_CUTOFF_THRESHOLD_ENV_NAME);
threshold = std::stod(env_str);

cudaStream_t cuda_stream;
cublasGetStream(handle, &cuda_stream);
for (std::uint32_t i = 0; i < batchCount; i++) {
mtk::cu_cutoff::cutoff_small_abs_values(const_cast<CULIP_TYPE*>(A + i * strideA), (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, threshold, cuda_stream);
mtk::cu_cutoff::cutoff_small_abs_values(const_cast<CULIP_TYPE*>(B + i * strideB), (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, threshold, cuda_stream);
}
} catch(const std::exception& e) {
CULIBPROFILER_DEBUG_PRINT(printf("[CULiP Warning] invalid threshold (%s)\n", env_str));
}
}

const int profiling_flag = (CULiP_profiling_control_array[CULIP_FUNC_ENUM_NAME] == 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME);

// Get the function pointer
Expand Down
1 change: 1 addition & 0 deletions src/cuda-cutoff-small-abs-values
2 changes: 1 addition & 1 deletion src/cuda-exponent-distribution-statistics

0 comments on commit d378490

Please sign in to comment.