From 2cfea8486b7cec62fc7597e897406942aa3057e5 Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 14:55:18 +0900 Subject: [PATCH 01/12] Add exp stats in gemm --- .gitmodules | 3 +++ src/cublas.cu | 4 +++- src/cublas.gemm.template.h | 17 +++++++++++++++++ src/cuda-exponent-distribution-statistics | 1 + src/params.hpp | 1 + src/utils.cu | 7 +++++++ src/utils.hpp | 10 ++++++++++ 7 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 .gitmodules create mode 160000 src/cuda-exponent-distribution-statistics diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..7f0f9d0 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "src/cuda-exponent-distribution-statistics"] + path = src/cuda-exponent-distribution-statistics + url = https://github.com/enp1s0/cuda-exponent-distribution-statistics diff --git a/src/cublas.cu b/src/cublas.cu index 9fc4fda..b6afe71 100644 --- a/src/cublas.cu +++ b/src/cublas.cu @@ -5,11 +5,13 @@ #include #include #include +#include #include "utils.hpp" #define CULIP_CUBLAS_LIBRARY_NAME "libcublas.so" #define CULIP_CUBLAS_ENV_NAME "CULIP_CUBLAS_LIB_PATH" -#define CULIP_CUBLAS_DISABLE_ENV_NAME "CULIP_PROFILING_CUBLAS_DISABLE" +#define CULIP_CUBLAS_DISABLE_ENV_NAME "CULIP_DISABLE_CUBLAS_PROFILING" +#define CULIP_EXP_STATS_ENABLE_ENV_NAME "CULIP_ENABLE_EXP_STATS" extern "C" { // dlopen cache diff --git a/src/cublas.gemm.template.h b/src/cublas.gemm.template.h index 25dad87..183ac93 100644 --- a/src/cublas.gemm.template.h +++ b/src/cublas.gemm.template.h @@ -5,6 +5,23 @@ cublasStatus_t CULIP_FUNC_NAME(cublasHandle_t handle, cublasOperation_t transa, int ldc) { const int profiling_flag = (CULiP_profiling_control_array[CULIP_FUNC_ENUM_NAME] == 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME); + const int exp_stats_flag = CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME); + //const int exp_stats_flag = (CULiP_profiling_control_array[CULIP_EXP_STATS_ENABLE_ENV_NAME] != 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME); + if (exp_stats_flag) { + cudaStream_t cuda_stream; + cublasGetStream(handle, &cuda_stream); + CULiP_exp_stats a_stats; + CULiP_exp_stats b_stats; + snprintf(a_stats.name, a_stats.name_length - 1, "A"); + snprintf(b_stats.name, b_stats.name_length - 1, "B"); + a_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(A, (transa == CUBLAS_OP_N ? m : k), (transb == CUBLAS_OP_N ? k : m), lda, cuda_stream); + b_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(B, (transa == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream); + mtk::cu_exp_statistics::to_json(a_stats.stats); + mtk::cu_exp_statistics::to_json(b_stats.stats); + CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats); + CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&b_stats); + } + // Get the function pointer cublasStatus_t (*cublas_lib_func)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const CULIP_TYPE*, const CULIP_TYPE*, int, const CULIP_TYPE*, int, const CULIP_TYPE*, CULIP_TYPE*, int); *(void**)(&cublas_lib_func) = CULiP_get_function_pointer(CULIP_CUBLAS_LIBRARY_NAME, CULIP_CUBLAS_ENV_NAME, __func__, &CULiP_cublas_lib_handle_cache); diff --git a/src/cuda-exponent-distribution-statistics b/src/cuda-exponent-distribution-statistics new file mode 160000 index 0000000..1dee8fb --- /dev/null +++ b/src/cuda-exponent-distribution-statistics @@ -0,0 +1 @@ +Subproject commit 1dee8fb13b084b9c04983854a4b7702063854c6f diff --git a/src/params.hpp b/src/params.hpp index abecbeb..3362b1c 100644 --- a/src/params.hpp +++ b/src/params.hpp @@ -2,5 +2,6 @@ #define __CULIP_PARAMS_HPP__ #define CULIP_RESULT_PREFIX "CULiP Result" +#define CULIP_EXP_STATS_PREFIX "CULiP ExpStats" #endif diff --git a/src/utils.cu b/src/utils.cu index 70d360b..37f3e25 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -23,6 +23,13 @@ extern "C" void CULiP_print_profile_result(void *profile_result_ptr) { printf("[%s][%s] %luns\n", CULIP_RESULT_PREFIX, profile_result.function_name, elapsed_time_us); } +extern "C" void CULiP_print_exp_stats_result(void *exp_stats_result_ptr) { + const CULiP_exp_stats exp_stats_result = + *((CULiP_exp_stats *)exp_stats_result_ptr); + + printf("[%s] %s: %s\n", CULIP_EXP_STATS_PREFIX, exp_stats_result.name, mtk::cu_exp_statistics::to_json(exp_stats_result.stats).c_str()); +} + // TODO: Make this function non-blocking using `cuLauchHostFunc` extern "C" void CULiP_launch_function(cudaStream_t cuda_stream, void (*fn)(void*), void* const arg) { cudaStreamSynchronize(cuda_stream); diff --git a/src/utils.hpp b/src/utils.hpp index e144ba8..6443db8 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -1,6 +1,7 @@ #ifndef __CUDALIBPROFILER_UTILS_HPP__ #define __CUDALIBPROFILER_UTILS_HPP__ #include +#include #include "params.hpp" #ifdef CULIBPROFILER_ENABLE_DEBUG_PRINT @@ -23,7 +24,16 @@ struct CULiP_profile_result { struct timespec end_timestamp; }; +struct CULiP_exp_stats { + // name + enum {name_length = 32}; + char name[name_length] = {0}; + + mtk::cu_exp_statistics::result_t stats; +}; + extern "C" void CULiP_print_profile_result(void* profile_result_ptr); +extern "C" void CULiP_print_exp_stats_result(void* exp_stats_result_ptr); // Call a given function on a given stream extern "C" void CULiP_launch_function(cudaStream_t cuda_stream, void (*fn)(void*), void* const arg); From b853ebdbd48fe860a6a45a685c8ae719fbf28082 Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 15:36:22 +0900 Subject: [PATCH 02/12] Update env name CULIP_DISABLE_CUBLAS_PROFILING --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ad78e70..e726b73 100644 --- a/README.md +++ b/README.md @@ -47,12 +47,12 @@ Then the execution time is printed on stdout. To disable profiling at runtime, define an environment variable `CULIP_PROFILING_CUBLAS_DISABLE`. ```bash # Disable cuBLAS profiling -export CULIP_PROFILING_CUBLAS_DISABLE=1 +export CULIP_DISABLE_CUBLAS_PROFILING=1 # Enable cuBLAS profiling -export CULIP_PROFILING_CUBLAS_DISABLE=0 +export CULIP_DISABLE_CUBLAS_PROFILING=0 # or -unset CULIP_PROFILING_CUBLAS_DISABLE +unset CULIP_DISABLE_CUBLAS_PROFILING ``` ## Profiling control API From 9f44c6c8d5f0693494a209856e32c70c402c95de Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 16:39:46 +0900 Subject: [PATCH 03/12] Update exp stats for gemm --- src/cublas.gemm.template.h | 33 ++++++++++++++++----------------- src/utils.cu | 10 ++++++---- src/utils.hpp | 2 +- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/src/cublas.gemm.template.h b/src/cublas.gemm.template.h index 183ac93..ee517ed 100644 --- a/src/cublas.gemm.template.h +++ b/src/cublas.gemm.template.h @@ -5,23 +5,6 @@ cublasStatus_t CULIP_FUNC_NAME(cublasHandle_t handle, cublasOperation_t transa, int ldc) { const int profiling_flag = (CULiP_profiling_control_array[CULIP_FUNC_ENUM_NAME] == 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME); - const int exp_stats_flag = CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME); - //const int exp_stats_flag = (CULiP_profiling_control_array[CULIP_EXP_STATS_ENABLE_ENV_NAME] != 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME); - if (exp_stats_flag) { - cudaStream_t cuda_stream; - cublasGetStream(handle, &cuda_stream); - CULiP_exp_stats a_stats; - CULiP_exp_stats b_stats; - snprintf(a_stats.name, a_stats.name_length - 1, "A"); - snprintf(b_stats.name, b_stats.name_length - 1, "B"); - a_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(A, (transa == CUBLAS_OP_N ? m : k), (transb == CUBLAS_OP_N ? k : m), lda, cuda_stream); - b_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(B, (transa == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream); - mtk::cu_exp_statistics::to_json(a_stats.stats); - mtk::cu_exp_statistics::to_json(b_stats.stats); - CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats); - CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&b_stats); - } - // Get the function pointer cublasStatus_t (*cublas_lib_func)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const CULIP_TYPE*, const CULIP_TYPE*, int, const CULIP_TYPE*, int, const CULIP_TYPE*, CULIP_TYPE*, int); *(void**)(&cublas_lib_func) = CULiP_get_function_pointer(CULIP_CUBLAS_LIBRARY_NAME, CULIP_CUBLAS_ENV_NAME, __func__, &CULiP_cublas_lib_handle_cache); @@ -52,5 +35,21 @@ cublasStatus_t CULIP_FUNC_NAME(cublasHandle_t handle, cublasOperation_t transa, CULiP_launch_function(cuda_stream, &CULiP_print_profile_result, (void*)&profile_result); } + const int exp_stats_flag = (CULiP_profiling_control_array[CULIP_FUNC_ENUM_NAME] == 0) && CULiP_is_profiling_enabled(CULIP_EXP_STATS_ENABLE_ENV_NAME, false); + if (exp_stats_flag) { + cudaStream_t cuda_stream; + cublasGetStream(handle, &cuda_stream); + CULiP_exp_stats a_stats; + CULiP_exp_stats b_stats; + snprintf(a_stats.name, a_stats.name_length - 1, "A"); + snprintf(b_stats.name, b_stats.name_length - 1, "B"); + a_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(A, (transa == CUBLAS_OP_N ? m : k), (transb == CUBLAS_OP_N ? k : m), lda, cuda_stream); + b_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(B, (transa == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream); + mtk::cu_exp_statistics::to_json(a_stats.stats); + mtk::cu_exp_statistics::to_json(b_stats.stats); + CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats); + CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&b_stats); + } + return result; } diff --git a/src/utils.cu b/src/utils.cu index 37f3e25..0e4a6f1 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -21,6 +21,7 @@ extern "C" void CULiP_print_profile_result(void *profile_result_ptr) { ((long)profile_result.end_timestamp.tv_nsec - (long)profile_result.start_timestamp.tv_nsec); printf("[%s][%s] %luns\n", CULIP_RESULT_PREFIX, profile_result.function_name, elapsed_time_us); + fflush(stdout); } extern "C" void CULiP_print_exp_stats_result(void *exp_stats_result_ptr) { @@ -28,6 +29,7 @@ extern "C" void CULiP_print_exp_stats_result(void *exp_stats_result_ptr) { *((CULiP_exp_stats *)exp_stats_result_ptr); printf("[%s] %s: %s\n", CULIP_EXP_STATS_PREFIX, exp_stats_result.name, mtk::cu_exp_statistics::to_json(exp_stats_result.stats).c_str()); + fflush(stdout); } // TODO: Make this function non-blocking using `cuLauchHostFunc` @@ -68,13 +70,13 @@ extern "C" void* CULiP_get_function_pointer(const char* const library_name, cons } // Profiling status -extern "C" int CULiP_is_profiling_enabled(const char* env_name) { +extern "C" int CULiP_is_profiling_enabled(const char* env_name, const bool disable_if_set) { const char* value = getenv(env_name); if (value == NULL) { - return 1; + return disable_if_set; } if (strcmp(value, "0") == 0) { - return 0; + return disable_if_set; } - return 1; + return !disable_if_set; } diff --git a/src/utils.hpp b/src/utils.hpp index 6443db8..b1bb0e6 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -42,5 +42,5 @@ extern "C" void CULiP_launch_function(cudaStream_t cuda_stream, void (*fn)(void* extern "C" void* CULiP_get_function_pointer(const char* const library_name, const char* const env_name, const char* const function_name, void** CULiP_haldle_cache); // Profiling status -extern "C" int CULiP_is_profiling_enabled(const char* env_name); +extern "C" int CULiP_is_profiling_enabled(const char* env_name, const bool disable_if_set = true); #endif From 16fca1110f94ead138786a8c0bb85fc0cf99f080 Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 19:30:55 +0900 Subject: [PATCH 04/12] Update submodule cuda-exponent-distribution-statistics --- src/cuda-exponent-distribution-statistics | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cuda-exponent-distribution-statistics b/src/cuda-exponent-distribution-statistics index 1dee8fb..116545a 160000 --- a/src/cuda-exponent-distribution-statistics +++ b/src/cuda-exponent-distribution-statistics @@ -1 +1 @@ -Subproject commit 1dee8fb13b084b9c04983854a4b7702063854c6f +Subproject commit 116545a4c5e83e697281428fcf307bcfeb0ea527 From 8ffa707759cec95927846ffc509fdeb7de6a81f5 Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 20:01:00 +0900 Subject: [PATCH 05/12] Add submodule cu_exp_statistics --- CMakeLists.txt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f58e2c3..e00cdb4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,7 @@ project(culip LANGUAGES CXX CUDA) find_package(CUDA 10.0 REQUIRED) find_package(CUDAToolkit REQUIRED) +find_package(Git REQUIRED) # CUDA/CXX foreach(lang CXX CUDA) @@ -20,13 +21,23 @@ set(SRCDIR src) file(GLOB HEADERS "${INCDIR}/CULiP/*.hpp") +if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda-exponent-distribution-statistics/Makefile) + execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive -- ${dir} + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + COMMAND_ERROR_IS_FATAL ANY) +endif() + + foreach(library cublas) - SET(lib_name culip_${library}) + set(lib_name culip_${library}) + set(exp_lib_home ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda-exponent-distribution-statistics) + include_directories(${lib_name} PRIVATE ${exp_lib_home}/include ${exp_lib_home}/src/cutf/include) add_library(${lib_name} SHARED ${SRCDIR}/${library}.cu ${SRCDIR}/utils.cu ${SRCDIR}/utils.hpp ${SRCDIR}/params.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda-exponent-distribution-statistics/src/main.cu ${HEADERS} ) From be28095a170ccde088dc927d0e43d4072fc3ee68 Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 20:15:47 +0900 Subject: [PATCH 06/12] Add exp stats for gemmEx and gemm_strided_batch --- src/cublas.cu | 52 ++++++++++++++++++++++ src/cublas.gemm_strided_batched.template.h | 19 ++++++++ 2 files changed, 71 insertions(+) diff --git a/src/cublas.cu b/src/cublas.cu index b6afe71..e39b9b0 100644 --- a/src/cublas.cu +++ b/src/cublas.cu @@ -13,6 +13,42 @@ #define CULIP_CUBLAS_DISABLE_ENV_NAME "CULIP_DISABLE_CUBLAS_PROFILING" #define CULIP_EXP_STATS_ENABLE_ENV_NAME "CULIP_ENABLE_EXP_STATS" +namespace { +mtk::cu_exp_statistics::result_t exp_stats( + const void* const ptr, + const std::size_t m, + const std::size_t n, + const std::size_t ld, + const cudaStream_t cuda_stream, + cudaDataType_t data_t + ) { + mtk::cu_exp_statistics::result_t result; + switch (data_t) { + case CUDA_R_64F: + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + break; + case CUDA_R_32F: + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + break; + case CUDA_R_16F: + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + break; + case CUDA_C_64F: + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + break; + case CUDA_C_32F: + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + break; + case CUDA_C_16F: + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + break; + default: + break; + } + return result; +} +} // unnamed namespace + extern "C" { // dlopen cache void* CULiP_cublas_lib_handle_cache = NULL; @@ -220,6 +256,22 @@ cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, CULiP_launch_function(cuda_stream, &CULiP_print_profile_result, (void*)&profile_result); } + const int exp_stats_flag = (CULiP_profiling_control_array[CULiP_cublasGemmEx] == 0) && CULiP_is_profiling_enabled(CULIP_EXP_STATS_ENABLE_ENV_NAME, false); + if (exp_stats_flag) { + cudaStream_t cuda_stream; + cublasGetStream(handle, &cuda_stream); + CULiP_exp_stats a_stats; + CULiP_exp_stats b_stats; + snprintf(a_stats.name, a_stats.name_length - 1, "A"); + snprintf(b_stats.name, b_stats.name_length - 1, "B"); + a_stats.stats = exp_stats(A, (transa == CUBLAS_OP_N ? m : k), (transb == CUBLAS_OP_N ? k : m), lda, cuda_stream, Atype); + b_stats.stats = exp_stats(B, (transa == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream, Btype); + mtk::cu_exp_statistics::to_json(a_stats.stats); + mtk::cu_exp_statistics::to_json(b_stats.stats); + CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats); + CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&b_stats); + } + return result; } diff --git a/src/cublas.gemm_strided_batched.template.h b/src/cublas.gemm_strided_batched.template.h index db4f58a..e12cd64 100644 --- a/src/cublas.gemm_strided_batched.template.h +++ b/src/cublas.gemm_strided_batched.template.h @@ -48,5 +48,24 @@ cublasStatus_t CULIP_FUNC_NAME (cublasHandle_t handle, CULiP_launch_function(cuda_stream, &CULiP_print_profile_result, (void*)&profile_result); } + const int exp_stats_flag = (CULiP_profiling_control_array[CULIP_FUNC_ENUM_NAME] == 0) && CULiP_is_profiling_enabled(CULIP_EXP_STATS_ENABLE_ENV_NAME, false); + if (exp_stats_flag) { + cudaStream_t cuda_stream; + cublasGetStream(handle, &cuda_stream); + CULiP_exp_stats a_stats; + CULiP_exp_stats b_stats; + snprintf(a_stats.name, a_stats.name_length - 1, "A"); + snprintf(b_stats.name, b_stats.name_length - 1, "B"); + for (std::uint32_t i = 0; i < batchCount; i++) { + a_stats.stats += mtk::cu_exp_statistics::take_matrix_statistics(A + i * strideA, (transa == CUBLAS_OP_N ? m : k), (transb == CUBLAS_OP_N ? k : m), lda, cuda_stream); + b_stats.stats += mtk::cu_exp_statistics::take_matrix_statistics(B + i * strideB, (transa == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream); + } + mtk::cu_exp_statistics::to_json(a_stats.stats); + mtk::cu_exp_statistics::to_json(b_stats.stats); + CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats); + CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&b_stats); + } + + return result; } From daff5fcf96faaf774d19d0d9d82cc9d32bf8a841 Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 20:57:16 +0900 Subject: [PATCH 07/12] Fix arguments of take_matrix_statistics --- src/cublas.cu | 4 ++-- src/cublas.gemm.template.h | 4 ++-- src/cublas.gemm_strided_batched.template.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cublas.cu b/src/cublas.cu index e39b9b0..f155026 100644 --- a/src/cublas.cu +++ b/src/cublas.cu @@ -264,8 +264,8 @@ cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, CULiP_exp_stats b_stats; snprintf(a_stats.name, a_stats.name_length - 1, "A"); snprintf(b_stats.name, b_stats.name_length - 1, "B"); - a_stats.stats = exp_stats(A, (transa == CUBLAS_OP_N ? m : k), (transb == CUBLAS_OP_N ? k : m), lda, cuda_stream, Atype); - b_stats.stats = exp_stats(B, (transa == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream, Btype); + a_stats.stats = exp_stats(A, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, cuda_stream, Atype); + b_stats.stats = exp_stats(B, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream, Btype); mtk::cu_exp_statistics::to_json(a_stats.stats); mtk::cu_exp_statistics::to_json(b_stats.stats); CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats); diff --git a/src/cublas.gemm.template.h b/src/cublas.gemm.template.h index ee517ed..dd1be75 100644 --- a/src/cublas.gemm.template.h +++ b/src/cublas.gemm.template.h @@ -43,8 +43,8 @@ cublasStatus_t CULIP_FUNC_NAME(cublasHandle_t handle, cublasOperation_t transa, CULiP_exp_stats b_stats; snprintf(a_stats.name, a_stats.name_length - 1, "A"); snprintf(b_stats.name, b_stats.name_length - 1, "B"); - a_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(A, (transa == CUBLAS_OP_N ? m : k), (transb == CUBLAS_OP_N ? k : m), lda, cuda_stream); - b_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(B, (transa == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream); + a_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(A, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, cuda_stream); + b_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(B, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream); mtk::cu_exp_statistics::to_json(a_stats.stats); mtk::cu_exp_statistics::to_json(b_stats.stats); CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats); diff --git a/src/cublas.gemm_strided_batched.template.h b/src/cublas.gemm_strided_batched.template.h index e12cd64..207ec36 100644 --- a/src/cublas.gemm_strided_batched.template.h +++ b/src/cublas.gemm_strided_batched.template.h @@ -57,8 +57,8 @@ cublasStatus_t CULIP_FUNC_NAME (cublasHandle_t handle, snprintf(a_stats.name, a_stats.name_length - 1, "A"); snprintf(b_stats.name, b_stats.name_length - 1, "B"); for (std::uint32_t i = 0; i < batchCount; i++) { - a_stats.stats += mtk::cu_exp_statistics::take_matrix_statistics(A + i * strideA, (transa == CUBLAS_OP_N ? m : k), (transb == CUBLAS_OP_N ? k : m), lda, cuda_stream); - b_stats.stats += mtk::cu_exp_statistics::take_matrix_statistics(B + i * strideB, (transa == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream); + a_stats.stats += mtk::cu_exp_statistics::take_matrix_statistics(A + i * strideA, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, cuda_stream); + b_stats.stats += mtk::cu_exp_statistics::take_matrix_statistics(B + i * strideB, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream); } mtk::cu_exp_statistics::to_json(a_stats.stats); mtk::cu_exp_statistics::to_json(b_stats.stats); From 59dd8dafcfe0af7947a1e14bd87b25a2a5e8ca2e Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 21:06:09 +0900 Subject: [PATCH 08/12] Update cublas doc --- docs/cublas.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/cublas.md b/docs/cublas.md index 90a17d8..1482f31 100644 --- a/docs/cublas.md +++ b/docs/cublas.md @@ -1,5 +1,7 @@ # Supported cuBLAS functions list +- "(*)" : the exponent distribution stats is available + ## Level 2 - GEMV @@ -28,7 +30,7 @@ ## Level 3 -- GEMM +- GEMM (*) - `cublasDgemm` - `cublasSgemm` - `cublasHgemm` @@ -42,7 +44,7 @@ - `cublasCgemmBatched` - `cublasZgemmBatched` - `cublasGemmExBatched` -- GEMM STRIDED BATCHED +- GEMM STRIDED BATCHED (*) - `cublasDgemmStridedBatched` - `cublasSgemmStridedBatched` - `cublasHgemmStridedBatched` From 685889c6a93eea1a8d787db66b18265edd585cdd Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 21:06:33 +0900 Subject: [PATCH 09/12] Add support for `cublasGemmStridedBatchedEx` --- src/cublas.cu | 37 ++++++++++++++++------ src/cublas.gemm_strided_batched.template.h | 1 - 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/cublas.cu b/src/cublas.cu index f155026..e96ff0b 100644 --- a/src/cublas.cu +++ b/src/cublas.cu @@ -16,6 +16,7 @@ namespace { mtk::cu_exp_statistics::result_t exp_stats( const void* const ptr, + const std::size_t offset, const std::size_t m, const std::size_t n, const std::size_t ld, @@ -25,22 +26,22 @@ mtk::cu_exp_statistics::result_t exp_stats( mtk::cu_exp_statistics::result_t result; switch (data_t) { case CUDA_R_64F: - result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr) + offset, m, n, ld, cuda_stream); break; case CUDA_R_32F: - result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr) + offset, m, n, ld, cuda_stream); break; case CUDA_R_16F: - result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr) + offset, m, n, ld, cuda_stream); break; case CUDA_C_64F: - result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr) + offset, m, n, ld, cuda_stream); break; case CUDA_C_32F: - result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr) + offset, m, n, ld, cuda_stream); break; case CUDA_C_16F: - result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr), m, n, ld, cuda_stream); + result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast(ptr) + offset, m, n, ld, cuda_stream); break; default: break; @@ -264,8 +265,8 @@ cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, CULiP_exp_stats b_stats; snprintf(a_stats.name, a_stats.name_length - 1, "A"); snprintf(b_stats.name, b_stats.name_length - 1, "B"); - a_stats.stats = exp_stats(A, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, cuda_stream, Atype); - b_stats.stats = exp_stats(B, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream, Btype); + a_stats.stats = exp_stats(A, 0, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, cuda_stream, Atype); + b_stats.stats = exp_stats(B, 0, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream, Btype); mtk::cu_exp_statistics::to_json(a_stats.stats); mtk::cu_exp_statistics::to_json(b_stats.stats); CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats); @@ -456,7 +457,7 @@ cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle, int batchCount, cublasComputeType_t computeType, cublasGemmAlgo_t algo) { - const int profiling_flag = (CULiP_profiling_control_array[CULiP_cublasGemmBatchedEx] == 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME); + const int profiling_flag = (CULiP_profiling_control_array[CULiP_cublasGemmStridedBatchedEx] == 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME); // Get the function pointer cublasStatus_t (*cublas_lib_func)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void*, const void*, cudaDataType_t, int, long long int, const void*, cudaDataType_t, int, long long int, const void*, void*, cudaDataType_t, int, long long int, int, cublasComputeType_t, cublasGemmAlgo_t); @@ -493,6 +494,24 @@ cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle, CULiP_launch_function(cuda_stream, &CULiP_print_profile_result, (void*)&profile_result); } + const int exp_stats_flag = (CULiP_profiling_control_array[CULiP_cublasGemmStridedBatchedEx] == 0) && CULiP_is_profiling_enabled(CULIP_EXP_STATS_ENABLE_ENV_NAME, false); + if (exp_stats_flag) { + cudaStream_t cuda_stream; + cublasGetStream(handle, &cuda_stream); + CULiP_exp_stats a_stats; + CULiP_exp_stats b_stats; + snprintf(a_stats.name, a_stats.name_length - 1, "A"); + snprintf(b_stats.name, b_stats.name_length - 1, "B"); + for (std::uint32_t i = 0; i < batchCount; i++) { + a_stats.stats += exp_stats(A, i * strideA, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, cuda_stream, Atype); + b_stats.stats += exp_stats(B, i * strideB, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream, Btype); + } + mtk::cu_exp_statistics::to_json(a_stats.stats); + mtk::cu_exp_statistics::to_json(b_stats.stats); + CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats); + CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&b_stats); + } + return result; } diff --git a/src/cublas.gemm_strided_batched.template.h b/src/cublas.gemm_strided_batched.template.h index 207ec36..f6444b8 100644 --- a/src/cublas.gemm_strided_batched.template.h +++ b/src/cublas.gemm_strided_batched.template.h @@ -66,6 +66,5 @@ cublasStatus_t CULIP_FUNC_NAME (cublasHandle_t handle, CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&b_stats); } - return result; } From a7f40720bf1742bd5ca477480aa681224e91687a Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 21:08:28 +0900 Subject: [PATCH 10/12] Add `CULiP_cublasGemmStridedBatchedEx` --- include/CULiP/cublas.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/CULiP/cublas.hpp b/include/CULiP/cublas.hpp index 6aff6fa..6ef6c31 100644 --- a/include/CULiP/cublas.hpp +++ b/include/CULiP/cublas.hpp @@ -36,12 +36,13 @@ enum CULiP_cublas_control_t { CULiP_cublasHgemmBatched, CULiP_cublasCgemmBatched, CULiP_cublasZgemmBatched, + CULiP_cublasGemmBatchedEx, CULiP_cublasDgemmStridedBatched, CULiP_cublasSgemmStridedBatched, CULiP_cublasHgemmStridedBatched, CULiP_cublasCgemmStridedBatched, CULiP_cublasZgemmStridedBatched, - CULiP_cublasGemmBatchedEx, + CULiP_cublasGemmStridedBatchedEx, CULiP_cublasDsyrk, CULiP_cublasSsyrk, CULiP_cublasCsyrk, From 157d305112bf67c21064a0d78232f0c9400c7893 Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 21:08:52 +0900 Subject: [PATCH 11/12] Add an explanation about ExpStats --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index e726b73..aa16046 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,12 @@ export CULIP_DISABLE_CUBLAS_PROFILING=0 unset CULIP_DISABLE_CUBLAS_PROFILING ``` +To enable exponent statistics, set an environmental variable `CULIP_ENABLE_EXP_STATS`. +```bash +# Enable exponent statistics +export CULIP_ENABLE_EXP_STATS=1 +``` + ## Profiling control API CULiP provides profiling control API. From eb6a22701e5744918a14c2725a6fb67b970c0e2e Mon Sep 17 00:00:00 2001 From: mutsuki Date: Thu, 25 Aug 2022 21:13:11 +0900 Subject: [PATCH 12/12] Fix typo --- src/utils.cu | 10 +++++----- src/utils.hpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/utils.cu b/src/utils.cu index 0e4a6f1..acd0f6f 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -40,7 +40,7 @@ extern "C" void CULiP_launch_function(cudaStream_t cuda_stream, void (*fn)(void* } // Function loader -extern "C" void* CULiP_get_function_pointer(const char* const library_name, const char* const env_name, const char* const function_name, void** CULiP_haldle_cache) { +extern "C" void* CULiP_get_function_pointer(const char* const library_name, const char* const env_name, const char* const function_name, void** CULiP_handle_cache) { CULIBPROFILER_DEBUG_PRINT(printf("[CULiP Debug][%s] start\n", function_name)); // Get the real library path @@ -50,9 +50,9 @@ extern "C" void* CULiP_get_function_pointer(const char* const library_name, cons } // Open the library - if (*CULiP_haldle_cache == NULL) { - *CULiP_haldle_cache = dlopen(library_path, RTLD_NOW); - if (*CULiP_haldle_cache == NULL) { + if (*CULiP_handle_cache == NULL) { + *CULiP_handle_cache = dlopen(library_path, RTLD_NOW); + if (*CULiP_handle_cache == NULL) { fprintf(stderr, "[CULiP ERROR] Failed to load the real library %s\n", library_path); exit(1); } @@ -60,7 +60,7 @@ extern "C" void* CULiP_get_function_pointer(const char* const library_name, cons } // Get function pointer - void* function_ptr = dlsym(*CULiP_haldle_cache, function_name); + void* function_ptr = dlsym(*CULiP_handle_cache, function_name); if (function_ptr == NULL) { fprintf(stderr, "[CULiP ERROR] Failed to load the function %s\n", __func__); exit(1); diff --git a/src/utils.hpp b/src/utils.hpp index b1bb0e6..f883e06 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -39,7 +39,7 @@ extern "C" void CULiP_print_exp_stats_result(void* exp_stats_result_ptr); extern "C" void CULiP_launch_function(cudaStream_t cuda_stream, void (*fn)(void*), void* const arg); // Function loader -extern "C" void* CULiP_get_function_pointer(const char* const library_name, const char* const env_name, const char* const function_name, void** CULiP_haldle_cache); +extern "C" void* CULiP_get_function_pointer(const char* const library_name, const char* const env_name, const char* const function_name, void** CULiP_handle_cache); // Profiling status extern "C" int CULiP_is_profiling_enabled(const char* env_name, const bool disable_if_set = true);