Skip to content

Commit

Permalink
Merge branch '43-taking-exponent-distribution-statistics' into 'master'
Browse files Browse the repository at this point in the history
Add taking exponent distribution statistics

See merge request mutsuki/CULiP!45
  • Loading branch information
enp1s0 committed Aug 25, 2022
2 parents 3ef59cc + eb6a227 commit 258ddfe
Show file tree
Hide file tree
Showing 12 changed files with 171 additions and 20 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "src/cuda-exponent-distribution-statistics"]
path = src/cuda-exponent-distribution-statistics
url = https://github.com/enp1s0/cuda-exponent-distribution-statistics
13 changes: 12 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ project(culip LANGUAGES CXX CUDA)

find_package(CUDA 10.0 REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(Git REQUIRED)

# CUDA/CXX
foreach(lang CXX CUDA)
Expand All @@ -20,13 +21,23 @@ set(SRCDIR src)

file(GLOB HEADERS "${INCDIR}/CULiP/*.hpp")

if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda-exponent-distribution-statistics/Makefile)
execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive -- ${dir}
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
COMMAND_ERROR_IS_FATAL ANY)
endif()


foreach(library cublas)
SET(lib_name culip_${library})
set(lib_name culip_${library})
set(exp_lib_home ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda-exponent-distribution-statistics)
include_directories(${lib_name} PRIVATE ${exp_lib_home}/include ${exp_lib_home}/src/cutf/include)
add_library(${lib_name} SHARED
${SRCDIR}/${library}.cu
${SRCDIR}/utils.cu
${SRCDIR}/utils.hpp
${SRCDIR}/params.hpp
${CMAKE_CURRENT_SOURCE_DIR}/src/cuda-exponent-distribution-statistics/src/main.cu
${HEADERS}
)

Expand Down
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,18 @@ Then the execution time is printed on stdout.
To disable profiling at runtime, define an environment variable `CULIP_PROFILING_CUBLAS_DISABLE`.
```bash
# Disable cuBLAS profiling
export CULIP_PROFILING_CUBLAS_DISABLE=1
export CULIP_DISABLE_CUBLAS_PROFILING=1

# Enable cuBLAS profiling
export CULIP_PROFILING_CUBLAS_DISABLE=0
export CULIP_DISABLE_CUBLAS_PROFILING=0
# or
unset CULIP_PROFILING_CUBLAS_DISABLE
unset CULIP_DISABLE_CUBLAS_PROFILING
```

To enable exponent statistics, set an environmental variable `CULIP_ENABLE_EXP_STATS`.
```bash
# Enable exponent statistics
export CULIP_ENABLE_EXP_STATS=1
```

## Profiling control API
Expand Down
6 changes: 4 additions & 2 deletions docs/cublas.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Supported cuBLAS functions list

- "(*)" : the exponent distribution stats is available

## Level 2

- GEMV
Expand Down Expand Up @@ -28,7 +30,7 @@

## Level 3

- GEMM
- GEMM (*)
- `cublasDgemm`
- `cublasSgemm`
- `cublasHgemm`
Expand All @@ -42,7 +44,7 @@
- `cublasCgemmBatched`
- `cublasZgemmBatched`
- `cublasGemmExBatched`
- GEMM STRIDED BATCHED
- GEMM STRIDED BATCHED (*)
- `cublasDgemmStridedBatched`
- `cublasSgemmStridedBatched`
- `cublasHgemmStridedBatched`
Expand Down
3 changes: 2 additions & 1 deletion include/CULiP/cublas.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,13 @@ enum CULiP_cublas_control_t {
CULiP_cublasHgemmBatched,
CULiP_cublasCgemmBatched,
CULiP_cublasZgemmBatched,
CULiP_cublasGemmBatchedEx,
CULiP_cublasDgemmStridedBatched,
CULiP_cublasSgemmStridedBatched,
CULiP_cublasHgemmStridedBatched,
CULiP_cublasCgemmStridedBatched,
CULiP_cublasZgemmStridedBatched,
CULiP_cublasGemmBatchedEx,
CULiP_cublasGemmStridedBatchedEx,
CULiP_cublasDsyrk,
CULiP_cublasSsyrk,
CULiP_cublasCsyrk,
Expand Down
77 changes: 75 additions & 2 deletions src/cublas.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,50 @@
#include <stdio.h>
#include <string.h>
#include <CULiP/cublas.hpp>
#include <cu_exp_statistics.hpp>
#include "utils.hpp"

#define CULIP_CUBLAS_LIBRARY_NAME "libcublas.so"
#define CULIP_CUBLAS_ENV_NAME "CULIP_CUBLAS_LIB_PATH"
#define CULIP_CUBLAS_DISABLE_ENV_NAME "CULIP_PROFILING_CUBLAS_DISABLE"
#define CULIP_CUBLAS_DISABLE_ENV_NAME "CULIP_DISABLE_CUBLAS_PROFILING"
#define CULIP_EXP_STATS_ENABLE_ENV_NAME "CULIP_ENABLE_EXP_STATS"

namespace {
mtk::cu_exp_statistics::result_t exp_stats(
const void* const ptr,
const std::size_t offset,
const std::size_t m,
const std::size_t n,
const std::size_t ld,
const cudaStream_t cuda_stream,
cudaDataType_t data_t
) {
mtk::cu_exp_statistics::result_t result;
switch (data_t) {
case CUDA_R_64F:
result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast<const double*>(ptr) + offset, m, n, ld, cuda_stream);
break;
case CUDA_R_32F:
result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast<const float*>(ptr) + offset, m, n, ld, cuda_stream);
break;
case CUDA_R_16F:
result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast<const half*>(ptr) + offset, m, n, ld, cuda_stream);
break;
case CUDA_C_64F:
result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast<const double2*>(ptr) + offset, m, n, ld, cuda_stream);
break;
case CUDA_C_32F:
result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast<const float2*>(ptr) + offset, m, n, ld, cuda_stream);
break;
case CUDA_C_16F:
result = mtk::cu_exp_statistics::take_matrix_statistics(reinterpret_cast<const half2*>(ptr) + offset, m, n, ld, cuda_stream);
break;
default:
break;
}
return result;
}
} // unnamed namespace

extern "C" {
// dlopen cache
Expand Down Expand Up @@ -218,6 +257,22 @@ cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa,
CULiP_launch_function(cuda_stream, &CULiP_print_profile_result, (void*)&profile_result);
}

const int exp_stats_flag = (CULiP_profiling_control_array[CULiP_cublasGemmEx] == 0) && CULiP_is_profiling_enabled(CULIP_EXP_STATS_ENABLE_ENV_NAME, false);
if (exp_stats_flag) {
cudaStream_t cuda_stream;
cublasGetStream(handle, &cuda_stream);
CULiP_exp_stats a_stats;
CULiP_exp_stats b_stats;
snprintf(a_stats.name, a_stats.name_length - 1, "A");
snprintf(b_stats.name, b_stats.name_length - 1, "B");
a_stats.stats = exp_stats(A, 0, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, cuda_stream, Atype);
b_stats.stats = exp_stats(B, 0, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream, Btype);
mtk::cu_exp_statistics::to_json(a_stats.stats);
mtk::cu_exp_statistics::to_json(b_stats.stats);
CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats);
CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&b_stats);
}

return result;
}

Expand Down Expand Up @@ -402,7 +457,7 @@ cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle,
int batchCount,
cublasComputeType_t computeType,
cublasGemmAlgo_t algo) {
const int profiling_flag = (CULiP_profiling_control_array[CULiP_cublasGemmBatchedEx] == 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME);
const int profiling_flag = (CULiP_profiling_control_array[CULiP_cublasGemmStridedBatchedEx] == 0) && CULiP_is_profiling_enabled(CULIP_CUBLAS_DISABLE_ENV_NAME);

// Get the function pointer
cublasStatus_t (*cublas_lib_func)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void*, const void*, cudaDataType_t, int, long long int, const void*, cudaDataType_t, int, long long int, const void*, void*, cudaDataType_t, int, long long int, int, cublasComputeType_t, cublasGemmAlgo_t);
Expand Down Expand Up @@ -439,6 +494,24 @@ cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle,
CULiP_launch_function(cuda_stream, &CULiP_print_profile_result, (void*)&profile_result);
}

const int exp_stats_flag = (CULiP_profiling_control_array[CULiP_cublasGemmStridedBatchedEx] == 0) && CULiP_is_profiling_enabled(CULIP_EXP_STATS_ENABLE_ENV_NAME, false);
if (exp_stats_flag) {
cudaStream_t cuda_stream;
cublasGetStream(handle, &cuda_stream);
CULiP_exp_stats a_stats;
CULiP_exp_stats b_stats;
snprintf(a_stats.name, a_stats.name_length - 1, "A");
snprintf(b_stats.name, b_stats.name_length - 1, "B");
for (std::uint32_t i = 0; i < batchCount; i++) {
a_stats.stats += exp_stats(A, i * strideA, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, cuda_stream, Atype);
b_stats.stats += exp_stats(B, i * strideB, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream, Btype);
}
mtk::cu_exp_statistics::to_json(a_stats.stats);
mtk::cu_exp_statistics::to_json(b_stats.stats);
CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats);
CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&b_stats);
}

return result;
}

Expand Down
16 changes: 16 additions & 0 deletions src/cublas.gemm.template.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,21 @@ cublasStatus_t CULIP_FUNC_NAME(cublasHandle_t handle, cublasOperation_t transa,
CULiP_launch_function(cuda_stream, &CULiP_print_profile_result, (void*)&profile_result);
}

const int exp_stats_flag = (CULiP_profiling_control_array[CULIP_FUNC_ENUM_NAME] == 0) && CULiP_is_profiling_enabled(CULIP_EXP_STATS_ENABLE_ENV_NAME, false);
if (exp_stats_flag) {
cudaStream_t cuda_stream;
cublasGetStream(handle, &cuda_stream);
CULiP_exp_stats a_stats;
CULiP_exp_stats b_stats;
snprintf(a_stats.name, a_stats.name_length - 1, "A");
snprintf(b_stats.name, b_stats.name_length - 1, "B");
a_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(A, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, cuda_stream);
b_stats.stats = mtk::cu_exp_statistics::take_matrix_statistics(B, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream);
mtk::cu_exp_statistics::to_json(a_stats.stats);
mtk::cu_exp_statistics::to_json(b_stats.stats);
CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats);
CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&b_stats);
}

return result;
}
18 changes: 18 additions & 0 deletions src/cublas.gemm_strided_batched.template.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,23 @@ cublasStatus_t CULIP_FUNC_NAME (cublasHandle_t handle,
CULiP_launch_function(cuda_stream, &CULiP_print_profile_result, (void*)&profile_result);
}

const int exp_stats_flag = (CULiP_profiling_control_array[CULIP_FUNC_ENUM_NAME] == 0) && CULiP_is_profiling_enabled(CULIP_EXP_STATS_ENABLE_ENV_NAME, false);
if (exp_stats_flag) {
cudaStream_t cuda_stream;
cublasGetStream(handle, &cuda_stream);
CULiP_exp_stats a_stats;
CULiP_exp_stats b_stats;
snprintf(a_stats.name, a_stats.name_length - 1, "A");
snprintf(b_stats.name, b_stats.name_length - 1, "B");
for (std::uint32_t i = 0; i < batchCount; i++) {
a_stats.stats += mtk::cu_exp_statistics::take_matrix_statistics(A + i * strideA, (transa == CUBLAS_OP_N ? m : k), (transa == CUBLAS_OP_N ? k : m), lda, cuda_stream);
b_stats.stats += mtk::cu_exp_statistics::take_matrix_statistics(B + i * strideB, (transb == CUBLAS_OP_N ? k : n), (transb == CUBLAS_OP_N ? n : k), ldb, cuda_stream);
}
mtk::cu_exp_statistics::to_json(a_stats.stats);
mtk::cu_exp_statistics::to_json(b_stats.stats);
CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&a_stats);
CULiP_launch_function(cuda_stream, &CULiP_print_exp_stats_result, (void*)&b_stats);
}

return result;
}
1 change: 1 addition & 0 deletions src/cuda-exponent-distribution-statistics
1 change: 1 addition & 0 deletions src/params.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
#define __CULIP_PARAMS_HPP__

#define CULIP_RESULT_PREFIX "CULiP Result"
#define CULIP_EXP_STATS_PREFIX "CULiP ExpStats"

#endif
27 changes: 18 additions & 9 deletions src/utils.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,15 @@ extern "C" void CULiP_print_profile_result(void *profile_result_ptr) {
((long)profile_result.end_timestamp.tv_nsec -
(long)profile_result.start_timestamp.tv_nsec);
printf("[%s][%s] %luns\n", CULIP_RESULT_PREFIX, profile_result.function_name, elapsed_time_us);
fflush(stdout);
}

extern "C" void CULiP_print_exp_stats_result(void *exp_stats_result_ptr) {
const CULiP_exp_stats exp_stats_result =
*((CULiP_exp_stats *)exp_stats_result_ptr);

printf("[%s] %s: %s\n", CULIP_EXP_STATS_PREFIX, exp_stats_result.name, mtk::cu_exp_statistics::to_json(exp_stats_result.stats).c_str());
fflush(stdout);
}

// TODO: Make this function non-blocking using `cuLauchHostFunc`
Expand All @@ -31,7 +40,7 @@ extern "C" void CULiP_launch_function(cudaStream_t cuda_stream, void (*fn)(void*
}

// Function loader
extern "C" void* CULiP_get_function_pointer(const char* const library_name, const char* const env_name, const char* const function_name, void** CULiP_haldle_cache) {
extern "C" void* CULiP_get_function_pointer(const char* const library_name, const char* const env_name, const char* const function_name, void** CULiP_handle_cache) {
CULIBPROFILER_DEBUG_PRINT(printf("[CULiP Debug][%s] start\n", function_name));

// Get the real library path
Expand All @@ -41,17 +50,17 @@ extern "C" void* CULiP_get_function_pointer(const char* const library_name, cons
}

// Open the library
if (*CULiP_haldle_cache == NULL) {
*CULiP_haldle_cache = dlopen(library_path, RTLD_NOW);
if (*CULiP_haldle_cache == NULL) {
if (*CULiP_handle_cache == NULL) {
*CULiP_handle_cache = dlopen(library_path, RTLD_NOW);
if (*CULiP_handle_cache == NULL) {
fprintf(stderr, "[CULiP ERROR] Failed to load the real library %s\n", library_path);
exit(1);
}
CULIBPROFILER_DEBUG_PRINT(printf("[CULiP Debug][%s] %s is loaded\n", function_name, library_path));
}

// Get function pointer
void* function_ptr = dlsym(*CULiP_haldle_cache, function_name);
void* function_ptr = dlsym(*CULiP_handle_cache, function_name);
if (function_ptr == NULL) {
fprintf(stderr, "[CULiP ERROR] Failed to load the function %s\n", __func__);
exit(1);
Expand All @@ -61,13 +70,13 @@ extern "C" void* CULiP_get_function_pointer(const char* const library_name, cons
}

// Profiling status
extern "C" int CULiP_is_profiling_enabled(const char* env_name) {
extern "C" int CULiP_is_profiling_enabled(const char* env_name, const bool disable_if_set) {
const char* value = getenv(env_name);
if (value == NULL) {
return 1;
return disable_if_set;
}
if (strcmp(value, "0") == 0) {
return 0;
return disable_if_set;
}
return 1;
return !disable_if_set;
}
14 changes: 12 additions & 2 deletions src/utils.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef __CUDALIBPROFILER_UTILS_HPP__
#define __CUDALIBPROFILER_UTILS_HPP__
#include <time.h>
#include <cu_exp_statistics.hpp>
#include "params.hpp"

#ifdef CULIBPROFILER_ENABLE_DEBUG_PRINT
Expand All @@ -23,14 +24,23 @@ struct CULiP_profile_result {
struct timespec end_timestamp;
};

struct CULiP_exp_stats {
// name
enum {name_length = 32};
char name[name_length] = {0};

mtk::cu_exp_statistics::result_t stats;
};

extern "C" void CULiP_print_profile_result(void* profile_result_ptr);
extern "C" void CULiP_print_exp_stats_result(void* exp_stats_result_ptr);

// Call a given function on a given stream
extern "C" void CULiP_launch_function(cudaStream_t cuda_stream, void (*fn)(void*), void* const arg);

// Function loader
extern "C" void* CULiP_get_function_pointer(const char* const library_name, const char* const env_name, const char* const function_name, void** CULiP_haldle_cache);
extern "C" void* CULiP_get_function_pointer(const char* const library_name, const char* const env_name, const char* const function_name, void** CULiP_handle_cache);

// Profiling status
extern "C" int CULiP_is_profiling_enabled(const char* env_name);
extern "C" int CULiP_is_profiling_enabled(const char* env_name, const bool disable_if_set = true);
#endif

0 comments on commit 258ddfe

Please sign in to comment.