diff --git a/common/kokkos-sampler/CMakeLists.txt b/common/kokkos-sampler/CMakeLists.txt index 609ab5707..eb94dbd0f 100644 --- a/common/kokkos-sampler/CMakeLists.txt +++ b/common/kokkos-sampler/CMakeLists.txt @@ -1 +1 @@ -add_library(kp_kokkos_sampler ${KOKKOSTOOLS_LIBRARY_MODE} kp_sampler_skip.cpp) +kp_add_library(kp_kokkos_sampler ${KOKKOSTOOLS_LIBRARY_MODE} kp_sampler_skip.cpp) diff --git a/common/kokkos-sampler/README.md b/common/kokkos-sampler/README.md index 3d0c6393a..2b21d7598 100644 --- a/common/kokkos-sampler/README.md +++ b/common/kokkos-sampler/README.md @@ -1,4 +1,7 @@ -This is a sampler utility that is intended to complement other tools in the Kokkos Tools set. This utility allows for sampling (rather than collecting) of profiling or debugging data gathered from a particular tool of the Kokkos Tools set. The Kokkos Tools user provides a sampling rate via the environment variable KOKKOS_TOOLS_SAMPLER_SKIP. +This is a sampler utility that is intended to complement other tools in the Kokkos Tools set. This utility allows for sampling of profiling or debugging data collected from a particular tool of the Kokkos Tools set at each Kokkos kernel invocation. -In order for the state of the sampled profiling and logging data in memory to be captured at the time of the utility's callback invocation, it might be important to enforce fences. However, this also means that there are more synchronization points compared with running the program without the tool. -This fencing behavior can be controlled by setting the environment variable `KOKKOS_TOOLS_GLOBALFENCES`. A non-zero value implies global fences on invocation of the tool. The default is not to introduce extra fences. +To use this utility, a Kokkos Tools user provides a sampling probability by setting the environment variable `KOKKOS_TOOLS_SAMPLER_PROB` to a real number between 0.0 and 100.0, interpreted as a percentage. 
The user can alternatively set a sampling skip rate, i.e., the number of Kokkos kernel invocations to skip before the next sample is taken. The user does so by setting the environment variable `KOKKOS_TOOLS_SAMPLER_SKIP` to a non-negative integer. + +If both the sampling probability and the sampling skip rate are set by the user, this sampling utility only uses the sampling probability for sampling; the utility sets the sampling skip rate to 1, incorporating no pre-defined periodicity in sampling. If neither the sampling probability nor the sampling skip rate is set by the user, then purely random sampling is again done, with the sampler's probability being 10.0 percent. The sampler is periodic only if the sampling probability is not set by the user _and_ the sampling skip rate is set by the user. + +For the state of the sampled profiling and logging data in memory to be captured at the time of the utility's callback invocation, it might be important to enforce fences. However, this also means that there are more synchronization points compared with running the program without the tool. This fencing behavior can be controlled by setting the environment variable `KOKKOS_TOOLS_GLOBALFENCES`. A non-zero value implies global fences on invocation of the tool. The default is not to introduce extra fences. 
diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index d56ed84ca..0bc61c396 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -6,13 +6,17 @@ #include #include "../../profiling/all/kp_core.hpp" #include "kp_config.hpp" +#include +#include namespace KokkosTools { namespace Sampler { static uint64_t uniqID = 0; -static uint64_t kernelSampleSkip = 101; +static uint64_t kernelSampleSkip = std::numeric_limits::max(); +static double tool_prob_num = -1.0; static int tool_verbosity = 0; static int tool_globFence = 0; +static int tool_seed = -1; // a hash table mapping kID to nestedkID static std::unordered_map infokIDSample; @@ -27,13 +31,15 @@ static finalizeFunction finalizeProfileLibrary = NULL; static beginFunction beginForCallee = NULL; static beginFunction beginScanCallee = NULL; static beginFunction beginReduceCallee = NULL; +static beginFunction beginDeepCopyCallee = NULL; static endFunction endForCallee = NULL; static endFunction endScanCallee = NULL; static endFunction endReduceCallee = NULL; +static endFunction endDeepCopyCallee = NULL; void kokkosp_request_tool_settings(const uint32_t, Kokkos_Tools_ToolSettings* settings) { - settings->requires_global_fencing = false; + settings->requires_global_fencing = 0; } // set of functions from Kokkos ToolProgrammingInterface (includes fence) @@ -48,10 +54,16 @@ uint32_t getDeviceID(uint32_t devid_in) { void invoke_ktools_fence(uint32_t devID) { if (tpi_funcs.fence != nullptr) { + if (tool_verbosity > 1) { + printf( + "KokkosP: Sampler attempting to invoke" + " tool-induced fence on device %d.\n", + getDeviceID(devID)); + } tpi_funcs.fence(devID); if (tool_verbosity > 1) { printf( - "KokkosP: Sampler utility sucessfully invoked " + "KokkosP: Sampler sucessfully invoked" " tool-induced fence on device %d\n", getDeviceID(devID)); } @@ -78,6 +90,8 @@ void kokkosp_init_library(const int loadSeq, const uint64_t 
interfaceVer, const uint32_t devInfoCount, void* deviceInfo) { const char* tool_verbose_str = getenv("KOKKOS_TOOLS_SAMPLER_VERBOSE"); const char* tool_globFence_str = getenv("KOKKOS_TOOLS_GLOBALFENCES"); + const char* tool_seed_str = getenv("KOKKOS_TOOLS_SEED"); + if (NULL != tool_verbose_str) { tool_verbosity = atoi(tool_verbose_str); } else { @@ -88,6 +102,11 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, } else { tool_globFence = 0; } + if (NULL != tool_seed_str) { + tool_seed = atoi(tool_seed_str); + } else { + tool_seed = 1; + } char* profileLibrary = getenv("KOKKOS_TOOLS_LIBS"); if (NULL == profileLibrary) { @@ -134,6 +153,8 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, (beginFunction)dlsym(childLibrary, "kokkosp_begin_parallel_scan"); beginReduceCallee = (beginFunction)dlsym(childLibrary, "kokkosp_begin_parallel_reduce"); + beginDeepCopyCallee = + (beginFunction)dlsym(childLibrary, "kokkosp_begin_deep_copy"); endScanCallee = (endFunction)dlsym(childLibrary, "kokkosp_end_parallel_scan"); @@ -141,6 +162,8 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, (endFunction)dlsym(childLibrary, "kokkosp_end_parallel_for"); endReduceCallee = (endFunction)dlsym(childLibrary, "kokkosp_end_parallel_reduce"); + endDeepCopyCallee = + (endFunction)dlsym(childLibrary, "kokkosp_end_deep_copy"); initProfileLibrary = (initFunction)dlsym(childLibrary, "kokkosp_init_library"); @@ -182,6 +205,93 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, if (tool_verbosity > 0) { printf("KokkosP: Sampling rate set to: %s\n", tool_sample); } + const char* tool_probability = getenv("KOKKOS_TOOLS_SAMPLER_PROB"); + + if (NULL != tool_probability) { + // read sampling probability as a float between 0 and 100, representing + // a percentage that data should be gathered. + // Connector reasons about probability as a double between 0.0 and 1.0. 
+ tool_prob_num = atof(tool_probability); + if (tool_prob_num > 100.0) { + printf( + "KokkosP: The sampling probability value is set to be greater than " + "100.0. " + "The probability for the sampler will be set to 100 percent; all of " + "the " + "invocations of a Kokkos kernel will be profiled.\n"); + tool_prob_num = 100.0; + } else if (tool_prob_num < 0.0) { + printf( + "KokkosP: The sampling probability value is set to be a negative " + "number. The " + "sampler's probability will be set to 0 percent; none of the " + "invocations of " + "a Kokkos kernel will be profiled.\n"); + tool_prob_num = 0.0; + } + } + if ((tool_prob_num < 0.0) && + (kernelSampleSkip == std::numeric_limits::max())) { + if (tool_verbosity > 0) { + printf( + "KokkosP: Neither the probability " + "nor the skip rate for sampling were set...\n"); + } + tool_prob_num = 10.0; + if (tool_verbosity > 0) { + printf( + "KokkosP: The probability " + "for the sampler is set to the default of %f percent. The skip rate " + "for sampler" + "will not be used.\n", + tool_prob_num); + } + } + + if (tool_verbosity > 0) { + if (tool_verbosity > 1) { + printf("KokkosP: Sampling skip rate provided as input is: %s\n", + tool_sample); + printf("KokkosP: Sampling probability provided as input is: %s\n", + tool_probability); + } + printf("KokkosP: Sampling skip rate is set to: %llu\n", + (unsigned long long)(kernelSampleSkip)); + printf("KokkosP: Sampling probability is set to %f\n", tool_prob_num); + } + + if (0 > tool_seed) { + srand(time(NULL)); + if (tool_verbosity > 0) { + printf( + "KokkosP: Seeding random number generator using clock for " + "random sampling.\n"); + } + } else { + srand(tool_seed); + if (tool_verbosity > 0) { + printf( + "KokkosP: Seeding random number generator using seed %u for " + "random sampling.\n", + tool_seed); + } + } + + if ((NULL != tool_probability) && (NULL != tool_sample)) { + printf( + "KokkosP: You set both the probability and skip rate for the sampler. 
" + "Only random sampling " + "will be done, using the probabability you set; " + "The skip rate you set will be ignored.\n"); + + if (tool_verbosity > 1) { + printf( + "KokkosP: Note: The skip rate will be set to 1. Sampling will not be " + "based " + " on a pre-defined periodicity.\n"); + } + kernelSampleSkip = 1; + } } void kokkosp_finalize_library() { @@ -194,17 +304,23 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, static uint64_t invocationNum = 0; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { - if (tool_verbosity > 0) { - printf("KokkosP: sample %llu calling child-begin function...\n", - (unsigned long long)(*kID)); - } - if (tool_globFence) { - invoke_ktools_fence(0); - } - if (NULL != beginForCallee) { - uint64_t nestedkID = 0; - (*beginForCallee)(name, devID, &nestedkID); - infokIDSample.insert({*kID, nestedkID}); + if ((rand() / (1.0 * RAND_MAX)) < (tool_prob_num / 100.0)) { + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu calling child-begin function...\n", + (unsigned long long)(*kID)); + } + if (NULL != beginForCallee) { + if (tool_globFence) { + invoke_ktools_fence(0); + } + uint64_t nestedkID = 0; + (*beginForCallee)(name, devID, &nestedkID); + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu finished with child-begin function.\n", + (unsigned long long)(*kID)); + } + infokIDSample.insert({*kID, nestedkID}); + } } } } @@ -232,17 +348,23 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, static uint64_t invocationNum = 0; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { - if (tool_verbosity > 0) { - printf("KokkosP: sample %llu calling child-begin function...\n", - (unsigned long long)(*kID)); - } - if (NULL != beginScanCallee) { - uint64_t nestedkID = 0; - if (tool_globFence) { - invoke_ktools_fence(0); + if ((rand() / (1.0 * RAND_MAX)) < (tool_prob_num / 100.0)) { + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu calling child-begin 
function...\n", + (unsigned long long)(*kID)); + } + if (NULL != beginScanCallee) { + uint64_t nestedkID = 0; + if (tool_globFence) { + invoke_ktools_fence(0); + } + (*beginScanCallee)(name, devID, &nestedkID); + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu finished with child-begin function.\n", + (unsigned long long)(*kID)); + } + infokIDSample.insert({*kID, nestedkID}); } - (*beginScanCallee)(name, devID, &nestedkID); - infokIDSample.insert({*kID, nestedkID}); } } } @@ -270,17 +392,23 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, static uint64_t invocationNum = 0; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { - if (tool_verbosity > 0) { - printf("KokkosP: sample %llu calling child-begin function...\n", - (unsigned long long)(*kID)); - } - if (NULL != beginReduceCallee) { - uint64_t nestedkID = 0; - if (tool_globFence) { - invoke_ktools_fence(0); + if ((rand() / (1.0 * RAND_MAX)) < (tool_prob_num / 100.0)) { + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu calling child-begin function...\n", + (unsigned long long)(*kID)); + } + if (NULL != beginReduceCallee) { + uint64_t nestedkID = 0; + if (tool_globFence) { + invoke_ktools_fence(0); + } + (*beginReduceCallee)(name, devID, &nestedkID); + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu finished with child-begin function.\n", + (unsigned long long)(*kID)); + } + infokIDSample.insert({*kID, nestedkID}); } - (*beginReduceCallee)(name, devID, &nestedkID); - infokIDSample.insert({*kID, nestedkID}); } } } @@ -296,12 +424,61 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { if (tool_globFence) { invoke_ktools_fence(0); } + (*endScanCallee)(retrievedNestedkID); infokIDSample.erase(kID); } } } + +void kokkosp_begin_deep_copy(const char* name, const uint32_t devID, + uint64_t* kID) { + *kID = uniqID++; + static uint64_t invocationNum = 0; + ++invocationNum; + if ((invocationNum % kernelSampleSkip) == 0) { + if ((rand() / (1.0 * 
RAND_MAX)) < (tool_prob_num / 100.0)) { + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu calling child-begin function...\n", + (unsigned long long)(*kID)); + } + if (NULL != beginDeepCopyCallee) { + uint64_t nestedkID = 0; + if (tool_globFence) { + invoke_ktools_fence(0); + } + (*beginDeepCopyCallee)(name, devID, &nestedkID); + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu finished with child-begin function.\n", + (unsigned long long)(*kID)); + } + infokIDSample.insert({*kID, nestedkID}); + } + } + } +} + +void kokkosp_end_deep_copy(const uint64_t kID) { + if (NULL != endDeepCopyCallee) { + if (!(infokIDSample.find(kID) == infokIDSample.end())) { + uint64_t retrievedNestedkID = infokIDSample[kID]; + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu calling child-end function...\n", + (unsigned long long)(kID)); + } + if (tool_globFence) { + invoke_ktools_fence(0); + } + + (*endDeepCopyCallee)(retrievedNestedkID); + infokIDSample.erase(kID); + } + } +} + + + } // namespace Sampler } // end namespace KokkosTools @@ -319,5 +496,8 @@ EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan) EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan) EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce) EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce) +EXPOSE_BEGIN_DEEP_COPY(impl::kokkosp_begin_deep_copy) +EXPOSE_END_DEEP_COPY(impl::kokkosp_end_deep_copy) + } // end extern "C"