diff --git a/scripts/superbuild/CMakeLists.txt b/scripts/superbuild/CMakeLists.txt index 839391a9db7..a4506ca2d23 100644 --- a/scripts/superbuild/CMakeLists.txt +++ b/scripts/superbuild/CMakeLists.txt @@ -69,6 +69,7 @@ set(LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY "COMMON" # descending into subdirectories. lbann_sb_add_packages( # Ack, a "third-order" dependency + NCCL RCCL # These are "second-order" dependencies diff --git a/scripts/superbuild/README.md b/scripts/superbuild/README.md index 88ff9214e26..4bb048e2a73 100644 --- a/scripts/superbuild/README.md +++ b/scripts/superbuild/README.md @@ -84,6 +84,8 @@ The following packages are known by the SuperBuild framework: algebra library. - [JPEG-TURBO](https://github.com/libjpeg-turbo/libjpeg-turbo) - JPEG but in turbo mode. Zoom zoom zoom. +- [NCCL](https://github.com/NVIDIA/nccl) - The NVIDIA Collective + Communications Library. - [OpenBLAS](https://github.com/xianyi/OpenBLAS.git) - BLAS library for when your vendor doesn't do a good job. - [OpenCV](https://github.com/opencv/opencv) - Computer vision @@ -91,6 +93,8 @@ The following packages are known by the SuperBuild framework: - [protobuf](https://github.com/protocolbuffers/protobuf.git) - And yet *another* serialization format that LBANN (and others) (ab)use for model topology description and configuration. +- [RCCL](https://github.com/ROCm/rccl) - The ROCm Communication + Collectives Library. - [spdlog](https://github.com/gabime/spdlog) - Fast C++ logging library. - [zstr](https://github.com/mateidavid/zstr) - C++ ZLib wrapper. 
diff --git a/scripts/superbuild/aluminum/CMakeLists.txt b/scripts/superbuild/aluminum/CMakeLists.txt
index c04e78c0100..a8ec264703e 100644
--- a/scripts/superbuild/aluminum/CMakeLists.txt
+++ b/scripts/superbuild/aluminum/CMakeLists.txt
@@ -61,7 +61,7 @@ lbann_sb_add_cmake_extern_pkg(
   OPTIONAL_LANGUAGES CUDA HIP
   GITHUB_URL llnl/Aluminum.git
   GIT_TAG "master"
-  DEPENDS_ON Caliper RCCL)
+  DEPENDS_ON Caliper NCCL RCCL)
 
 set(Aluminum_DIR ${LBANN_SB_Aluminum_PREFIX}
   CACHE INTERNAL "The install prefix of Aluminum.")
diff --git a/scripts/superbuild/cmake/modules/LBANNSuperBuildAddPackages.cmake b/scripts/superbuild/cmake/modules/LBANNSuperBuildAddPackages.cmake
index e57c3ee9690..40437f79da2 100644
--- a/scripts/superbuild/cmake/modules/LBANNSuperBuildAddPackages.cmake
+++ b/scripts/superbuild/cmake/modules/LBANNSuperBuildAddPackages.cmake
@@ -27,14 +27,28 @@ include(CMakeDependentOption)
 
 macro(lbann_sb_default_pkg_option PKG_NAME OPTION_NAME DOC_STR VALUE)
   option(LBANN_SB_FWD_${PKG_NAME}_${OPTION_NAME}
-    "${DOC_STR}"
+    "${PKG_NAME}: ${DOC_STR}"
     ${VALUE})
 endmacro ()
 
+# Like lbann_sb_default_pkg_option, but for the package currently being
+# configured: PKG_NAME is read from the calling scope instead of being
+# passed as an argument. Calling this while PKG_NAME is unset or empty
+# is a usage error and stops configuration.
+macro(lbann_sb_this_pkg_option OPTNAME DOCSTR DEFVAL)
+  if (NOT PKG_NAME)
+    message(FATAL_ERROR
+      "lbann_sb_this_pkg_option(${OPTNAME}): PKG_NAME must be set first.")
+  endif ()
+  option(LBANN_SB_FWD_${PKG_NAME}_${OPTNAME}
+    "${PKG_NAME}: ${DOCSTR}"
+    ${DEFVAL})
+endmacro ()
+
 macro(lbann_sb_default_cuda_option PKG_NAME OPTION_NAME DOC_STR VALUE)
   cmake_dependent_option(
     LBANN_SB_FWD_${PKG_NAME}_${OPTION_NAME}
-    "${DOC_STR}"
+    "${PKG_NAME}: ${DOC_STR}"
     ${VALUE}
     "LBANN_SB_DEFAULT_CUDA_OPTS"
     OFF)
@@ -43,7 +57,7 @@ endmacro ()
 macro(lbann_sb_default_rocm_option PKG_NAME OPTION_NAME DOC_STR VALUE)
   cmake_dependent_option(
     LBANN_SB_FWD_${PKG_NAME}_${OPTION_NAME}
-    "${DOC_STR}"
+    "${PKG_NAME}: ${DOC_STR}"
     ${VALUE}
     "LBANN_SB_DEFAULT_ROCM_OPTS"
     OFF)
@@ -52,7 +66,7 @@ endmacro ()
 macro(lbann_sb_default_gpu_option PKG_NAME OPTION_NAME DOC_STR VALUE)
   cmake_dependent_option(
     LBANN_SB_FWD_${PKG_NAME}_${OPTION_NAME}
-    "${DOC_STR}"
+    "${PKG_NAME}: ${DOC_STR}"
     ${VALUE}
     "LBANN_SB_DEFAULT_CUDA_OPTS OR LBANN_SB_DEFAULT_ROCM_OPTS"
     OFF)
diff --git a/scripts/superbuild/nccl/CMakeLists.txt b/scripts/superbuild/nccl/CMakeLists.txt
new file mode 100644
index 00000000000..3ce8bb409da
--- /dev/null
+++ b/scripts/superbuild/nccl/CMakeLists.txt
@@ -0,0 +1,182 @@
+################################################################################
+## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
+## Produced at the Lawrence Livermore National Laboratory.
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file.
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit. For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License. You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+
+# The goal of this package is to enable *basic* builds of NCCL. Any
+# configuration that would require modification to the provided
+# makefile is considered out-of-scope and users needing such
+# configuration should consider a standalone build rather than
+# superbuilding NCCL.
+#
+# Note that the default NCCL makefile is rather rigid. It looks like
+# one _can_ inject flags, but they have a high likelihood of being
+# trampled by the makefile. E.g., the makefile specifies optimization
+# flags *after* the user injection point, so trying to modify the
+# optimization level manually would be moot.
+
+# Interprets the given variable as a boolean value and converts it to
+# 1 (true) or 0 (false). The result is written to a regular
+# (non-cache) variable named ${VAR} in the caller's scope.
+macro(bool_as_num VAR)
+  if (${VAR})
+    set(${VAR} 1)
+  else ()
+    set(${VAR} 0)
+  endif ()
+endmacro ()
+
+lbann_sb_init_extern_pkg(
+  NAME NCCL
+  LANGUAGES C CXX # CUDA <- can't set explicitly; inferred from ${CUDA_HOME}
+  GITHUB_URL NVIDIA/nccl
+  GIT_TAG "master")
+
+# User-facing options
+lbann_sb_this_pkg_option(
+  VERBOSE
+  "Print build commands?"
+  ON)
+bool_as_num(LBANN_SB_FWD_NCCL_VERBOSE)
+
+lbann_sb_this_pkg_option(
+  KEEP
+  "Keep intermediate files generated during compilation"
+  OFF)
+bool_as_num(LBANN_SB_FWD_NCCL_KEEP)
+
+lbann_sb_this_pkg_option(
+  ASAN
+  "Build with address sanitizer enabled"
+  OFF)
+bool_as_num(LBANN_SB_FWD_NCCL_ASAN)
+
+lbann_sb_this_pkg_option(
+  TRACE
+  "Build with tracing enabled"
+  OFF)
+bool_as_num(LBANN_SB_FWD_NCCL_TRACE)
+
+# Debug build?
+string(TOLOWER "${LBANN_SB_${PKG_NAME}_BUILD_TYPE}" _nccl_build_type)
+if (_nccl_build_type STREQUAL "debug")
+  set(_nccl_debug 1)
+else ()
+  set(_nccl_debug 0)
+endif ()
+
+# Prefer a user-specified CUDA path, then check CUDA_HOME
+if (LBANN_SB_FWD_NCCL_CUDA_PATH)
+  set(_nccl_cuda_path_opt
+    "CUDA_HOME=${LBANN_SB_FWD_NCCL_CUDA_PATH}")
+elseif (DEFINED ENV{CUDA_HOME})
+  set(_nccl_cuda_path_opt
+    "CUDA_HOME=$ENV{CUDA_HOME}")
+else ()
+  message(WARNING
+    "You have enabled NCCL package, but CUDA_HOME "
+    "is not available in your environment.")
+endif ()
+
+# Gencode control. As with CUDA_HOME, an explicit CMake value wins
+# over the environment. Note that "DEFINED ENV{...}" takes the bare
+# variable name, not a "$ENV{...}" expansion.
+if (LBANN_SB_FWD_NCCL_NVCC_GENCODE)
+  set(_nccl_nvcc_gencode_opt
+    "NVCC_GENCODE=${LBANN_SB_FWD_NCCL_NVCC_GENCODE}")
+elseif (DEFINED ENV{NVCC_GENCODE})
+  set(_nccl_nvcc_gencode_opt
+    "NVCC_GENCODE=$ENV{NVCC_GENCODE}")
+else ()
+  message(WARNING
+    "You have enabled NCCL package, but you have not set "
+    "the NVCC_GENCODE. This will build all gencodes supported "
+    "by NCCL, which may increase the build time.")
+endif ()
+
+# The build system here is just a set of makefiles.
+find_program(GNU_MAKE_PROGRAM make)
+if (NOT GNU_MAKE_PROGRAM)
+  message(FATAL_ERROR
+    "Building NCCL requires make, but it was not found. "
+    "Set GNU_MAKE_PROGRAM to the path of a make executable.")
+endif ()
+
+include (ExternalProject)
+ExternalProject_Add(${PKG_NAME}
+  PREFIX "${CMAKE_CURRENT_BINARY_DIR}"
+  ${LBANN_SB_GIT_REPOSITORY_TAG} ${LBANN_SB_${PKG_NAME}_URL}
+  ${LBANN_SB_GIT_TAG_TAG} ${LBANN_SB_${PKG_NAME}_TAG}
+  TMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/tmp"
+  STAMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/stamp"
+
+  SOURCE_DIR "${LBANN_SB_${PKG_NAME}_SOURCE_DIR}"
+  INSTALL_DIR "${LBANN_SB_${PKG_NAME}_PREFIX}"
+
+  GIT_SHALLOW 1
+
+  BUILD_IN_SOURCE 1
+  USES_TERMINAL_BUILD 1
+  LOG_DOWNLOAD 1
+  LOG_UPDATE 1
+  LOG_CONFIGURE 1
+  LOG_BUILD 1
+  LOG_INSTALL 1
+  LOG_TEST 1
+
+  CONFIGURE_COMMAND ""
+
+  BUILD_COMMAND
+  ${GNU_MAKE_PROGRAM}
+  src.build
+  "PREFIX=${LBANN_SB_${PKG_NAME}_PREFIX}"
+  "CC=${LBANN_SB_${PKG_NAME}_C_COMPILER}"
+  "CXX=${LBANN_SB_${PKG_NAME}_CXX_COMPILER}"
+  ${_nccl_cuda_path_opt}
+  ${_nccl_nvcc_gencode_opt}
+  "DEBUG=${_nccl_debug}"
+  "VERBOSE=${LBANN_SB_FWD_NCCL_VERBOSE}"
+  "KEEP=${LBANN_SB_FWD_NCCL_KEEP}"
+  "ASAN=${LBANN_SB_FWD_NCCL_ASAN}"
+  "TRACE=${LBANN_SB_FWD_NCCL_TRACE}"
+  -j${${PKG_NAME}_MAX_MAKE_JOBS}
+
+  INSTALL_COMMAND
+  ${GNU_MAKE_PROGRAM}
+  src.install
+  "PREFIX=${LBANN_SB_${PKG_NAME}_PREFIX}"
+  "CC=${LBANN_SB_${PKG_NAME}_C_COMPILER}"
+  "CXX=${LBANN_SB_${PKG_NAME}_CXX_COMPILER}"
+  ${_nccl_cuda_path_opt}
+  ${_nccl_nvcc_gencode_opt}
+  "DEBUG=${_nccl_debug}"
+  "VERBOSE=${LBANN_SB_FWD_NCCL_VERBOSE}"
+  "KEEP=${LBANN_SB_FWD_NCCL_KEEP}"
+  "ASAN=${LBANN_SB_FWD_NCCL_ASAN}"
+  "TRACE=${LBANN_SB_FWD_NCCL_TRACE}"
+  -j${${PKG_NAME}_MAX_MAKE_JOBS}
+)
+
+set(${PKG_NAME}_DIR ${LBANN_SB_${PKG_NAME}_PREFIX}
+  CACHE INTERNAL "The install prefix of ${PKG_NAME}.")