[TRTLLM] Expose finish reason #2841

Open · wants to merge 2 commits into main

Changes from all commits
backends/trtllm/CMakeLists.txt (36 additions, 25 deletions)
@@ -1,11 +1,19 @@
cmake_minimum_required(VERSION 3.20)

-if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
+if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
find_program(CCACHE_EXECUTABLE "ccache")
if (CCACHE_EXECUTABLE)
message(STATUS "Using ccache")
set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE)
set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
endif ()
+else ()
+find_program(CCACHE_EXECUTABLE ${CMAKE_CXX_COMPILER_LAUNCHER})
+message(STATUS "Using user specified cmake cxx compiler launcher: ${CMAKE_CXX_COMPILER_LAUNCHER}")
+set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
+set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
+set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
endif ()

if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
@@ -21,28 +29,37 @@ include(CheckCXXCompilerFlag)

option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
+option(TGI_TRTLLM_BACKEND_BUILD_USE_LLD "Enable lld linker instead of ld" OFF)
set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located")
set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located")
set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")

# We use nvidia-ml to query device information at runtime, enabling some architecture-specific features
find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
find_package(MPI REQUIRED)

#### External dependencies ####
include(cmake/json.cmake)
include(cmake/spdlog.cmake)
include(cmake/trtllm.cmake)

if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(TGI_TRTLLM_BACKEND_DEBUG ON)
add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
endif()
endif ()

+if (${TGI_TRTLLM_BACKEND_BUILD_USE_LLD})
+message(STATUS "Using lld linker")
+add_link_options("-fuse-ld=lld")
+endif ()

# This attempts to detect whether the compiler can emit a warning when it cannot apply named return value optimization (NRVO) to a function
check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
-if(${COMPILER_SUPPORT_WARNING_ON_NVRO})
-set(CMAKE_CXX_FLAGS "{CMAKE_CXX_FLAGS} -Wnvro")
-endif()
+if (${COMPILER_SUPPORT_WARNING_ON_NVRO})
+message(STATUS "Enabling non-NVRO detection")
+target_compile_options(tgi_trtllm_backend_impl "-Werror -Wnvro")
+endif ()

# Let's build TRTLLM as part of CMake
add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
@@ -55,21 +72,20 @@ add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp cs
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
target_include_directories(tgi_trtllm_backend_impl PRIVATE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc>
# $<INSTALL_INTERFACE:csrc>
# $<INSTALL_INTERFACE:csrc>
)
target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog)

if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
else()
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm)
endif ()
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)

# This installs all the artifacts under CMAKE_INSTALL_PREFIX (include/, lib/, bin/) so they are easy to find and link against
install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
-install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
+install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB)
+if (NOT ${TGI_TRTLLM_BACKEND_DEBUG})
+install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
+endif ()


#### Unit Tests ####
if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
@@ -85,18 +101,13 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/")
target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl)
+target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)

if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
else()
target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm)
endif ()

if(CMAKE_BUILD_TYPE MATCHES "Debug")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
if (CMAKE_BUILD_TYPE MATCHES "Debug")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fsanitize=undefined")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fsanitize=undefined")
target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address)
-endif()
+endif ()

list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
include(CTest)
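A note on the NRVO check added earlier in this file (the flag is spelled both -Wnrvo and -Wnvro across the hunk; GCC 14's warning is -Wnrvo): it fires when a function returns a named local by value but the compiler cannot apply named return value optimization. A small illustrative sketch, not code from this PR:

#include <string>

// Two different named locals may be returned depending on the branch, so the
// result cannot be constructed directly in the caller's storage; with -Wnrvo,
// GCC 14+ warns that NRVO was not applied here.
std::string pick_greeting(bool formal) {
    std::string long_form = "Good evening";
    std::string short_form = "Hi";
    if (formal) {
        return long_form;
    }
    return short_form;
}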
backends/trtllm/build.rs (79 additions, 16 deletions)
@@ -15,9 +15,8 @@ const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR");
// Dependencies
const BACKEND_DEPS: [&str; 2] = ["tgi_trtllm_backend_impl", "tgi_trtllm_backend"];
const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"];
-const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
+const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 4] = [
("dylib", "tensorrt_llm"),
("static", "tensorrt_llm_executor_static"),
("dylib", "tensorrt_llm_nvrtc_wrapper"),
("dylib", "nvinfer_plugin_tensorrt_llm"),
("dylib", "decoder_attention"),
@@ -32,6 +31,58 @@ macro_rules! probe {
};
}

+fn get_compiler_flag(
+switch: bool,
+true_case: &'static str,
+false_case: &'static str,
+) -> &'static str {
+match switch {
+true => true_case,
+false => false_case,
+}
+}
+
+#[cfg(target_arch = "x86_64")]
+fn get_system_install_path(install_path: &PathBuf) -> PathBuf {
+install_path.join("lib64")
+}
+
+#[cfg(not(target_arch = "x86_64"))]
+fn get_system_install_path(install_path: &PathBuf) -> PathBuf {
+install_path.join("lib")
+}
+
+fn get_library_architecture() -> &'static str {
+let os = env::var("CARGO_CFG_TARGET_OS").unwrap();
+let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap();
+let env = env::var("CARGO_CFG_TARGET_ENV").unwrap();
+
+match os.as_str() {
+"linux" => {
+if env != "gnu" {
+panic!("unsupported linux ABI {env}, only 'gnu' is supported")
+}
+
+match arch.as_str() {
+"x86_64" => "x86_64-linux-gnu",
+"aarch64" => "aarch64-linux-gnu",
+_ => panic!("unsupported linux architecture {arch}"),
+}
+}
+"windows" => {
+if env != "msvc" {
+panic!("unsupported windows ABI {env}, only 'msvc' is supported")
+}
+
+match arch.as_str() {
+"x86_64" => "x86_64-windows-msvc",
+_ => panic!("unsupported windows architecture {arch}"),
+}
+}
+_ => panic!("unsupported OS {os}"),
+}
+}

fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf, PathBuf) {
// Build the backend implementation through CMake
let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi");
@@ -44,7 +95,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
}

let mut config = cmake::Config::new(".");
-config.uses_cxx11()
+config
+.uses_cxx11()
.generator("Ninja")
.profile(match is_debug {
true => "Debug",
@@ -53,16 +105,28 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
.env("OPT_LEVEL", opt_level)
.define("CMAKE_INSTALL_PREFIX", &install_path)
.define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
.define("Python3_ROOT_DIR", "../venv")
.define("CMAKE_LIBRARY_ARCHITECTURE", get_library_architecture())
.define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
+.define(
+"TGI_TRTLLM_BACKEND_DEBUG",
+get_compiler_flag(is_debug, "ON", "OFF"),
+)
.define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path);

+// Allow to override which Python to use ...
+if let Some(python3) = option_env!("Python3_EXECUTABLE") {
+config.define("Python3_EXECUTABLE", python3);
+}
+if let Some(nvcc_host_compiler) = option_env!("CMAKE_CUDA_HOST_COMPILER") {
+config.define("CMAKE_CUDA_HOST_COMPILER", nvcc_host_compiler);
+}
+
+if let Some(cxx_compiler_launcher) = option_env!("CMAKE_CXX_COMPILER_LAUNCHER") {
+config.define("CMAKE_CXX_COMPILER_LAUNCHER", cxx_compiler_launcher);
+}

+config.build();
-// Allow to override which Python to use ...
-if let Some(python3) = option_env!("Python3_EXECUTABLE") {
-config.define("Python3_EXECUTABLE", python3);
-}
-
-config.build();

// Additional transitive CMake dependencies
let deps_folder = out_dir.join("build").join("_deps");
@@ -77,7 +141,8 @@
}

// Emit linkage information from the artifacts we just built
let install_lib_path = install_path.join("lib");

let install_lib_path = get_system_install_path(&install_path);

println!(
r"cargo:warning=Adding link search path: {}",
@@ -89,11 +154,6 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
}

fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
-let ndebug = match is_debug {
-true => "1",
-false => "0",
-};

CFG.include_prefix = "backends/trtllm";
cxx_build::bridge("src/lib.rs")
.static_flag(true)
@@ -105,7 +165,10 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
.include("/usr/local/tensorrt/include")
.include("csrc/")
.file("csrc/ffi.hpp")
.define("TGI_TRTLLM_BACKEND_DEBUG", ndebug)
.define(
"TGI_TRTLLM_BACKEND_DEBUG",
get_compiler_flag(is_debug, "ON", "OFF"),
)
.compile("tgi_trtllm_backend");

println!("cargo:rerun-if-changed=CMakeLists.txt");
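One detail of the build.rs change worth spelling out: the FFI layer's TGI_TRTLLM_BACKEND_DEBUG define now carries the string value ON or OFF (from get_compiler_flag) rather than the previous numeric 1/0, so a plain #if test of the macro no longer applies. A hedged C++ sketch of one way such a value-style define can be inspected — an assumed illustration, not the project's actual csrc/ffi.hpp:

#include <string_view>

// Fall back to OFF when the build system did not define the flag at all.
#ifndef TGI_TRTLLM_BACKEND_DEBUG
#define TGI_TRTLLM_BACKEND_DEBUG OFF
#endif

// Stringify the macro's expansion ("ON" or "OFF") and compare at compile time.
#define TGI_STRINGIFY_(x) #x
#define TGI_STRINGIFY(x) TGI_STRINGIFY_(x)

constexpr bool kDebugBuild =
    std::string_view(TGI_STRINGIFY(TGI_TRTLLM_BACKEND_DEBUG)) == "ON";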
backends/trtllm/cmake/spdlog.cmake (4 additions, 4 deletions)
@@ -4,14 +4,14 @@ set(SPDLOG_FMT_EXTERNAL OFF)

# Define the compile-time level at which SPDLOG_* logging macros are enabled
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
-add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
+add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE)
else ()
-add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO)
+add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
Review comment (Member), on the SPDLOG_LEVEL_DEBUG line above: Shouldn't we stick to INFO rather than DEBUG? Or move the SPDLOG_LEVEL_DEBUG() to SPDLOG_LEVEL_INFO?

endif ()

fetchcontent_declare(
spdlog
-# DOWNLOAD_EXTRACT_TIMESTAMP
-URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
+# DOWNLOAD_EXTRACT_TIMESTAMP
+URL https://github.com/gabime/spdlog/archive/refs/tags/v1.15.0.tar.gz
)
fetchcontent_makeavailable(spdlog)
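For context on the review question above: SPDLOG_ACTIVE_LEVEL is a compile-time threshold, and SPDLOG_TRACE/SPDLOG_DEBUG/... call sites below it compile to nothing, while spdlog::set_level() still filters what is printed at runtime. A small illustrative sketch, not code from this PR:

// SPDLOG_ACTIVE_LEVEL must be defined before the first spdlog include.
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#include <spdlog/spdlog.h>

int main() {
    spdlog::set_level(spdlog::level::info); // runtime threshold

    SPDLOG_TRACE("below the compile-time level: compiled out entirely");
    SPDLOG_DEBUG("compiled in, but filtered out at runtime by set_level(info)");
    SPDLOG_INFO("compiled in and printed");
    return 0;
}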
backends/trtllm/cmake/trtllm.cmake (4 additions, 2 deletions)
@@ -14,11 +14,13 @@ message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
set(ENABLE_UCX OFF)
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
set(FAST_BUILD ON)
-set(NVTX_DISABLE OFF)
+set(NVTX_DISABLE ON)
+set(INDEX_RANGE_CHECK ON)
else ()
set(FAST_BUILD OFF)
set(FAST_MATH ON)
-set(NVTX_DISABLE ON)
+set(NVTX_DISABLE OFF)
+set(INDEX_RANGE_CHECK OFF)
endif ()

find_package(Python3 REQUIRED Interpreter)
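The NVTX_DISABLE flip above disables NVIDIA Tools Extension instrumentation in Debug builds and keeps it in Release builds, reversing the previous settings. NVTX ranges are what show up as named spans in profilers such as Nsight Systems; a minimal illustrative sketch, not TRT-LLM code:

#include <nvtx3/nvToolsExt.h>

// Mark a region of work so it appears as a named range in the profiler timeline.
void decode_step() {
    nvtxRangePushA("decode_step");
    // ... kernel launches / token-generation work ...
    nvtxRangePop();
}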
backends/trtllm/csrc/backend.cpp (10 additions, 9 deletions)
@@ -1,7 +1,6 @@
#include <ranges>

#include <nlohmann/json.hpp>
-#include <spdlog/spdlog.h>

#include "backend.hpp"
#include "hardware.hpp"
@@ -17,7 +16,8 @@ namespace huggingface::tgi::backends::trtllm {
if (world_size > 1) {
SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
mode = tle::CommunicationMode::kORCHESTRATOR;
-orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr, true);
+orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr,
+true);
} else {
SPDLOG_INFO("Detected single engine deployment, using leader mode");
}
@@ -44,21 +44,22 @@
}

backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
-: workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}
+: workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}

size_t backend_t::num_tokens_ready() const noexcept {
return executor_.getNumResponsesReady();
}

std::expected<request_id_t, backend_error_t>
-backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
-SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
-return executor_.enqueueRequest(tle::Request {
+backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t g_params,
+const sampling_params_t s_params) noexcept {
+SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params);
+return executor_.enqueueRequest(tle::Request{
{token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
-static_cast<tle::SizeType32>(generation_params.max_new_tokens),
+static_cast<tle::SizeType32>(g_params.max_new_tokens),
true,
-(tle::SamplingConfig) sampling_params,
-tle::OutputConfig { /* returnLogProbs= */ true },
+(tle::SamplingConfig) s_params,
+tle::OutputConfig{ /* returnLogProbs= */ true},
std::nullopt,
std::nullopt,
std::nullopt,
Expand Down
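Finally, on the PR title itself: the hunks shown above do not include the code that surfaces the finish reason, but TensorRT-LLM's executor reports one per completed sequence (tensorrt_llm::executor::FinishReason). A hedged sketch of the kind of mapping the title implies — the TGI-side names here are illustrative, not taken from this diff:

#include <stdexcept>
#include <tensorrt_llm/executor/types.h>

namespace tle = tensorrt_llm::executor;

// Illustrative TGI-side enum; the PR's actual naming may differ.
enum class finish_reason_t { kEND_TOKEN, kSTOP_WORDS, kMAX_LENGTH };

finish_reason_t as_finish_reason(const tle::FinishReason reason) {
    switch (reason) {
        case tle::FinishReason::kEND_ID:     // hit the end-of-sequence token
            return finish_reason_t::kEND_TOKEN;
        case tle::FinishReason::kSTOP_WORDS: // matched a stop sequence
            return finish_reason_t::kSTOP_WORDS;
        case tle::FinishReason::kLENGTH:     // reached the requested max length
            return finish_reason_t::kMAX_LENGTH;
        default:
            throw std::runtime_error("cannot map executor finish reason");
    }
}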