TensorRT-LLM backend bump to latest version + misc fixes (#2791)

* misc(cmake) update dependencies * feat(hardware) enable new hardware.hpp and unittests * test(ctest) enable address sanitizer * feat(backend): initial rewrite of the backend for simplicity * feat(backend): remove all the logs from hardware.hpp * feat(backend): added some logging * feat(backend): enable compiler warning if support for RVO not applying * feat(backend): missing return statement * feat(backend): introduce backend_workspace_t to store precomputed information from the engine folder * feat(backend): delete previous backend impl * feat(backend): more impl * feat(backend): use latest trtllm main version to have g++ >= 13 compatibility * feat(backend): allow overriding which Python to use * feat(backend): fix backend_exception_t -> backend_error_t naming * feat(backend): impl missing generation_step_t as return value of pull_tokens * feat(backend): make backend_workspace_t::engines_folder constexpr * feat(backend): fix main.rs retrieving the tokenizer * feat(backend): add guard to multiple header definitions * test(backend): add more unittest * feat(backend): remove constexpr from par * feat(backend): remove constexpig * test(backend): more test coverage * chore(trtllm): update dependency towards 0.15.0 * effectively cancel the request on the executor * feat(backend) fix moving backend when pulling * feat(backend): make sure we can easily cancel request on the executor * feat(backend): fix missing "0" field access * misc(backend): fix reborrowing Pin<&mut T> as described in the doc https://doc.rust-lang.org/stable/std/pin/struct.Pin.html#method.as_mut * chore: Add doc and CI for TRTLLM (#2799) * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * doc: Formatting * misc(backend): indent --------- Co-authored-by: Hugo Larcher <[email protected]>
huggingface · Dec 13, 2024 · ea7f408 · ea7f408
1 parent 3bb3fd1
commit ea7f408
Show file tree

Hide file tree

Showing 32 changed files with 1,196 additions and 900 deletions.
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -8,6 +8,7 @@ on:
         description: Hardware
           # options:
           # - cuda
+          # - cuda-trtllm
           # - rocm
           # - intel
         required: true
@@ -52,6 +53,15 @@ jobs:
                 export platform=""
                 export extra_pytest=""
                 ;;
+            cuda-trtllm)
+                export dockerfile="Dockerfile_trtllm"
+                export label_extension="-trtllm"
+                export docker_volume="/mnt/cache"
+                export docker_devices=""
+                export runs_on="ubuntu-latest"
+                export platform=""
+                export extra_pytest=""
+                ;;
             rocm)
                 export dockerfile="Dockerfile_amd"
                 export label_extension="-rocm"

diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml
@@ -37,7 +37,7 @@ jobs:
       # fail-fast is true by default
       fail-fast: false
       matrix:
-        hardware: ["cuda", "rocm", "intel-xpu", "intel-cpu"]
+        hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu"]
     uses: ./.github/workflows/build.yaml # calls the one above ^
     permissions:
       contents: write

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm
@@ -1,5 +1,5 @@
 ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
-ARG OMPI_VERSION="4.1.6"
+ARG OMPI_VERSION="4.1.7rc1"
 
 # Build dependencies resolver stage
 FROM lukemathwalker/cargo-chef:latest AS chef
@@ -10,26 +10,29 @@ COPY . .
 RUN cargo chef prepare --recipe-path recipe.json
 
 # CUDA dependent dependencies resolver stage
-FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder
+FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder
 
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
     apt update && apt install -y \
     build-essential \
     cmake \
     curl \
-    gcc  \
-    g++ \
+    gcc-14  \
+    g++-14 \
     git \
     git-lfs \
     libssl-dev \
+    libucx-dev \
     ninja-build \
     pkg-config \
+    pipx \
     python3 \
     python3-dev \
     python3-setuptools \
     tar \
-    wget
+    wget && \
+    pipx ensurepath
 
 ENV TGI_INSTALL_PREFIX=/usr/local/tgi
 ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
@@ -83,13 +86,15 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$
     cd backends/trtllm && \
     CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release
 
-FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime
-RUN apt update && apt install -y python3-minimal python3-dev python3-pip && \
+FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime
+RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
     rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
-    python3 -m pip install transformers tokenizers
+    pipx ensurepath && \
+    pipx install --include-deps transformers tokenizers
 
 WORKDIR /usr/local/tgi/bin
 
+ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH
 ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
 ENV TOKENIZERS_PARALLELISM=false
 ENV OMPI_MCA_plm_rsh_agent=""

diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
@@ -13,10 +13,11 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
 endif ()
 
 project(tgi-trtllm-backend VERSION 1.0.0)
-set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD 23)
 
 include(FetchContent)
 include(ExternalProject)
+include(CheckCXXCompilerFlag)
 
 option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
 option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
@@ -29,27 +30,42 @@ set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE ST
 find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
 
 #### External dependencies ####
-include(cmake/fmt.cmake)
 include(cmake/json.cmake)
 include(cmake/spdlog.cmake)
 include(cmake/trtllm.cmake)
 
+if(CMAKE_BUILD_TYPE STREQUAL "Debug") # variable name (not ${...}) so an unset/empty build type compares false instead of raising a configure error
+    add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
+endif()
+
+# Detect whether the compiler supports -Wnrvo (warn when named return value optimization cannot be applied) and, if so, append it to the existing C++ flags
+check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
+if(COMPILER_SUPPORT_WARNING_ON_NVRO)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wnrvo")
+endif()
+
 # Let's build TRTLLM as part of CMake
 add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
 
 # Tell CMake not to try to override the RPATH for executorWorker, as it has no information on how to do so
 set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)
 
 # TGI TRTLLM Backend definition
-add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
+add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp csrc/backend.cpp)
 include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
-        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-        $<INSTALL_INTERFACE:include>
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc>
+#        $<INSTALL_INTERFACE:csrc>
 )
 target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
-target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper CUDA::cudart CUDA::nvml)
-target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt)
+target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml)
+target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog)
+
+if(CMAKE_BUILD_TYPE STREQUAL "Debug") # NOTE(review): the NVRTC wrapper is only linked outside Debug builds - presumably it is not produced in Debug; confirm
+    target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
+else()
+    target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)
+endif ()
 
 # This installs all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make them easy to link against / find later
 install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
@@ -60,16 +76,30 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
     message(STATUS "Building tests")
     FetchContent_Declare(
             Catch2
-            GIT_REPOSITORY https://github.com/catchorg/Catch2
-            GIT_TAG v3.6.0
+            URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz
     )
     FetchContent_MakeAvailable(Catch2)
 
-    #    add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
-    #    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt CUDA::cudart CUDA::nvml)
+    add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp)
+    target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
+    target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/")
+    target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml)
+    target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl)
+
+    if(CMAKE_BUILD_TYPE STREQUAL "Debug") # NOTE(review): the NVRTC wrapper is only linked outside Debug builds - presumably it is not produced in Debug; confirm
+        target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
+    else()
+        target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)
+    endif ()
+
+    if(CMAKE_BUILD_TYPE MATCHES "Debug")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
+        target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address)
+    endif()
 
     list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
     include(CTest)
     include(Catch)
-    #    catch_discover_tests(tgi_trtllm_backend_tests)
+    catch_discover_tests(tgi_trtllm_backend_tests)
 endif ()
diff --git a/backends/trtllm/Cargo.toml b/backends/trtllm/Cargo.toml
@@ -7,20 +7,21 @@ homepage.workspace = true
 
 [dependencies]
 async-trait = "0.1"
-async-stream = "0.3"
+#async-stream = "0.3"
 clap = { version = "4.5", features = ["derive"] }
 cxx = "1.0"
 hashbrown = "0.14"
 hf-hub = { workspace = true }
-log = { version = "0.4", features = [] }
+#log = { version = "0.4", features = [] }
 text-generation-router = { path = "../../router" }
 tokenizers = { workspace = true }
 tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tokio-stream = "0.1.15"
 thiserror = "1.0.63"
 tracing = "0.1"
-tracing-opentelemetry = "0.25"
-tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
+#tracing-opentelemetry = "0.25"
+#tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
+pyo3 = { workspace = true }
 
 [build-dependencies]
 cmake = "0.1"

diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
@@ -4,7 +4,7 @@ use std::env;
 use std::env::consts::ARCH;
 use std::path::{absolute, PathBuf};
 
-const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
+const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
 const CUDA_REQUIRED_VERSION: &str = "12.6";
 const MPI_REQUIRED_VERSION: &str = "4.1";
@@ -43,8 +43,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
         install_path = absolute(out_dir).expect("cannot happen").join(install_path);
     }
 
-    let _ = cmake::Config::new(".")
-        .uses_cxx11()
+    let mut config = cmake::Config::new(".");
+    config.uses_cxx11()
         .generator("Ninja")
         .profile(match is_debug {
             true => "Debug",
@@ -53,9 +53,16 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
         .env("OPT_LEVEL", opt_level)
         .define("CMAKE_INSTALL_PREFIX", &install_path)
         .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
+        .define("Python3_ROOT_DIR", "../venv")
         .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
-        .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path)
-        .build();
+        .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path);
+
+        // Allow to override which Python to use ...
+        if let Some(python3) = option_env!("Python3_EXECUTABLE") {
+            config.define("Python3_EXECUTABLE", python3);
+        }
+
+        config.build();
 
     // Additional transitive CMake dependencies
     let deps_folder = out_dir.join("build").join("_deps");
@@ -90,26 +97,25 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     CFG.include_prefix = "backends/trtllm";
     cxx_build::bridge("src/lib.rs")
         .static_flag(true)
-        .include(deps_folder.join("fmt-src").join("include"))
+        .std("c++23")
         .include(deps_folder.join("spdlog-src").join("include"))
         .include(deps_folder.join("json-src").join("include"))
         .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
         .include("/usr/local/cuda/include")
         .include("/usr/local/tensorrt/include")
-        .file("src/ffi.cpp")
-        .std("c++20")
-        .define("NDEBUG", ndebug)
+        .include("csrc/")
+        .file("csrc/ffi.hpp")
+        .define("TGI_TRTLLM_BACKEND_DEBUG", ndebug)
         .compile("tgi_trtllm_backend");
 
     println!("cargo:rerun-if-changed=CMakeLists.txt");
     println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
     println!("cargo:rerun-if-changed=cmake/json.cmake");
-    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
     println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
-    println!("cargo:rerun-if-changed=include/backend.h");
-    println!("cargo:rerun-if-changed=lib/backend.cpp");
-    println!("cargo:rerun-if-changed=include/ffi.h");
-    println!("cargo:rerun-if-changed=src/ffi.cpp");
+    println!("cargo:rerun-if-changed=csrc/backend.hpp");
+    println!("cargo:rerun-if-changed=csrc/backend.cpp");
+    println!("cargo:rerun-if-changed=csrc/hardware.hpp");
+    println!("cargo:rerun-if-changed=csrc/ffi.hpp");
 }
 
 fn main() {

diff --git a/backends/trtllm/cmake/fmt.cmake b/backends/trtllm/cmake/fmt.cmake