[TENSORRT-LLM] - Implement new looper thread based backend (#2357)

* (backend) use parking_lot crate for RwLock fairness # Conflicts: # backends/trtllm/src/backend.rs * (launcher) default new server::run parameters to false for now * (chore) fmt ... why? * (ffi) use const for GetSamplingConfig * (server) expose new SchedulingError * (trt) * (build) setup ccache if available * (ffi) add max_new_tokens parameters * (backend) cleanup a bit * (backend) expose PullNewTokens * (ffi) cleanup again * (ffi) add missing headers imports * (ffi) add template specialization to catch and convert to Rust Result<T, tensorrt_llm::common::TllmException> * (looper) new looper initial implementation * (ffi) remove narrowing type warning * (ffi) encode the provided user prompt within each request thread * (misc) change scope identifiers * (backend) implement the post_processor background thread * (misc) missing Result types for Rust * use blocking_recv in looper to consume awaiting_requests at max before pulling in a single step * (server) forward auth_token to server::run * (build) fetchcontent use archives instead of git * (ffi) fix usage of wrong vector constructor making a capacity fill call * (ffi) missing namespace for tle::Response * (ffi) do not use reference capture in lambda as we are not capturing anything * (backend) refactor & cleanup * (Dockerfile.trtllm) delete for now * (misc) simplify [make_]move_iterator by using c++20 type inference * (misc) no need to move for uint32_t items * (scheduler) rework submit/pull logic * (post) impl postprocessing * (misc) delete backend.rs * (misc) rerun-if-changed all the cmake modules * (misc) move to latest trtllm * (fix): HOPPER_SM_MAJOR is 9 not 8 * (misc: build for sm_{75,80,86,89,90} by default * (misc): build with trtllm 0.13.0 * (misc): increase verbosity of spdlog * (fix): do not recreate the stateful hashmap at every it * (misc): update dependency in trtllm dockerfile * (misc): update dependency in trtllm dockerfile * (misc): disable logging in release mode * (misc): improve trtllm download script robustness * (fix): ore fixes for Dockerfile * misc(cuda): require 12.6 * chore(cmake): use correct policy for download_timestamp * feat(looper): check engine and executorWorker paths exist before creating the backend * chore(cmake): download timestamp should be before URL * feat(looper): minor optimizations to avoid growing too much the containers * chore(trtllm): move dockerfile to right place * chore(trtllm): disable tokenizer parallelism by default * chore(trtllm): fmt * chore(trtllm): post-rebase commit * chore(trtllm): remove unused method * feat(trtllm): cache maxNumTokens to avoid calling JSON everytime * misc(router): remove SchedulingError * feat(trtllm): do not tokenize twice * Revert "chore(trtllm): remove unused method" This reverts commit 3174716 * chore(rebase): fix invalid references * chore(router): add python dependency * Lint. * Fix bad rebase --------- Co-authored-by: Nicolas Patry <[email protected]>
huggingface · Oct 25, 2024 · 43df056 · 43df056
1 parent ed87b46
commit 43df056
Show file tree

Hide file tree

Showing 22 changed files with 794 additions and 598 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Dockerfile.trtllm b/Dockerfile.trtllm
diff --git a/backends/trtllm/Dockerfile → Dockerfile_trtllm b/backends/trtllm/Dockerfile → Dockerfile_trtllm
@@ -10,7 +10,7 @@ COPY . .
 RUN cargo chef prepare --recipe-path recipe.json
 
 # CUDA dependent dependencies resolver stage
-FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 AS cuda-builder
+FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder
 
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
@@ -26,6 +26,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     ninja-build \
     pkg-config \
     python3 \
+    python3-dev \
     python3-setuptools \
     tar \
     wget
@@ -82,10 +83,15 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$
     cd backends/trtllm && \
     CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release
 
-FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime
+FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime
+RUN apt update && apt install -y python3 && \
+    rm -rf /var/lib/{apt,dpkg,cache,log}/
+
 WORKDIR /usr/local/tgi/bin
 
 ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+ENV TOKENIZERS_PARALLELISM=false
+ENV OMPI_MCA_plm_rsh_agent=""
 
 COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
 COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt

diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
@@ -1,5 +1,17 @@
 cmake_minimum_required(VERSION 3.20)
 
+if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
+    find_program(CCACHE_EXECUTABLE "ccache")
+    if (CCACHE_EXECUTABLE)
+        message(STATUS "Using ccache")
+        set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE)
+    endif ()
+endif ()
+
+if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
+    cmake_policy(SET CMP0135 NEW)
+endif ()
+
 project(tgi-trtllm-backend VERSION 1.0.0)
 set(CMAKE_CXX_STANDARD 20)
 
@@ -14,7 +26,7 @@ set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include"
 set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")
 
 # We are using nvidia-ml to query at runtime device information to enable some architecture-specific features
-find_package(CUDAToolkit 12.5 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
+find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
 
 #### External dependencies ####
 include(cmake/fmt.cmake)

diff --git a/backends/trtllm/Cargo.toml b/backends/trtllm/Cargo.toml
@@ -10,16 +10,17 @@ async-trait = "0.1"
 async-stream = "0.3"
 clap = { version = "4.5", features = ["derive"] }
 cxx = "1.0"
+hashbrown = "0.14"
+hf-hub = { workspace = true }
 log = { version = "0.4", features = [] }
 text-generation-router = { path = "../../router" }
-tokenizers = { version = "0.19", features = ["hf-hub"] }
-tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
+tokenizers = { workspace = true }
+tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tokio-stream = "0.1.15"
-thiserror = "1.0.62"
+thiserror = "1.0.63"
 tracing = "0.1"
-tracing-opentelemetry = "0.24"
+tracing-opentelemetry = "0.25"
 tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
-parking_lot = "0.12"
 
 [build-dependencies]
 cmake = "0.1"

diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
@@ -6,7 +6,7 @@ use std::path::{absolute, PathBuf};
 
 const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
-const CUDA_REQUIRED_VERSION: &str = "12.5";
+const CUDA_REQUIRED_VERSION: &str = "12.6";
 const MPI_REQUIRED_VERSION: &str = "4.1";
 const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
 const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
@@ -36,7 +36,7 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
     // Build the backend implementation through CMake
     let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi");
     let tensorrt_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt");
-    let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("90-real"); // Hopper by default
+    let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("75-real;80-real;86-real;89-real;90-real");
 
     let mut install_path = PathBuf::from(install_path);
     if !install_path.is_absolute() {
@@ -81,7 +81,12 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
     (PathBuf::from(install_path), deps_folder)
 }
 
-fn build_ffi_layer(deps_folder: &PathBuf) {
+fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
+    let ndebug = match is_debug {
+        true => "1",
+        false => "0",
+    };
+
     CFG.include_prefix = "backends/trtllm";
     cxx_build::bridge("src/lib.rs")
         .static_flag(true)
@@ -93,9 +98,14 @@ fn build_ffi_layer(deps_folder: &PathBuf) {
         .include("/usr/local/tensorrt/include")
         .file("src/ffi.cpp")
         .std("c++20")
+        .define("NDEBUG", ndebug)
         .compile("tgi_trtllm_backend");
 
     println!("cargo:rerun-if-changed=CMakeLists.txt");
+    println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
+    println!("cargo:rerun-if-changed=cmake/json.cmake");
+    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
+    println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
     println!("cargo:rerun-if-changed=include/backend.h");
     println!("cargo:rerun-if-changed=lib/backend.cpp");
     println!("cargo:rerun-if-changed=include/ffi.h");
@@ -115,7 +125,7 @@ fn main() {
     let (_backend_path, deps_folder) = build_backend(is_debug, opt_level, &out_dir);
 
     // Build the FFI layer calling the backend above
-    build_ffi_layer(&deps_folder);
+    build_ffi_layer(&deps_folder, is_debug);
 
     // Emit linkage search path
     probe!("ompi", MPI_REQUIRED_VERSION);

diff --git a/backends/trtllm/cmake/fmt.cmake b/backends/trtllm/cmake/fmt.cmake
@@ -1,6 +1,6 @@
 FetchContent_Declare(
         fmt
-        GIT_REPOSITORY https://github.com/fmtlib/fmt
-        GIT_TAG 11.0.1
+        DOWNLOAD_EXTRACT_TIMESTAMP
+        URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz
 )
 FetchContent_MakeAvailable(fmt)
diff --git a/backends/trtllm/cmake/json.cmake b/backends/trtllm/cmake/json.cmake
@@ -1,5 +1,6 @@
 fetchcontent_declare(
         json
+        DOWNLOAD_EXTRACT_TIMESTAMP
         URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
 )
 fetchcontent_makeavailable(json)
diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake
@@ -11,7 +11,7 @@ endif ()
 
 fetchcontent_declare(
         spdlog
-        GIT_REPOSITORY https://github.com/gabime/spdlog.git
-        GIT_TAG v1.14.1
+        DOWNLOAD_EXTRACT_TIMESTAMP
+        URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
 )
 fetchcontent_makeavailable(spdlog)
diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake
@@ -23,8 +23,9 @@ endif ()
 fetchcontent_declare(
         trtllm
         GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git
-        GIT_TAG a681853d3803ee5893307e812530b5e7004bb6e1
+        GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc
         GIT_SHALLOW FALSE
+        DOWNLOAD_EXTRACT_TIMESTAMP
 )
 fetchcontent_makeavailable(trtllm)