Skip to content

Commit

Permalink
[TENSORRT-LLM] - Implement new looper thread based backend (#2357)
Browse files Browse the repository at this point in the history
* (backend) use parking_lot crate for RwLock fairness

# Conflicts:
#	backends/trtllm/src/backend.rs

* (launcher) default new server::run parameters to false for now

* (chore) fmt ... why?

* (ffi) use const for GetSamplingConfig

* (server) expose new SchedulingError

* (trt)

* (build) setup ccache if available

* (ffi) add max_new_tokens parameters

* (backend) cleanup a bit

* (backend) expose PullNewTokens

* (ffi) cleanup again

* (ffi) add missing headers imports

* (ffi) add template specialization to catch and convert to Rust Result<T, tensorrt_llm::common::TllmException>

* (looper) new looper initial implementation

* (ffi) remove narrowing type warning

* (ffi) encode the provided user prompt within each request thread

* (misc) change scope identifiers

* (backend) implement the post_processor background thread

* (misc) missing Result types for Rust

* use blocking_recv in looper to consume awaiting_requests at max before pulling in a single step

* (server) forward auth_token to server::run

* (build) fetchcontent use archives instead of git

* (ffi) fix usage of wrong vector constructor making a capacity fill call

* (ffi) missing namespace for tle::Response

* (ffi) do not use reference capture in lambda as we are not capturing anything

* (backend) refactor & cleanup

* (Dockerfile.trtllm) delete for now

* (misc) simplify [make_]move_iterator by using c++20 type inference

* (misc) no need to move for uint32_t items

* (scheduler) rework submit/pull logic

* (post) impl postprocessing

* (misc) delete backend.rs

* (misc) rerun-if-changed all the cmake modules

* (misc) move to latest trtllm

* (fix): HOPPER_SM_MAJOR is 9 not 8

* (misc: build for sm_{75,80,86,89,90} by default

* (misc): build with trtllm 0.13.0

* (misc): increase verbosity of spdlog

* (fix): do not recreate the stateful hashmap at every it

* (misc): update dependency in trtllm dockerfile

* (misc): update dependency in trtllm dockerfile

* (misc): disable logging in release mode

* (misc): improve trtllm download script robustness

* (fix): ore fixes for Dockerfile

* misc(cuda): require 12.6

* chore(cmake): use correct policy for download_timestamp

* feat(looper): check engine and executorWorker paths exist before creating the backend

* chore(cmake): download timestamp should be before URL

* feat(looper): minor optimizations to avoid growing too much the containers

* chore(trtllm): move dockerfile to right place

* chore(trtllm): disable tokenizer parallelism by default

* chore(trtllm): fmt

* chore(trtllm): post-rebase commit

* chore(trtllm): remove unused method

* feat(trtllm): cache maxNumTokens to avoid calling JSON everytime

* misc(router): remove SchedulingError

* feat(trtllm): do not tokenize twice

* Revert "chore(trtllm): remove unused method"

This reverts commit 3174716

* chore(rebase): fix invalid references

* chore(router): add python dependency

* Lint.

* Fix bad rebase

---------

Co-authored-by: Nicolas Patry <[email protected]>
  • Loading branch information
mfuntowicz and Narsil authored Oct 25, 2024
1 parent ed87b46 commit 43df056
Show file tree
Hide file tree
Showing 22 changed files with 794 additions and 598 deletions.
68 changes: 17 additions & 51 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 0 additions & 23 deletions Dockerfile.trtllm

This file was deleted.

10 changes: 8 additions & 2 deletions backends/trtllm/Dockerfile → Dockerfile_trtllm
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ COPY . .
RUN cargo chef prepare --recipe-path recipe.json

# CUDA dependent dependencies resolver stage
FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 AS cuda-builder
FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
Expand All @@ -26,6 +26,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
ninja-build \
pkg-config \
python3 \
python3-dev \
python3-setuptools \
tar \
wget
Expand Down Expand Up @@ -82,10 +83,15 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$
cd backends/trtllm && \
CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release

FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime
FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime
RUN apt update && apt install -y python3 && \
rm -rf /var/lib/{apt,dpkg,cache,log}/

WORKDIR /usr/local/tgi/bin

ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
ENV TOKENIZERS_PARALLELISM=false
ENV OMPI_MCA_plm_rsh_agent=""

COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
Expand Down
14 changes: 13 additions & 1 deletion backends/trtllm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
cmake_minimum_required(VERSION 3.20)

if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
find_program(CCACHE_EXECUTABLE "ccache")
if (CCACHE_EXECUTABLE)
message(STATUS "Using ccache")
set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE)
endif ()
endif ()

if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif ()

project(tgi-trtllm-backend VERSION 1.0.0)
set(CMAKE_CXX_STANDARD 20)

Expand All @@ -14,7 +26,7 @@ set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include"
set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")

# We are using nvidia-ml to query at runtime device information to enable some architecture-specific features
find_package(CUDAToolkit 12.5 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)

#### External dependencies ####
include(cmake/fmt.cmake)
Expand Down
11 changes: 6 additions & 5 deletions backends/trtllm/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,17 @@ async-trait = "0.1"
async-stream = "0.3"
clap = { version = "4.5", features = ["derive"] }
cxx = "1.0"
hashbrown = "0.14"
hf-hub = { workspace = true }
log = { version = "0.4", features = [] }
text-generation-router = { path = "../../router" }
tokenizers = { version = "0.19", features = ["hf-hub"] }
tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
tokenizers = { workspace = true }
tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
tokio-stream = "0.1.15"
thiserror = "1.0.62"
thiserror = "1.0.63"
tracing = "0.1"
tracing-opentelemetry = "0.24"
tracing-opentelemetry = "0.25"
tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
parking_lot = "0.12"

[build-dependencies]
cmake = "0.1"
Expand Down
18 changes: 14 additions & 4 deletions backends/trtllm/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::path::{absolute, PathBuf};

const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
const CUDA_REQUIRED_VERSION: &str = "12.5";
const CUDA_REQUIRED_VERSION: &str = "12.6";
const MPI_REQUIRED_VERSION: &str = "4.1";
const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
Expand Down Expand Up @@ -36,7 +36,7 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
// Build the backend implementation through CMake
let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi");
let tensorrt_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt");
let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("90-real"); // Hopper by default
let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("75-real;80-real;86-real;89-real;90-real");

let mut install_path = PathBuf::from(install_path);
if !install_path.is_absolute() {
Expand Down Expand Up @@ -81,7 +81,12 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
(PathBuf::from(install_path), deps_folder)
}

fn build_ffi_layer(deps_folder: &PathBuf) {
fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
let ndebug = match is_debug {
true => "1",
false => "0",
};

CFG.include_prefix = "backends/trtllm";
cxx_build::bridge("src/lib.rs")
.static_flag(true)
Expand All @@ -93,9 +98,14 @@ fn build_ffi_layer(deps_folder: &PathBuf) {
.include("/usr/local/tensorrt/include")
.file("src/ffi.cpp")
.std("c++20")
.define("NDEBUG", ndebug)
.compile("tgi_trtllm_backend");

println!("cargo:rerun-if-changed=CMakeLists.txt");
println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
println!("cargo:rerun-if-changed=cmake/json.cmake");
println!("cargo:rerun-if-changed=cmake/fmt.cmake");
println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
println!("cargo:rerun-if-changed=include/backend.h");
println!("cargo:rerun-if-changed=lib/backend.cpp");
println!("cargo:rerun-if-changed=include/ffi.h");
Expand All @@ -115,7 +125,7 @@ fn main() {
let (_backend_path, deps_folder) = build_backend(is_debug, opt_level, &out_dir);

// Build the FFI layer calling the backend above
build_ffi_layer(&deps_folder);
build_ffi_layer(&deps_folder, is_debug);

// Emit linkage search path
probe!("ompi", MPI_REQUIRED_VERSION);
Expand Down
4 changes: 2 additions & 2 deletions backends/trtllm/cmake/fmt.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FetchContent_Declare(
fmt
GIT_REPOSITORY https://github.com/fmtlib/fmt
GIT_TAG 11.0.1
DOWNLOAD_EXTRACT_TIMESTAMP
URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz
)
FetchContent_MakeAvailable(fmt)
1 change: 1 addition & 0 deletions backends/trtllm/cmake/json.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
fetchcontent_declare(
json
DOWNLOAD_EXTRACT_TIMESTAMP
URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
)
fetchcontent_makeavailable(json)
4 changes: 2 additions & 2 deletions backends/trtllm/cmake/spdlog.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ endif ()

fetchcontent_declare(
spdlog
GIT_REPOSITORY https://github.com/gabime/spdlog.git
GIT_TAG v1.14.1
DOWNLOAD_EXTRACT_TIMESTAMP
URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
)
fetchcontent_makeavailable(spdlog)
3 changes: 2 additions & 1 deletion backends/trtllm/cmake/trtllm.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ endif ()
fetchcontent_declare(
trtllm
GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git
GIT_TAG a681853d3803ee5893307e812530b5e7004bb6e1
GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc
GIT_SHALLOW FALSE
DOWNLOAD_EXTRACT_TIMESTAMP
)
fetchcontent_makeavailable(trtllm)

Expand Down
Loading

0 comments on commit 43df056

Please sign in to comment.