[TRTLLM] Expose finish reason #2841

Open · wants to merge 2 commits into main

Changes from all commits
backends/trtllm/CMakeLists.txt (36 additions, 25 deletions)
@@ -1,11 +1,19 @@
cmake_minimum_required(VERSION 3.20)

-if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
+if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
find_program(CCACHE_EXECUTABLE "ccache")
if (CCACHE_EXECUTABLE)
message(STATUS "Using ccache")
set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE)
set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
endif ()
+else ()
+find_program(CCACHE_EXECUTABLE ${CMAKE_CXX_COMPILER_LAUNCHER})
+message(STATUS "Using user specified cmake cxx compiler launcher: ${CMAKE_CXX_COMPILER_LAUNCHER}")
+set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
+set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
+set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_EXECUTABLE})
endif ()

if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
@@ -21,28 +29,37 @@ include(CheckCXXCompilerFlag)

option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
+option(TGI_TRTLLM_BACKEND_BUILD_USE_LLD "Enable lld linker instead of ld" OFF)
set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located")
set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located")
set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")

# We use nvidia-ml to query device information at runtime, enabling some architecture-specific features
find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
find_package(MPI REQUIRED)

#### External dependencies ####
include(cmake/json.cmake)
include(cmake/spdlog.cmake)
include(cmake/trtllm.cmake)

if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(TGI_TRTLLM_BACKEND_DEBUG ON)
add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
endif()
endif ()

+if (${TGI_TRTLLM_BACKEND_BUILD_USE_LLD})
+message(STATUS "Using lld linker")
+add_link_options("-fuse-ld=lld")
+endif ()

# This attempts to detect whether the compiler can emit a warning when it cannot apply named return value optimization (NRVO) to a function
check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
-if(${COMPILER_SUPPORT_WARNING_ON_NVRO})
-set(CMAKE_CXX_FLAGS "{CMAKE_CXX_FLAGS} -Wnvro")
-endif()
+if (${COMPILER_SUPPORT_WARNING_ON_NVRO})
+message(STATUS "Enabling non-NVRO detection")
+target_compile_options(tgi_trtllm_backend_impl "-Werror -Wnvro")
+endif ()

# Let's build TRTLLM as part of CMake
add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
@@ -55,21 +72,20 @@ add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp cs
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
target_include_directories(tgi_trtllm_backend_impl PRIVATE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc>
# $<INSTALL_INTERFACE:csrc>
# $<INSTALL_INTERFACE:csrc>
)
target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog)

if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
else()
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm)
endif ()
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)

# This installs all the artifacts under CMAKE_INSTALL_PREFIX (include/, lib/, bin/) so they are easy to find and link against
install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
-install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
+install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB)
+if (NOT ${TGI_TRTLLM_BACKEND_DEBUG})
+install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
+endif ()


#### Unit Tests ####
if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
@@ -85,18 +101,13 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/")
target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl)
+target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)

if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
else()
target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm)
endif ()

if(CMAKE_BUILD_TYPE MATCHES "Debug")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
if (CMAKE_BUILD_TYPE MATCHES "Debug")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fsanitize=undefined")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fsanitize=undefined")
target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address)
-endif()
+endif ()

list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
include(CTest)
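A note on the NRVO check added earlier in this file (the flag is spelled both -Wnrvo and -Wnvro across the hunk; GCC 14's warning is -Wnrvo): it fires when a function returns a named local by value but the compiler cannot apply named return value optimization. A small illustrative sketch, not code from this PR:

#include <string>

// Two different named locals may be returned depending on the branch, so the
// result cannot be constructed directly in the caller's storage; with -Wnrvo,
// GCC 14+ warns that NRVO was not applied here.
std::string pick_greeting(bool formal) {
    std::string long_form = "Good evening";
    std::string short_form = "Hi";
    if (formal) {
        return long_form;
    }
    return short_form;
}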
backends/trtllm/build.rs (79 additions, 16 deletions)
@@ -15,9 +15,8 @@ const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR");
// Dependencies
const BACKEND_DEPS: [&str; 2] = ["tgi_trtllm_backend_impl", "tgi_trtllm_backend"];
const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"];
-const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
+const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 4] = [
("dylib", "tensorrt_llm"),
("static", "tensorrt_llm_executor_static"),
("dylib", "tensorrt_llm_nvrtc_wrapper"),
("dylib", "nvinfer_plugin_tensorrt_llm"),
("dylib", "decoder_attention"),
@@ -32,6 +31,58 @@ macro_rules! probe {
};
}

+fn get_compiler_flag(
+switch: bool,
+true_case: &'static str,
+false_case: &'static str,
+) -> &'static str {
+match switch {
+true => true_case,
+false => false_case,
+}
+}
+
+#[cfg(target_arch = "x86_64")]
+fn get_system_install_path(install_path: &PathBuf) -> PathBuf {
+install_path.join("lib64")
+}
+
+#[cfg(not(target_arch = "x86_64"))]
+fn get_system_install_path(install_path: &PathBuf) -> PathBuf {
+install_path.join("lib")
+}
+
+fn get_library_architecture() -> &'static str {
+let os = env::var("CARGO_CFG_TARGET_OS").unwrap();
+let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap();
+let env = env::var("CARGO_CFG_TARGET_ENV").unwrap();
+
+match os.as_str() {
+"linux" => {
+if env != "gnu" {
+panic!("unsupported linux ABI {env}, only 'gnu' is supported")
+}
+
+match arch.as_str() {
+"x86_64" => "x86_64-linux-gnu",
+"aarch64" => "aarch64-linux-gnu",
+_ => panic!("unsupported linux architecture {arch}"),
+}
+}
+"windows" => {
+if env != "msvc" {
+panic!("unsupported windows ABI {env}, only 'msvc' is supported")
+}
+
+match arch.as_str() {
+"x86_64" => "x86_64-windows-msvc",
+_ => panic!("unsupported windows architecture {arch}"),
+}
+}
+_ => panic!("unsupported OS {os}"),
+}
+}

fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf, PathBuf) {
// Build the backend implementation through CMake
let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi");
@@ -44,7 +95,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
}

let mut config = cmake::Config::new(".");
-config.uses_cxx11()
+config
+.uses_cxx11()
.generator("Ninja")
.profile(match is_debug {
true => "Debug",
@@ -53,16 +105,28 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
.env("OPT_LEVEL", opt_level)
.define("CMAKE_INSTALL_PREFIX", &install_path)
.define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
.define("Python3_ROOT_DIR", "../venv")
.define("CMAKE_LIBRARY_ARCHITECTURE", get_library_architecture())
.define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
+.define(
+"TGI_TRTLLM_BACKEND_DEBUG",
+get_compiler_flag(is_debug, "ON", "OFF"),
+)
.define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path);

+// Allow to override which Python to use ...
+if let Some(python3) = option_env!("Python3_EXECUTABLE") {
+config.define("Python3_EXECUTABLE", python3);
+}
+if let Some(nvcc_host_compiler) = option_env!("CMAKE_CUDA_HOST_COMPILER") {
+config.define("CMAKE_CUDA_HOST_COMPILER", nvcc_host_compiler);
+}
+
+if let Some(cxx_compiler_launcher) = option_env!("CMAKE_CXX_COMPILER_LAUNCHER") {
+config.define("CMAKE_CXX_COMPILER_LAUNCHER", cxx_compiler_launcher);
+}

+config.build();
-// Allow to override which Python to use ...
-if let Some(python3) = option_env!("Python3_EXECUTABLE") {
-config.define("Python3_EXECUTABLE", python3);
-}
-
-config.build();

// Additional transitive CMake dependencies
let deps_folder = out_dir.join("build").join("_deps");
@@ -77,7 +141,8 @@
}

// Emit linkage information from the artifacts we just built
let install_lib_path = install_path.join("lib");

let install_lib_path = get_system_install_path(&install_path);

println!(
r"cargo:warning=Adding link search path: {}",
@@ -89,11 +154,6 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
}

fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
-let ndebug = match is_debug {
-true => "1",
-false => "0",
-};

CFG.include_prefix = "backends/trtllm";
cxx_build::bridge("src/lib.rs")
.static_flag(true)
@@ -105,7 +165,10 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
.include("/usr/local/tensorrt/include")
.include("csrc/")
.file("csrc/ffi.hpp")
.define("TGI_TRTLLM_BACKEND_DEBUG", ndebug)
.define(
"TGI_TRTLLM_BACKEND_DEBUG",
get_compiler_flag(is_debug, "ON", "OFF"),
)
.compile("tgi_trtllm_backend");

println!("cargo:rerun-if-changed=CMakeLists.txt");
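One detail of the build.rs change worth spelling out: the FFI layer's TGI_TRTLLM_BACKEND_DEBUG define now carries the string value ON or OFF (from get_compiler_flag) rather than the previous numeric 1/0, so a plain #if test of the macro no longer applies. A hedged C++ sketch of one way such a value-style define can be inspected — an assumed illustration, not the project's actual csrc/ffi.hpp:

#include <string_view>

// Fall back to OFF when the build system did not define the flag at all.
#ifndef TGI_TRTLLM_BACKEND_DEBUG
#define TGI_TRTLLM_BACKEND_DEBUG OFF
#endif

// Stringify the macro's expansion ("ON" or "OFF") and compare at compile time.
#define TGI_STRINGIFY_(x) #x
#define TGI_STRINGIFY(x) TGI_STRINGIFY_(x)

constexpr bool kDebugBuild =
    std::string_view(TGI_STRINGIFY(TGI_TRTLLM_BACKEND_DEBUG)) == "ON";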
backends/trtllm/cmake/spdlog.cmake (4 additions, 4 deletions)
@@ -4,14 +4,14 @@ set(SPDLOG_FMT_EXTERNAL OFF)

# Define the compile-time level at which SPDLOG_* logging macros are enabled
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
-add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
+add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE)
else ()
-add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO)
+add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
Review comment (Member), on the SPDLOG_LEVEL_DEBUG line above: Shouldn't we stick to INFO rather than DEBUG? Or move the SPDLOG_LEVEL_DEBUG() to SPDLOG_LEVEL_INFO?

endif ()

fetchcontent_declare(
spdlog
-# DOWNLOAD_EXTRACT_TIMESTAMP
-URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
+# DOWNLOAD_EXTRACT_TIMESTAMP
+URL https://github.com/gabime/spdlog/archive/refs/tags/v1.15.0.tar.gz
)
fetchcontent_makeavailable(spdlog)
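For context on the review question above: SPDLOG_ACTIVE_LEVEL is a compile-time threshold, and SPDLOG_TRACE/SPDLOG_DEBUG/... call sites below it compile to nothing, while spdlog::set_level() still filters what is printed at runtime. A small illustrative sketch, not code from this PR:

// SPDLOG_ACTIVE_LEVEL must be defined before the first spdlog include.
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#include <spdlog/spdlog.h>

int main() {
    spdlog::set_level(spdlog::level::info); // runtime threshold

    SPDLOG_TRACE("below the compile-time level: compiled out entirely");
    SPDLOG_DEBUG("compiled in, but filtered out at runtime by set_level(info)");
    SPDLOG_INFO("compiled in and printed");
    return 0;
}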
backends/trtllm/cmake/trtllm.cmake (4 additions, 2 deletions)
@@ -14,11 +14,13 @@ message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
set(ENABLE_UCX OFF)
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
set(FAST_BUILD ON)
-set(NVTX_DISABLE OFF)
+set(NVTX_DISABLE ON)
+set(INDEX_RANGE_CHECK ON)
else ()
set(FAST_BUILD OFF)
set(FAST_MATH ON)
-set(NVTX_DISABLE ON)
+set(NVTX_DISABLE OFF)
+set(INDEX_RANGE_CHECK OFF)
endif ()

find_package(Python3 REQUIRED Interpreter)
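The NVTX_DISABLE flip above disables NVIDIA Tools Extension instrumentation in Debug builds and keeps it in Release builds, reversing the previous settings. NVTX ranges are what show up as named spans in profilers such as Nsight Systems; a minimal illustrative sketch, not TRT-LLM code:

#include <nvtx3/nvToolsExt.h>

// Mark a region of work so it appears as a named range in the profiler timeline.
void decode_step() {
    nvtxRangePushA("decode_step");
    // ... kernel launches / token-generation work ...
    nvtxRangePop();
}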
backends/trtllm/csrc/backend.cpp (10 additions, 9 deletions)
@@ -1,7 +1,6 @@
#include <ranges>

#include <nlohmann/json.hpp>
-#include <spdlog/spdlog.h>

#include "backend.hpp"
#include "hardware.hpp"
@@ -17,7 +16,8 @@ namespace huggingface::tgi::backends::trtllm {
if (world_size > 1) {
SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
mode = tle::CommunicationMode::kORCHESTRATOR;
-orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr, true);
+orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr,
+true);
} else {
SPDLOG_INFO("Detected single engine deployment, using leader mode");
}
@@ -44,21 +44,22 @@
}

backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
-: workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}
+: workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}

size_t backend_t::num_tokens_ready() const noexcept {
return executor_.getNumResponsesReady();
}

std::expected<request_id_t, backend_error_t>
-backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
-SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
-return executor_.enqueueRequest(tle::Request {
+backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t g_params,
+const sampling_params_t s_params) noexcept {
+SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params);
+return executor_.enqueueRequest(tle::Request{
{token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
-static_cast<tle::SizeType32>(generation_params.max_new_tokens),
+static_cast<tle::SizeType32>(g_params.max_new_tokens),
true,
-(tle::SamplingConfig) sampling_params,
-tle::OutputConfig { /* returnLogProbs= */ true },
+(tle::SamplingConfig) s_params,
+tle::OutputConfig{ /* returnLogProbs= */ true},
std::nullopt,
std::nullopt,
std::nullopt,
Expand Down
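Finally, on the PR title itself: the hunks shown above do not include the code that surfaces the finish reason, but TensorRT-LLM's executor reports one per completed sequence (tensorrt_llm::executor::FinishReason). A hedged sketch of the kind of mapping the title implies — the TGI-side names here are illustrative, not taken from this diff:

#include <stdexcept>
#include <tensorrt_llm/executor/types.h>

namespace tle = tensorrt_llm::executor;

// Illustrative TGI-side enum; the PR's actual naming may differ.
enum class finish_reason_t { kEND_TOKEN, kSTOP_WORDS, kMAX_LENGTH };

finish_reason_t as_finish_reason(const tle::FinishReason reason) {
    switch (reason) {
        case tle::FinishReason::kEND_ID:     // hit the end-of-sequence token
            return finish_reason_t::kEND_TOKEN;
        case tle::FinishReason::kSTOP_WORDS: // matched a stop sequence
            return finish_reason_t::kSTOP_WORDS;
        case tle::FinishReason::kLENGTH:     // reached the requested max length
            return finish_reason_t::kMAX_LENGTH;
        default:
            throw std::runtime_error("cannot map executor finish reason");
    }
}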