From 25b20cba2a21418beeebb8c87a651ee2473ff63e Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 31 Jul 2024 12:30:53 +0000 Subject: [PATCH 01/62] (backend) use parking_lot crate for RwLock fairness # Conflicts: # backends/trtllm/src/backend.rs --- backends/trtllm/src/backend.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/backends/trtllm/src/backend.rs b/backends/trtllm/src/backend.rs index b23aa6c01fe..6bf1847255a 100644 --- a/backends/trtllm/src/backend.rs +++ b/backends/trtllm/src/backend.rs @@ -2,8 +2,8 @@ use std::future::Future; use std::path::Path; use std::pin::{pin, Pin}; use std::str::FromStr; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, OnceLock}; +use std::sync::atomic::{AtomicBool, Ordering}; use std::task::{Context, Poll}; use std::time::Duration; @@ -12,17 +12,17 @@ use cxx::UniquePtr; use log::{error, warn}; use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; -use tokio::time::{sleep, Instant}; -use tokio_stream::wrappers::UnboundedReceiverStream; +use tokio::time::{Instant, sleep}; use tokio_stream::{Stream, StreamExt}; -use tracing::{instrument, span, Level}; +use tokio_stream::wrappers::UnboundedReceiverStream; +use tracing::{instrument, Level, span}; // use tokio::sync::RwLock; use parking_lot::RwLock; +use text_generation_router::{FinishReason, Token}; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; +use text_generation_router::validation::{Chunk, ValidationError, ValidGenerateRequest}; use text_generation_router::validation::ValidationError::UnsupportedModality; -use text_generation_router::validation::{Chunk, ValidGenerateRequest, ValidationError}; -use text_generation_router::{FinishReason, Token}; use crate::errors::TensorRtLlmBackendError; use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; From a3f7d76f7bb7e3b51c18ed79a4b3ee8677434e01 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 31 Jul 2024 09:06:52 +0000 Subject: [PATCH 02/62] (launcher) default new server::run parameters to false for now --- backends/trtllm/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index e0ba46c7955..9faa66a4f74 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -159,7 +159,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { true, max_client_batch_size, false, - false, + false ) .await?; Ok(()) From cea64e234fd0fab2c6d52a7c160807b674759ece Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 31 Jul 2024 20:38:30 +0000 Subject: [PATCH 03/62] (chore) fmt ... why? 
--- backends/trtllm/src/backend.rs | 12 ++++++------ backends/trtllm/src/main.rs | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/backends/trtllm/src/backend.rs b/backends/trtllm/src/backend.rs index 6bf1847255a..b23aa6c01fe 100644 --- a/backends/trtllm/src/backend.rs +++ b/backends/trtllm/src/backend.rs @@ -2,8 +2,8 @@ use std::future::Future; use std::path::Path; use std::pin::{pin, Pin}; use std::str::FromStr; -use std::sync::{Arc, OnceLock}; use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, OnceLock}; use std::task::{Context, Poll}; use std::time::Duration; @@ -12,17 +12,17 @@ use cxx::UniquePtr; use log::{error, warn}; use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; -use tokio::time::{Instant, sleep}; -use tokio_stream::{Stream, StreamExt}; +use tokio::time::{sleep, Instant}; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{instrument, Level, span}; +use tokio_stream::{Stream, StreamExt}; +use tracing::{instrument, span, Level}; // use tokio::sync::RwLock; use parking_lot::RwLock; -use text_generation_router::{FinishReason, Token}; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; -use text_generation_router::validation::{Chunk, ValidationError, ValidGenerateRequest}; use text_generation_router::validation::ValidationError::UnsupportedModality; +use text_generation_router::validation::{Chunk, ValidGenerateRequest, ValidationError}; +use text_generation_router::{FinishReason, Token}; use crate::errors::TensorRtLlmBackendError; use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index 9faa66a4f74..e0ba46c7955 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -159,7 +159,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { true, max_client_batch_size, false, - false + false, ) .await?; Ok(()) From 0cd7538a488cee59a1db0c60adc37d67862201a1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 1 Aug 2024 07:49:37 +0000 Subject: [PATCH 04/62] (ffi) use const for GetSamplingConfig --- backends/trtllm/include/backend.h | 29 +++++++++++++++-------------- backends/trtllm/lib/backend.cpp | 12 ++++++------ 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h index 7990e76b90d..3f89677c4d6 100644 --- a/backends/trtllm/include/backend.h +++ b/backends/trtllm/include/backend.h @@ -48,12 +48,12 @@ namespace huggingface::tgi::backends { * @return */ tle::SamplingConfig GetSamplingConfig( - uint32_t topK, - float_t topP, - float_t temperature, - float_t repetition_penalty, - float_t frequency_penalty, - uint64_t seed + const uint32_t topK, + const float_t topP, + const float_t temperature, + const float_t repetition_penalty, + const float_t frequency_penalty, + const uint64_t seed ); /** @@ -94,13 +94,14 @@ namespace huggingface::tgi::backends { * @return Request id related to this generation for reference */ [[nodiscard]] RequestId Submit( - const std::vector &tokens, - int32_t topK, - float_t topP, - float_t temperature, - float_t repetition_penalty, - float_t frequency_penalty, - uint64_t seed + const std::vector &tokens, + const uint32_t maxNewTokens, + const int32_t topK, + const float_t topP, + const float_t temperature, + const float_t repetition_penalty, + const float_t frequency_penalty, + const uint64_t seed ); /** @@ -108,7 
+109,7 @@ namespace huggingface::tgi::backends { * @param requestId The request id to poll the generation results * @return */ - std::vector Poll(RequestId requestId); + std::vector Poll(RequestId requestId); /** * Stop the underlying executor diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index c066a6d6eab..788b7674a99 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -55,12 +55,12 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co } tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig( - uint32_t topK, - float_t topP, - float_t temperature, - float_t repetition_penalty, - float_t frequency_penalty, - uint64_t seed) { + const uint32_t topK, + const float_t topP, + const float_t temperature, + const float_t repetition_penalty, + const float_t frequency_penalty, + const uint64_t seed) { return tle::SamplingConfig( 1, // TGI only use a single beam topK, From 169e1f452f338b258cf5a24032082b673848579c Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 1 Aug 2024 11:59:14 +0000 Subject: [PATCH 05/62] (server) expose new SchedulingError --- router/src/infer/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/router/src/infer/mod.rs b/router/src/infer/mod.rs index 896f4f4318f..39b6f8cff55 100644 --- a/router/src/infer/mod.rs +++ b/router/src/infer/mod.rs @@ -357,6 +357,8 @@ pub enum InferError { ToolError(String), #[error("Stream event serialization error")] StreamSerializationError(String), + #[error("Scheduling error: {0}")] + SchedulingError(String), } impl InferError { @@ -371,6 +373,7 @@ impl InferError { InferError::MissingTemplateVariable(_) => "missing_template_variable", InferError::ToolError(_) => "tool_error", InferError::StreamSerializationError(_) => "stream_serialization_error", + InferError::SchedulingError(_) => "schedling" } } } From 2a339f99dda73efe36f7670f5c455f797ebfe2c7 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 2 Aug 2024 22:09:12 +0000 Subject: [PATCH 06/62] (trt) --- backends/trtllm/Cargo.toml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backends/trtllm/Cargo.toml b/backends/trtllm/Cargo.toml index 43a114ba4b0..97ef1a76891 100644 --- a/backends/trtllm/Cargo.toml +++ b/backends/trtllm/Cargo.toml @@ -10,16 +10,17 @@ async-trait = "0.1" async-stream = "0.3" clap = { version = "4.5", features = ["derive"] } cxx = "1.0" +hashbrown = "0.14" +hf-hub = { workspace = true } log = { version = "0.4", features = [] } text-generation-router = { path = "../../router" } -tokenizers = { version = "0.19", features = ["hf-hub"] } -tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } +tokenizers = { workspace = true } +tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } tokio-stream = "0.1.15" -thiserror = "1.0.62" +thiserror = "1.0.63" tracing = "0.1" -tracing-opentelemetry = "0.24" +tracing-opentelemetry = "0.25" tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] } -parking_lot = "0.12" [build-dependencies] cmake = "0.1" From f6f689f50923c2f55e874eb40417fb6a34c2728f Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 2 Aug 2024 22:10:01 +0000 Subject: [PATCH 07/62] (build) setup ccache if available --- backends/trtllm/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 425b2d7b9a5..92a6b65a34f 100644 
--- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -1,5 +1,13 @@ cmake_minimum_required(VERSION 3.20) +if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug") + find_program(CCACHE_EXECUTABLE "ccache") + if (CCACHE_EXECUTABLE) + message(STATUS "Using ccache") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE) + endif () +endif () + project(tgi-trtllm-backend VERSION 1.0.0) set(CMAKE_CXX_STANDARD 20) From 38b5263c61898b02949c25e24d0a77fc6948c3e0 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 2 Aug 2024 22:11:41 +0000 Subject: [PATCH 08/62] (ffi) add max_new_tokens parameters --- backends/trtllm/include/backend.h | 2 +- backends/trtllm/include/ffi.h | 4 +++- backends/trtllm/lib/backend.cpp | 16 +++++----------- backends/trtllm/src/ffi.cpp | 5 +++-- backends/trtllm/src/lib.rs | 1 + 5 files changed, 13 insertions(+), 15 deletions(-) diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h index 3f89677c4d6..bb31daa9161 100644 --- a/backends/trtllm/include/backend.h +++ b/backends/trtllm/include/backend.h @@ -94,7 +94,7 @@ namespace huggingface::tgi::backends { * @return Request id related to this generation for reference */ [[nodiscard]] RequestId Submit( - const std::vector &tokens, + const std::vector &tokens, const uint32_t maxNewTokens, const int32_t topK, const float_t topP, diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h index fe0be9fc820..df296918507 100644 --- a/backends/trtllm/include/ffi.h +++ b/backends/trtllm/include/ffi.h @@ -37,6 +37,7 @@ namespace huggingface::tgi::backends { /*** * * @param tokens + * @param maxNewTokens * @param topK * @param topP * @param temperature @@ -47,7 +48,8 @@ namespace huggingface::tgi::backends { */ [[nodiscard("returned request id should be used to refer to the request's generation result later on")]] uint64_t - Submit(rust::Slice tokens, int32_t topK, float_t topP, float_t temperature, + Submit(rust::Slice tokens, uint32_t maxNewTokens, + int32_t topK, float_t topP, float_t temperature, float_t repetition_penalty, float_t frequency_penalty, uint64_t seed); /*** diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index 788b7674a99..dc9ffdaafa8 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -103,6 +103,7 @@ size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const [[nodiscard("Returned request id needs to be provided back to gather generated tokens")]] tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( const std::vector &tokens, + const uint32_t maxNewTokens, const int32_t topK, const float_t topP, const float_t temperature, @@ -124,19 +125,12 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( ); #endif - const auto maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get(); - const auto maxNewTokens = static_cast(std::max(1ul, maxNumTokens - tokens.size())); + const auto maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get(); + const auto maxNewTokensChecked = static_cast( + std::min(maxNewTokens, static_cast(maxNumTokens - tokens.size()))); const auto sampling = GetSamplingConfig(topK, topP, temperature, repetition_penalty, frequency_penalty, seed); - const auto output = tle::OutputConfig(true, false, false, true, false); - return executor.enqueueRequest( - tle::Request{tokens, maxNewTokens, true, sampling, output}); -} - 
-[[nodiscard("Generated tokens result must be used")]] -std::vector huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) { - SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId); - return executor.awaitResponses(requestId); + return executor.enqueueRequest(tle::Request{tokens, maxNewTokensChecked, true, sampling, OUTPUT_CONFIG}); } diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index d6317a68c89..beca88ad997 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -25,8 +25,9 @@ bool huggingface::tgi::backends::TensorRtLlmBackendImpl::IsReady() const { } uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit( - rust::Slice tokens, int32_t topK, float_t topP, float_t temperature, float_t repetition_penalty, - float_t frequency_penalty, uint64_t seed) { + rust::Slice tokens, uint32_t maxNewTokens, + int32_t topK, float_t topP, float_t temperature, + float_t repetition_penalty, float_t frequency_penalty, uint64_t seed) { // This will copy all the items from the initial slice std::vector tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end())); diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index 1a804f88973..5253096cca4 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -56,6 +56,7 @@ mod ffi { fn Submit( self: Pin<&mut TensorRtLlmBackendImpl>, tokens: &[u32], + max_new_tokens: u32, top_k: i32, top_p: f32, temperature: f32, From b8a40a0af3e1b781dd268dfafeda3186400beeb5 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 2 Aug 2024 22:14:03 +0000 Subject: [PATCH 09/62] (backend) cleanup a bit --- backends/trtllm/include/backend.h | 2 ++ backends/trtllm/lib/backend.cpp | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h index bb31daa9161..9fda8f87b92 100644 --- a/backends/trtllm/include/backend.h +++ b/backends/trtllm/include/backend.h @@ -23,6 +23,8 @@ namespace huggingface::tgi::backends { using RequestId = tle::IdType; using TokenId = tle::TokenIdType; + const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false); + /** * Initialize all the components required by TRTLLM. 
* It is required to call this function before attempting to load any engine diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index dc9ffdaafa8..2eca477f539 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -12,6 +12,7 @@ void huggingface::tgi::backends::InitializeBackend() { nvmlInit_v2(); initTrtLlmPlugins(); + SPDLOG_INFO("Backend Executor Version: {}", tle::version()); const auto numGpus = huggingface::hardware::cuda::GetNumDevices(); if (numGpus.has_value()) { SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value()); @@ -22,7 +23,7 @@ void huggingface::tgi::backends::InitializeBackend() { [[nodiscard]] tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) { - tle::ExecutorConfig execConfig(1); + tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1); // Retrieve the compute capabilities to enable some options at runtime const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities(); @@ -60,7 +61,7 @@ tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig( const float_t temperature, const float_t repetition_penalty, const float_t frequency_penalty, - const uint64_t seed) { + const uint64_t seed) noexcept { return tle::SamplingConfig( 1, // TGI only use a single beam topK, From f4a74be384725980b80dc6c1da797c1ee9ba820c Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 2 Aug 2024 22:16:28 +0000 Subject: [PATCH 10/62] (backend) expose PullNewTokens --- backends/trtllm/include/backend.h | 20 +--------- backends/trtllm/include/ffi.h | 11 ++---- backends/trtllm/lib/backend.cpp | 17 ++------ backends/trtllm/src/ffi.cpp | 65 ++++++++++++++----------------- backends/trtllm/src/lib.rs | 28 ++++--------- 5 files changed, 47 insertions(+), 94 deletions(-) diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h index 9fda8f87b92..83e862c558e 100644 --- a/backends/trtllm/include/backend.h +++ b/backends/trtllm/include/backend.h @@ -56,7 +56,7 @@ namespace huggingface::tgi::backends { const float_t repetition_penalty, const float_t frequency_penalty, const uint64_t seed - ); + ) noexcept; /** * @@ -72,12 +72,6 @@ namespace huggingface::tgi::backends { const std::filesystem::path &executorWorker ); - /** - * Indicate if the backend is ready to accept incoming request - * @return true if ready, false otherwise - */ - [[nodiscard]] bool IsReady() const; - /** * Query the executor for the number of token available for pulling * @return @@ -106,17 +100,7 @@ namespace huggingface::tgi::backends { const uint64_t seed ); - /** - * - * @param requestId The request id to poll the generation results - * @return - */ - std::vector Poll(RequestId requestId); - - /** - * Stop the underlying executor - */ - void Shutdown(); + [[nodiscard]] std::vector PullNewTokens(); }; } diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h index df296918507..6127d29ac1c 100644 --- a/backends/trtllm/include/ffi.h +++ b/backends/trtllm/include/ffi.h @@ -54,18 +54,13 @@ namespace huggingface::tgi::backends { /*** * - * @param requestId - * @param ctx - * @param callback * @return */ - size_t StreamTokens( - const RequestId requestId, - huggingface::tgi::backends::GenerationContext *ctx, - rust::Fn callback); + std::unique_ptr> PullTokens(); }; + GenerationStep ConvertResponseToGenerationStep(const tle::Response &response); + /*** * * @param engineFolder diff --git a/backends/trtllm/lib/backend.cpp 
b/backends/trtllm/lib/backend.cpp index 2eca477f539..9c9c5dff554 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -84,18 +84,11 @@ huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend( const std::filesystem::path &executorWorker ) : config(json::parse(std::ifstream(enginesFolder / "config.json"))), - executor( - enginesFolder, - tensorrt_llm::executor::ModelType::kDECODER_ONLY, - GetExecutorConfig(config, executorWorker.string() - )) { + executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, + GetExecutorConfig(config, executorWorker.string())) { SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref()); } -bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const { - return executor.canEnqueueRequests(); -} - [[nodiscard("Returned number of requests needs to be consumed")]] size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const { return executor.getNumResponsesReady(); @@ -134,8 +127,6 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( return executor.enqueueRequest(tle::Request{tokens, maxNewTokensChecked, true, sampling, OUTPUT_CONFIG}); } - -void huggingface::tgi::backends::TensorRtLlmBackend::Shutdown() { - SPDLOG_INFO("Shutting down executor"); - executor.shutdown(); +std::vector huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() { + return std::move(executor.awaitResponses()); } diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index beca88ad997..e55204ab1ac 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -35,47 +35,42 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit( std::move(tokens_), topK, topP, temperature, repetition_penalty, frequency_penalty, seed); } -size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens( - const uint64_t requestId, - huggingface::tgi::backends::GenerationContext *ctx, - rust::Fn callback) { - - size_t numTokens = 0; - for (const auto &item: Poll(requestId)) { - GenerationStep step; - if (!item.hasError()) { - SPDLOG_DEBUG("\tStreamTokens -> Decoding token..."); - const auto decoded = item.getResult(); - - const auto token = decoded.outputTokenIds[0][0]; - const auto isFinal = decoded.isFinal; - const auto logProb = decoded.logProbs.value()[0][0]; - - ++numTokens; - - SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal); - step = huggingface::tgi::backends::GenerationStep{ - static_cast(token), logProb, isFinal, false, std::move(std::string()) - }; - SPDLOG_DEBUG("\tStreamTokens -> Post callback"); - } else { - // TODO : Return rest::Result with error - const auto what = item.getErrorMsg(); - SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", what); - step = huggingface::tgi::backends::GenerationStep{ - std::numeric_limits::max(), 0.0, true, true, std::move(what) - }; - } +std::unique_ptr> +huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() { + const auto responses = TensorRtLlmBackend::PullNewTokens(); + auto steps = std::make_unique>(responses.size()); + std::ranges::copy(std::views::transform(responses, ConvertResponseToGenerationStep), std::back_inserter(*steps)); + return steps; +} - callback(std::move(ctx), std::move(step)); +huggingface::tgi::backends::GenerationStep +huggingface::tgi::backends::ConvertResponseToGenerationStep(const tle::Response &response) { + const auto reqId = response.getRequestId(); + if 
(!response.hasError()) { + const auto result = response.getResult(); + return std::move(GenerationStep{ + reqId, + result.outputTokenIds[0][0], + result.logProbs.value()[0][0], + result.isFinal, + false, + std::string() + }); + } else { + return std::move(GenerationStep{ + reqId, + 0, + 0.0, + true, + true, + std::move(response.getErrorMsg()) + }); } - - return numTokens; } std::unique_ptr huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) { + SPDLOG_INFO("Creating TensorRT-LLM Backend"); // Unconditionally call this to initialize and discover TRTLLM plugins InitializeBackend(); diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index 5253096cca4..00a510a77ae 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -1,7 +1,7 @@ -pub use backend::{GenerationContext, TensorRtLlmBackend}; +pub use looper::TensorRtLlmBackendV2; -mod backend; pub mod errors; +mod looper; #[cxx::bridge(namespace = "huggingface::tgi::backends")] mod ffi { @@ -9,6 +9,7 @@ mod ffi { /// Struct used as shared type between rust and C++ to represent the result /// of a single decoding iteration pub struct GenerationStep { + request_id: u64, token_id: u32, log_prob: f32, is_final: bool, @@ -16,10 +17,6 @@ mod ffi { error_msg: String, } - extern "Rust" { - type GenerationContext; - } - unsafe extern "C++" { include!("backends/trtllm/src/ffi.cpp"); @@ -44,10 +41,7 @@ mod ffi { fn CreateTensorRtLlmBackend( engine_folder: &str, executor_worker: &str, - ) -> UniquePtr; - - // #[rust_name = "is_ready"] - // fn IsReady(self: &TensorRtLlmBackendImpl) -> bool; + ) -> Result>; #[rust_name = "num_responses_ready"] fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize; @@ -63,17 +57,11 @@ mod ffi { repetition_penalty: f32, frequency_penalty: f32, seed: u64, - ) -> u64; + ) -> Result; - #[rust_name = "stream_tokens"] - unsafe fn StreamTokens( + #[rust_name = "pull_tokens"] + fn PullTokens( self: Pin<&mut TensorRtLlmBackendImpl>, - request_id: u64, - ctx: *mut GenerationContext, - cb: unsafe fn(*mut GenerationContext, GenerationStep), - ) -> usize; - - // #[rust_name = "shutdown"] - // fn Shutdown(self: Pin<&mut TensorRtLlmBackendImpl>); + ) -> Result>>; } } From 2883c042edaebc961b7c38d005805852dc2bdd53 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 2 Aug 2024 22:17:02 +0000 Subject: [PATCH 11/62] (ffi) cleanup again --- backends/trtllm/include/ffi.h | 9 --------- backends/trtllm/src/ffi.cpp | 4 ---- 2 files changed, 13 deletions(-) diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h index 6127d29ac1c..c2b29500d91 100644 --- a/backends/trtllm/include/ffi.h +++ b/backends/trtllm/include/ffi.h @@ -14,11 +14,8 @@ namespace huggingface::tgi::backends { #include "backends/trtllm/src/lib.rs.h" - namespace huggingface::tgi::backends { -// struct GenerationContext; - class TensorRtLlmBackendImpl : public TensorRtLlmBackend { public: /*** @@ -28,12 +25,6 @@ namespace huggingface::tgi::backends { */ TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker); - /*** - * - * @return - */ - bool IsReady() const; - /*** * * @param tokens diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index e55204ab1ac..1179fc85772 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -20,10 +20,6 @@ huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl( ) : TensorRtLlmBackend(engineFolder, executorWorker) {} -bool 
huggingface::tgi::backends::TensorRtLlmBackendImpl::IsReady() const { - return TensorRtLlmBackend::IsReady(); -} - uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit( rust::Slice tokens, uint32_t maxNewTokens, int32_t topK, float_t topP, float_t temperature, From 33c962ef41795d71df97e240a3c25b5350df66f1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 2 Aug 2024 22:17:32 +0000 Subject: [PATCH 12/62] (ffi) add missing headers imports --- backends/trtllm/include/ffi.h | 2 ++ backends/trtllm/src/ffi.cpp | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h index c2b29500d91..a35449813d4 100644 --- a/backends/trtllm/include/ffi.h +++ b/backends/trtllm/include/ffi.h @@ -5,7 +5,9 @@ #ifndef TGI_TRTLLM_BACKEND_FFI_H #define TGI_TRTLLM_BACKEND_FFI_H +ad#include #include +#include #include "backend.h" namespace huggingface::tgi::backends { diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index 1179fc85772..adaaced6c0c 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -3,11 +3,13 @@ // #pragma once -#include +#include #include #include +#include #include #include +#include #include #include @@ -28,7 +30,7 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit( // This will copy all the items from the initial slice std::vector tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end())); return TensorRtLlmBackend::Submit( - std::move(tokens_), topK, topP, temperature, repetition_penalty, frequency_penalty, seed); + std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed); } std::unique_ptr> From 5f7c0b67c33ab63b7660d7fe19cacabdfeccfafb Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 2 Aug 2024 22:18:18 +0000 Subject: [PATCH 13/62] (ffi) add template specialization to catch and convert to Rust Result --- backends/trtllm/include/ffi.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h index a35449813d4..55f90f9f18a 100644 --- a/backends/trtllm/include/ffi.h +++ b/backends/trtllm/include/ffi.h @@ -14,6 +14,18 @@ namespace huggingface::tgi::backends { class TensorRtLlmBackendImpl; } +// Template to support returning error from TllmException back to Rust in a Result<> +#include + +namespace rust::behavior { + template + static void trycatch(Try &&func, Fail &&fail) noexcept try { + func(); + } catch (tensorrt_llm::common::TllmException &e) { + fail(e.what()); + } +} + #include "backends/trtllm/src/lib.rs.h" namespace huggingface::tgi::backends { From fb759bdd2ac5216e5943a24d0a937c1a506c5b56 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 2 Aug 2024 22:18:39 +0000 Subject: [PATCH 14/62] (looper) new looper initial implementation --- backends/trtllm/src/errors.rs | 2 + backends/trtllm/src/looper.rs | 182 ++++++++++++++++++++++++++++++++++ backends/trtllm/src/main.rs | 181 ++++++++++++++++++++++++++++++--- 3 files changed, 350 insertions(+), 15 deletions(-) create mode 100644 backends/trtllm/src/looper.rs diff --git a/backends/trtllm/src/errors.rs b/backends/trtllm/src/errors.rs index a672d2a406b..8ec6e1afc61 100644 --- a/backends/trtllm/src/errors.rs +++ b/backends/trtllm/src/errors.rs @@ -4,6 +4,8 @@ use text_generation_router::server; #[derive(Debug, Error)] pub enum TensorRtLlmBackendError { + #[error("TensorRT-LLM Runtime error: {0}")] + Runtime(String), 
#[error("Tokenizer error: {0}")] Tokenizer(String), #[error("Argument validation error: {0}")] diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs new file mode 100644 index 00000000000..29866c2fadf --- /dev/null +++ b/backends/trtllm/src/looper.rs @@ -0,0 +1,182 @@ +use std::hint; +use std::ops::Deref; +use std::path::Path; +use std::sync::OnceLock; + +use async_trait::async_trait; +use cxx::UniquePtr; +use hashbrown::HashMap; +use tokenizers::Tokenizer; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; +use tokio::task::JoinHandle; +use tokio_stream::wrappers::UnboundedReceiverStream; +use tracing::{error, info, Level, span}; + +use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; +use text_generation_router::infer::InferError::GenerationError; +use text_generation_router::validation::ValidGenerateRequest; + +use crate::errors::TensorRtLlmBackendError; +use crate::ffi::{create_tensorrt_llm_backend, TensorRtLlmBackendImpl}; + +// Value used to poll the state of the generation stream +static POLLING_INTERVAL_US: OnceLock = OnceLock::new(); + +// It's safe to send the backend between threads +unsafe impl Send for TensorRtLlmBackendImpl {} + +type InferResult = Result; + +fn executor_status_poller( + mut backend: UniquePtr, + mut waiting_requests: UnboundedReceiver, +) { + // Track the tuple (request_id, stream) for each request + let mut in_flights = HashMap::::with_capacity(128); + + // TODO: Does it need a spin-loop? + loop { + span!(Level::DEBUG, "in-flight submit").in_scope(|| { + // Is there any request pending to be scheduled? + let awaiting_requests = waiting_requests.len(); + if awaiting_requests > 0 { + // Retrieve all the requests + let mut requests = Vec::with_capacity(awaiting_requests); + let _ = waiting_requests.recv_many(&mut requests, awaiting_requests); + + // Submit all the request to the executor and move the context to the in-flight tracker + for ctx in requests { + let request = &ctx.request; + let generation_params = &request.parameters; + let stopping_params = &request.stopping_parameters; + + // Submit to the TensorRT-LLM executor for scheduling + match backend.pin_mut().submit( + &vec![], + stopping_params.max_new_tokens, + generation_params.top_k as i32, + generation_params.top_p, + generation_params.temperature, + generation_params.repetition_penalty, + generation_params.frequency_penalty, + generation_params.seed, + ) { + Ok(request_id) => { + // Insert the context linked to the generated request id in the tracker + in_flights.insert(request_id, ctx); + } + Err(e) => { + // Return to the caller + let what = Err(InferError::SchedulingError(e.to_string())); + if let Err(e) = ctx.streamer.send(what) { + error!("Failed to send back through the channel: {}", e); + } + } + }; + } + } + }); + + span!(Level::DEBUG, "in-flight poll").in_scope(|| { + if backend.num_responses_ready() > 0 { + match backend.pin_mut().pull_tokens() { + Ok(responses) => { + for step in responses.deref() { + let request_id = step.request_id; + match in_flights.get(&request_id) { + Some(ctx) => { + info!("New token for {} -> {}", request_id, step.token_id); + + if step.is_final { + let _ = in_flights.remove(&step.request_id); + } + } + None => { + error!("Got step for untracked request {}", request_id); + } + } + } + } + Err(err) => { + error!("Failed to retrieve tokens from the executor: {}", err); + } + } + } + }); + + // Hint the CPU we are spin-locking + hint::spin_loop(); + } +} + +struct GenerationContext { + 
request: ValidGenerateRequest, + streamer: UnboundedSender>, +} + +pub struct TensorRtLlmBackendV2 { + tokenizer: Tokenizer, + looper: JoinHandle<()>, + queue: UnboundedSender, +} + +impl TensorRtLlmBackendV2 { + pub fn new + Send, PP: AsRef + Send>( + tokenizer: Tokenizer, + engine_folder: P, + executor_worker_path: PP, + ) -> Result { + // Retrieve paths as &str for the backend creation + let engine_folder = engine_folder.as_ref(); + let executor_worker_path = executor_worker_path.as_ref(); + + let engine_folder = String::from( + engine_folder + .to_str() + .expect("Failed to convert engine_folder to valid UTF-8"), + ); + + let executor_worker_path = String::from( + executor_worker_path + .to_str() + .expect("Failed to convert executor_worker_path to valid UTF-8"), + ); + + // Allocate the IPC layer to communicate with the backend + let (requests_sender, requests_receiver) = unbounded_channel::(); + + // Create the FFI backend + let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path) + .map_err(|e| TensorRtLlmBackendError::Runtime(e.what().to_string()))?; + + // Looper is responsible for scheduling and pulling requests state at regular interval + let looper = + tokio::task::spawn_blocking(move || executor_status_poller(backend, requests_receiver)); + + Ok(TensorRtLlmBackendV2 { + tokenizer, + looper, + queue: requests_sender, + }) + } +} + +#[async_trait] +impl Backend for TensorRtLlmBackendV2 { + fn schedule( + &self, + request: ValidGenerateRequest, + ) -> Result>, InferError> { + let (streamer, receiver) = unbounded_channel::>(); + match self.queue.send(GenerationContext { request, streamer }) { + Ok(_) => Ok(UnboundedReceiverStream::new(receiver)), + Err(_) => Err(GenerationError( + "Failed to submit request to the backend".into(), + )), + } + } + + async fn health(&self, current_health: bool) -> bool { + current_health & !self.looper.is_finished() + } +} diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index e0ba46c7955..15f40f5ae62 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -1,10 +1,17 @@ +use std::path::{Path, PathBuf}; + use clap::Parser; -use std::collections::HashMap; -use std::path::PathBuf; +use hf_hub::{Cache, Repo, RepoType}; +use hf_hub::api::tokio::{Api, ApiBuilder}; +use tokenizers::Tokenizer; +use tracing::info; + use text_generation_backends_trtllm::errors::TensorRtLlmBackendError; -use text_generation_backends_trtllm::TensorRtLlmBackend; -use text_generation_router::server; -use tokenizers::{FromPretrainedParameters, Tokenizer}; +use text_generation_backends_trtllm::TensorRtLlmBackendV2; +use text_generation_router::{HubTokenizerConfig, server}; +use text_generation_router::server::{ + create_post_processor, get_base_tokenizer, get_hub_model_info, +}; /// App Configuration #[derive(Parser, Debug)] @@ -58,6 +65,147 @@ struct Args { executor_worker: PathBuf, } +async fn get_tokenizer( + tokenizer_name: &str, + tokenizer_config_path: Option<&str>, + revision: Option<&str>, +) -> Option { + // Parse Huggingface hub token + let authorization_token = std::env::var("HF_TOKEN") + .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) + .ok(); + + // Tokenizer instance + let local_path = Path::new(tokenizer_name); + + // Shared API builder initialization + let api_builder = || { + let mut builder = ApiBuilder::new() + .with_progress(false) + .with_token(authorization_token); + + if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") { + builder = builder.with_cache_dir(cache_dir.into()); + 
} + + builder + }; + + // Decide if we need to use the API based on the revision and local path + let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir(); + + // Initialize API if needed + #[derive(Clone)] + enum Type { + Api(Api), + Cache(Cache), + None, + } + let api = if use_api { + if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) { + let cache = std::env::var("HUGGINGFACE_HUB_CACHE") + .map_err(|_| ()) + .map(|cache_dir| Cache::new(cache_dir.into())) + .unwrap_or_else(|_| Cache::default()); + tracing::warn!("Offline mode active using cache defaults"); + Type::Cache(cache) + } else { + tracing::info!("Using the Hugging Face API"); + match api_builder().build() { + Ok(api) => Type::Api(api), + Err(_) => { + tracing::warn!("Unable to build the Hugging Face API"); + Type::None + } + } + } + } else { + Type::None + }; + + // Load tokenizer and model info + let ( + tokenizer_filename, + config_filename, + tokenizer_config_filename, + preprocessor_config_filename, + processor_config_filename, + ) = match api { + Type::None => ( + Some(local_path.join("tokenizer.json")), + Some(local_path.join("config.json")), + Some(local_path.join("tokenizer_config.json")), + Some(local_path.join("preprocessor_config.json")), + Some(local_path.join("processor_config.json")), + ), + Type::Api(api) => { + let api_repo = api.repo(Repo::with_revision( + tokenizer_name.to_string(), + RepoType::Model, + revision.unwrap_or_else(|| "main").to_string(), + )); + + let tokenizer_filename = match api_repo.get("tokenizer.json").await { + Ok(tokenizer_filename) => Some(tokenizer_filename), + Err(_) => get_base_tokenizer(&api, &api_repo).await, + }; + let config_filename = api_repo.get("config.json").await.ok(); + let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok(); + let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok(); + let processor_config_filename = api_repo.get("processor_config.json").await.ok(); + + ( + tokenizer_filename, + config_filename, + tokenizer_config_filename, + preprocessor_config_filename, + processor_config_filename, + ) + } + Type::Cache(cache) => { + let repo = cache.repo(Repo::with_revision( + tokenizer_name.to_string(), + RepoType::Model, + revision.clone().unwrap_or_else(|| "main").to_string(), + )); + ( + repo.get("tokenizer.json"), + repo.get("config.json"), + repo.get("tokenizer_config.json"), + repo.get("preprocessor_config.json"), + repo.get("processor_config.json"), + ) + } + }; + + // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. 
+ let tokenizer_config: Option = if let Some(filename) = tokenizer_config_path + { + HubTokenizerConfig::from_file(filename) + } else { + tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) + }; + let tokenizer_config = tokenizer_config.unwrap_or_else(|| { + tracing::warn!("Could not find tokenizer config locally and no API specified"); + HubTokenizerConfig::default() + }); + + tokenizer_filename.and_then(|filename| { + let mut tokenizer = Tokenizer::from_file(filename).ok(); + if let Some(tokenizer) = &mut tokenizer { + if let Some(class) = &tokenizer_config.tokenizer_class { + if class == "LlamaTokenizer" || class == "LlamaTokenizerFast"{ + if let Ok(post_processor) = create_post_processor(tokenizer, &tokenizer_config) { + tracing::info!("Overriding LlamaTokenizer with TemplateProcessing to follow python override defined in https://github.com/huggingface/transformers/blob/4aa17d00690b7f82c95bb2949ea57e22c35b4336/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205"); + tokenizer.with_post_processor(post_processor); + } + } + } + } + tokenizer + }) +} + #[tokio::main] async fn main() -> Result<(), TensorRtLlmBackendError> { // Get args @@ -124,18 +272,21 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { ))); } - // Run server - let tokenizer = Tokenizer::from_pretrained( - tokenizer_name.clone(), - Some(FromPretrainedParameters { - revision: revision.clone().unwrap_or(String::from("main")), - user_agent: HashMap::new(), - auth_token, - }), + // Create the backend + let tokenizer = get_tokenizer( + &tokenizer_name, + tokenizer_config_path.as_deref(), + revision.as_deref(), ) - .map_err(|e| TensorRtLlmBackendError::Tokenizer(e.to_string()))?; + .await + .expect("Failed to retrieve tokenizer implementation"); - let backend = TensorRtLlmBackend::new(tokenizer, model_id, executor_worker)?; + info!("Successfully retrieved tokenizer {}", &tokenizer_name); + let backend = TensorRtLlmBackendV2::new(tokenizer, model_id, executor_worker)?; + + info!("Successfully created backend"); + + // Run server server::run( backend, max_concurrent_requests, From 0b0c30fe8b067ae678a3a41c903868e3a2ca1dbd Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sat, 3 Aug 2024 21:55:04 +0000 Subject: [PATCH 15/62] (ffi) remove narrowing type warning --- backends/trtllm/include/ffi.h | 2 +- backends/trtllm/src/ffi.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h index 55f90f9f18a..f4a998b22a7 100644 --- a/backends/trtllm/include/ffi.h +++ b/backends/trtllm/include/ffi.h @@ -5,7 +5,7 @@ #ifndef TGI_TRTLLM_BACKEND_FFI_H #define TGI_TRTLLM_BACKEND_FFI_H -ad#include +#include #include #include #include "backend.h" diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index adaaced6c0c..b9f3d009997 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -48,7 +48,7 @@ huggingface::tgi::backends::ConvertResponseToGenerationStep(const tle::Response const auto result = response.getResult(); return std::move(GenerationStep{ reqId, - result.outputTokenIds[0][0], + static_cast(result.outputTokenIds[0][0]), result.logProbs.value()[0][0], result.isFinal, false, From 933ab67aa1322aaa2062d001e6b7d5cb1cf6c39a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 5 Aug 2024 07:56:14 +0000 Subject: [PATCH 16/62] (ffi) encode the provided user prompt within each request thread --- backends/trtllm/src/lib.rs | 1 + backends/trtllm/src/looper.rs | 60 
++++++++++++++++++++++++++++++----- backends/trtllm/src/utils.rs | 22 +++++++++++++ 3 files changed, 75 insertions(+), 8 deletions(-) create mode 100644 backends/trtllm/src/utils.rs diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index 00a510a77ae..e6e97c03b2c 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -2,6 +2,7 @@ pub use looper::TensorRtLlmBackendV2; pub mod errors; mod looper; +mod utils; #[cxx::bridge(namespace = "huggingface::tgi::backends")] mod ffi { diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 29866c2fadf..3db7d1ab445 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -6,7 +6,7 @@ use std::sync::OnceLock; use async_trait::async_trait; use cxx::UniquePtr; use hashbrown::HashMap; -use tokenizers::Tokenizer; +use tokenizers::{Encoding, Tokenizer}; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; use tokio::task::JoinHandle; use tokio_stream::wrappers::UnboundedReceiverStream; @@ -14,10 +14,12 @@ use tracing::{error, info, Level, span}; use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; use text_generation_router::infer::InferError::GenerationError; -use text_generation_router::validation::ValidGenerateRequest; +use text_generation_router::validation::{Chunk, ValidationError, ValidGenerateRequest}; +use text_generation_router::validation::ValidationError::UnsupportedModality; use crate::errors::TensorRtLlmBackendError; use crate::ffi::{create_tensorrt_llm_backend, TensorRtLlmBackendImpl}; +use crate::utils::first_line; // Value used to poll the state of the generation stream static POLLING_INTERVAL_US: OnceLock = OnceLock::new(); @@ -27,6 +29,11 @@ unsafe impl Send for TensorRtLlmBackendImpl {} type InferResult = Result; +struct ValidGenerateRequestWithTokens { + encoding: Encoding, + inner: ValidGenerateRequest, +} + fn executor_status_poller( mut backend: UniquePtr, mut waiting_requests: UnboundedReceiver, @@ -47,12 +54,12 @@ fn executor_status_poller( // Submit all the request to the executor and move the context to the in-flight tracker for ctx in requests { let request = &ctx.request; - let generation_params = &request.parameters; - let stopping_params = &request.stopping_parameters; + let generation_params = &request.inner.parameters; + let stopping_params = &request.inner.stopping_parameters; // Submit to the TensorRT-LLM executor for scheduling match backend.pin_mut().submit( - &vec![], + request.encoding.get_ids(), stopping_params.max_new_tokens, generation_params.top_k as i32, generation_params.top_p, @@ -110,7 +117,7 @@ fn executor_status_poller( } struct GenerationContext { - request: ValidGenerateRequest, + request: ValidGenerateRequestWithTokens, streamer: UnboundedSender>, } @@ -147,7 +154,7 @@ impl TensorRtLlmBackendV2 { // Create the FFI backend let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path) - .map_err(|e| TensorRtLlmBackendError::Runtime(e.what().to_string()))?; + .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?; // Looper is responsible for scheduling and pulling requests state at regular interval let looper = @@ -159,15 +166,52 @@ impl TensorRtLlmBackendV2 { queue: requests_sender, }) } + + fn validate(request: &ValidGenerateRequest) -> InferResult<&String> { + if request.top_n_tokens > 1 { + return Err(InferError::ValidationError( + ValidationError::TopNTokensDisabled, + )); + } + + // TODO: Is it really needed? 
How can it be validated before? + if request.parameters.grammar.is_some() { + return Err(InferError::ValidationError(ValidationError::Grammar)); + } + + match request.inputs.len() { + 0 => Err(InferError::ValidationError(ValidationError::EmptyInput)), + 2.. => Err(InferError::GenerationError( + "TensorRT-LLM backend don't support multi-chunk".into(), + )), + 1 => match request.inputs.first().expect("Single item-chunk") { + Chunk::Text(text) => Ok(text), + Chunk::Image(_) => Err(InferError::ValidationError(UnsupportedModality("image"))), + }, + } + } } #[async_trait] impl Backend for TensorRtLlmBackendV2 { fn schedule( &self, - request: ValidGenerateRequest, + inner: ValidGenerateRequest, ) -> Result>, InferError> { + let prompt = Self::validate(&inner)?; + + // We encode the prompt in every request context/thread + let encoding = self + .tokenizer + .encode(prompt.as_str(), true) + .map_err(|e| GenerationError(format!("Tokenization failed {}", e.to_string())))?; + + let request = ValidGenerateRequestWithTokens { encoding, inner }; + + // Open-up the stream to send tokens let (streamer, receiver) = unbounded_channel::>(); + + // Send the context to the executor for scheduling match self.queue.send(GenerationContext { request, streamer }) { Ok(_) => Ok(UnboundedReceiverStream::new(receiver)), Err(_) => Err(GenerationError( diff --git a/backends/trtllm/src/utils.rs b/backends/trtllm/src/utils.rs new file mode 100644 index 00000000000..4dedb007863 --- /dev/null +++ b/backends/trtllm/src/utils.rs @@ -0,0 +1,22 @@ +/// +/// Extract the first line of the provided string reference. +/// If there is no lines in the buffer, it returns a string +/// which content is defined by the content of `fail` +/// # Arguments +/// +/// * `s`: The string buffer to extract the first-line from +/// * `fail`: A string content which is returned if no lines are +/// present in `s` +/// +/// returns: String +/// +/// # Examples +/// +/// ``` +/// let s = "My name is Morgan.\n I'm working at Hugging Face."; +/// first_line(s, "No line in string"); +/// ``` +#[inline] +pub(crate) fn first_line(s: &str, fail: &str) -> String { + s.lines().next().unwrap_or(fail).to_string() +} From 0dca168bcbdaec8b3283cdf338ff7c2868249a16 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 5 Aug 2024 11:41:29 +0000 Subject: [PATCH 17/62] (misc) change scope identifiers --- backends/trtllm/src/looper.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 3db7d1ab445..c7225062ea0 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -43,7 +43,7 @@ fn executor_status_poller( // TODO: Does it need a spin-loop? loop { - span!(Level::DEBUG, "in-flight submit").in_scope(|| { + span!(Level::DEBUG, "[in-flight][submit]").in_scope(|| { // Is there any request pending to be scheduled? 
let awaiting_requests = waiting_requests.len(); if awaiting_requests > 0 { @@ -84,7 +84,7 @@ fn executor_status_poller( } }); - span!(Level::DEBUG, "in-flight poll").in_scope(|| { + span!(Level::DEBUG, "[in-flight][poll]").in_scope(|| { if backend.num_responses_ready() > 0 { match backend.pin_mut().pull_tokens() { Ok(responses) => { From c2e21d87254a834a2b909779d53e61734e10ec1b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 5 Aug 2024 13:27:18 +0000 Subject: [PATCH 18/62] (backend) implement the post_processor background thread --- backends/trtllm/src/lib.rs | 2 +- backends/trtllm/src/looper.rs | 146 +++++++++++++++++++++++++++++----- 2 files changed, 129 insertions(+), 19 deletions(-) diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index e6e97c03b2c..edd8caff154 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -6,9 +6,9 @@ mod utils; #[cxx::bridge(namespace = "huggingface::tgi::backends")] mod ffi { - /// Struct used as shared type between rust and C++ to represent the result /// of a single decoding iteration + #[derive(Debug, Clone)] pub struct GenerationStep { request_id: u64, token_id: u32, diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index c7225062ea0..7d8058632e2 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -6,19 +6,22 @@ use std::sync::OnceLock; use async_trait::async_trait; use cxx::UniquePtr; use hashbrown::HashMap; +use log::warn; use tokenizers::{Encoding, Tokenizer}; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; -use tokio::task::JoinHandle; +use tokio::task::{JoinHandle, spawn_blocking}; +use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{error, info, Level, span}; -use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; +use text_generation_router::{FinishReason, Token}; +use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::infer::InferError::GenerationError; use text_generation_router::validation::{Chunk, ValidationError, ValidGenerateRequest}; use text_generation_router::validation::ValidationError::UnsupportedModality; use crate::errors::TensorRtLlmBackendError; -use crate::ffi::{create_tensorrt_llm_backend, TensorRtLlmBackendImpl}; +use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; use crate::utils::first_line; // Value used to poll the state of the generation stream @@ -34,15 +37,21 @@ struct ValidGenerateRequestWithTokens { inner: ValidGenerateRequest, } +struct DecodedTokenContext { + tokens: Vec, + ctx: UnboundedSender>, +} + fn executor_status_poller( mut backend: UniquePtr, mut waiting_requests: UnboundedReceiver, + mut post_processor_sender: UnboundedSender, ) { // Track the tuple (request_id, stream) for each request let mut in_flights = HashMap::::with_capacity(128); // TODO: Does it need a spin-loop? - loop { + 'executor: loop { span!(Level::DEBUG, "[in-flight][submit]").in_scope(|| { // Is there any request pending to be scheduled? 
let awaiting_requests = waiting_requests.len(); @@ -84,18 +93,40 @@ fn executor_status_poller( } }); - span!(Level::DEBUG, "[in-flight][poll]").in_scope(|| { + if let Err(e) = span!(Level::DEBUG, "[in-flight][poll]").in_scope(|| { if backend.num_responses_ready() > 0 { match backend.pin_mut().pull_tokens() { Ok(responses) => { + // worse case scenario is one token for each response: with_capacity(responses.len()) + // grouper will group decoded tokens per request to decode multiple tokens + let mut grouper: HashMap = + HashMap::with_capacity(responses.len()); + + // Iterate through all the decoded token for step in responses.deref() { let request_id = step.request_id; + match in_flights.get(&request_id) { Some(ctx) => { info!("New token for {} -> {}", request_id, step.token_id); - if step.is_final { - let _ = in_flights.remove(&step.request_id); + if !step.has_error { + let req_group = grouper.entry_ref(&request_id).or_insert( + DecodedTokenContext { + tokens: vec![], + ctx: ctx.streamer.clone(), // Arc::clone() = cheap + }, + ); + req_group.tokens.push(step.clone()); // Should be ultra cheap + + if step.is_final { + let _ = in_flights.remove(&step.request_id); + } + } else { + warn!( + "Error for request: {} -> {}", + request_id, &step.error_msg + ); } } None => { @@ -103,19 +134,87 @@ fn executor_status_poller( } } } + + grouper + .into_values() + .map(|ctx| post_processor_sender.send(ctx)) + .collect()?; } Err(err) => { error!("Failed to retrieve tokens from the executor: {}", err); } } } - }); + + Ok(()) + }) { + error!( + "Caught an fatal error in the executor's loop, about to exit. {}", + e + ); + break 'executor; + } // Hint the CPU we are spin-locking hint::spin_loop(); } } +fn post_processor_looper( + tokenizer: Tokenizer, + mut decoded_tokens: UnboundedReceiver, +) { + 'post_processor: loop { + if decoded_tokens.is_closed() { + warn!("Post processor IPC is closed, loop will exit now."); + break 'post_processor; + } + + if let Some(ctx) = decoded_tokens.blocking_recv() { + ctx.tokens.iter().for_each(|step| { + let out = match tokenizer.decode(&[step.token_id], true) { + Ok(text) => { + let is_special = tokenizer.get_added_vocabulary().is_special_token(&text); + let token = Token { + id: step.token_id, + text, + logprob: step.log_prob, + special: is_special, + }; + + let response = if !step.is_final { + InferStreamResponse::Intermediate { + token, + top_tokens: vec![], + } + } else { + InferStreamResponse::End { + token, + top_tokens: vec![], + generated_text: GeneratedText { + text: String::from(""), + generated_tokens: 0, + finish_reason: FinishReason::Length, + seed: None, + }, + start: Instant::now(), // Handle start time + queued: Instant::now(), // Handle queued time + } + }; + + Ok(response) + } + Err(e) => Err(GenerationError(e.to_string())), + }; + + if let Err(e) = ctx.ctx.send(out) { + warn!("Failed to send back the decoded tokens: {}", e); + }; + }); + } + } +} + struct GenerationContext { request: ValidGenerateRequestWithTokens, streamer: UnboundedSender>, @@ -123,8 +222,9 @@ struct GenerationContext { pub struct TensorRtLlmBackendV2 { tokenizer: Tokenizer, - looper: JoinHandle<()>, - queue: UnboundedSender, + executor_looper: JoinHandle<()>, + post_processor_looper: JoinHandle<()>, + executor: UnboundedSender, } impl TensorRtLlmBackendV2 { @@ -150,20 +250,28 @@ impl TensorRtLlmBackendV2 { ); // Allocate the IPC layer to communicate with the backend - let (requests_sender, requests_receiver) = unbounded_channel::(); + let (executor_sender, executor_receiver) = 
unbounded_channel(); + let (post_processor_sender, post_processor_receiver) = unbounded_channel(); // Create the FFI backend let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path) .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?; - // Looper is responsible for scheduling and pulling requests state at regular interval - let looper = - tokio::task::spawn_blocking(move || executor_status_poller(backend, requests_receiver)); + // Executor looper is responsible for scheduling and pulling requests state at regular interval + let executor_looper = spawn_blocking(move || { + executor_status_poller(backend, executor_receiver, post_processor_sender) + }); + + // Post processor looper is responsible from receiving a bunch of tokens, decoding them and sending them back to the user + let tokenizer_ = tokenizer.clone(); + let post_processor_looper = + spawn_blocking(move || post_processor_looper(tokenizer_, post_processor_receiver)); Ok(TensorRtLlmBackendV2 { tokenizer, - looper, - queue: requests_sender, + executor_looper, + post_processor_looper, + executor: executor_sender, }) } @@ -212,7 +320,7 @@ impl Backend for TensorRtLlmBackendV2 { let (streamer, receiver) = unbounded_channel::>(); // Send the context to the executor for scheduling - match self.queue.send(GenerationContext { request, streamer }) { + match self.executor.send(GenerationContext { request, streamer }) { Ok(_) => Ok(UnboundedReceiverStream::new(receiver)), Err(_) => Err(GenerationError( "Failed to submit request to the backend".into(), @@ -221,6 +329,8 @@ impl Backend for TensorRtLlmBackendV2 { } async fn health(&self, current_health: bool) -> bool { - current_health & !self.looper.is_finished() + current_health + & !self.executor_looper.is_finished() + & !self.post_processor_looper.is_finished() } } From 7bebc629afe2044a96e2ee515a74be6a70534433 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 5 Aug 2024 13:39:14 +0000 Subject: [PATCH 19/62] (misc) missing Result types for Rust --- backends/trtllm/src/looper.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 7d8058632e2..4bcf8a574ba 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -9,6 +9,7 @@ use hashbrown::HashMap; use log::warn; use tokenizers::{Encoding, Tokenizer}; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; +use tokio::sync::mpsc::error::SendError; use tokio::task::{JoinHandle, spawn_blocking}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; @@ -111,7 +112,7 @@ fn executor_status_poller( info!("New token for {} -> {}", request_id, step.token_id); if !step.has_error { - let req_group = grouper.entry_ref(&request_id).or_insert( + let req_group = grouper.entry(request_id).or_insert( DecodedTokenContext { tokens: vec![], ctx: ctx.streamer.clone(), // Arc::clone() = cheap @@ -138,7 +139,7 @@ fn executor_status_poller( grouper .into_values() .map(|ctx| post_processor_sender.send(ctx)) - .collect()?; + .collect::>>()?; } Err(err) => { error!("Failed to retrieve tokens from the executor: {}", err); @@ -146,7 +147,7 @@ fn executor_status_poller( } } - Ok(()) + Ok::<(), SendError>(()) }) { error!( "Caught an fatal error in the executor's loop, about to exit. 
{}", From 291eaa99fbe8ff2a733070e48cc0d76b8c466c05 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 8 Aug 2024 00:33:10 +0200 Subject: [PATCH 20/62] use blocking_recv in looper to consume awaiting_requests at max before pulling in a single step --- backends/trtllm/src/looper.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 4bcf8a574ba..c287fa553a0 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -56,13 +56,10 @@ fn executor_status_poller( span!(Level::DEBUG, "[in-flight][submit]").in_scope(|| { // Is there any request pending to be scheduled? let awaiting_requests = waiting_requests.len(); - if awaiting_requests > 0 { + for _ in 0..awaiting_requests { // Retrieve all the requests - let mut requests = Vec::with_capacity(awaiting_requests); - let _ = waiting_requests.recv_many(&mut requests, awaiting_requests); - - // Submit all the request to the executor and move the context to the in-flight tracker - for ctx in requests { + if let Some(ctx) = waiting_requests.blocking_recv() { + // Submit all the request to the executor and move the context to the in-flight tracker let request = &ctx.request; let generation_params = &request.inner.parameters; let stopping_params = &request.inner.stopping_parameters; From 089c5fe66847bf32f93a6ee089fc6d4142869dd2 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 8 Aug 2024 10:53:25 +0000 Subject: [PATCH 21/62] (server) forward auth_token to server::run --- backends/trtllm/src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index 15f40f5ae62..e78134b94d1 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -10,7 +10,7 @@ use text_generation_backends_trtllm::errors::TensorRtLlmBackendError; use text_generation_backends_trtllm::TensorRtLlmBackendV2; use text_generation_router::{HubTokenizerConfig, server}; use text_generation_router::server::{ - create_post_processor, get_base_tokenizer, get_hub_model_info, + create_post_processor, get_base_tokenizer, }; /// App Configuration @@ -296,7 +296,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { max_input_tokens, max_total_tokens, validation_workers, - None, + auth_token, tokenizer_name, tokenizer_config_path, revision, From dddc9a44bd33b56a24c91d0b933c669dc6f21c29 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 8 Aug 2024 09:44:15 +0200 Subject: [PATCH 22/62] (build) fetchcontent use archives instead of git --- backends/trtllm/Dockerfile | 1 + backends/trtllm/cmake/fmt.cmake | 3 +-- backends/trtllm/cmake/spdlog.cmake | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/backends/trtllm/Dockerfile b/backends/trtllm/Dockerfile index 5fd2f89f25f..79e6e8e9310 100644 --- a/backends/trtllm/Dockerfile +++ b/backends/trtllm/Dockerfile @@ -86,6 +86,7 @@ FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime WORKDIR /usr/local/tgi/bin ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" +ENV OMPI_MCA_plm_rsh_agent="" COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt diff --git a/backends/trtllm/cmake/fmt.cmake b/backends/trtllm/cmake/fmt.cmake index f94a9c5668f..840280ca8ba 100644 --- a/backends/trtllm/cmake/fmt.cmake +++ b/backends/trtllm/cmake/fmt.cmake @@ -1,6 +1,5 @@ FetchContent_Declare( fmt - 
GIT_REPOSITORY https://github.com/fmtlib/fmt - GIT_TAG 11.0.1 + URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz ) FetchContent_MakeAvailable(fmt) diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake index c4ee5c97a58..97a4d449cef 100644 --- a/backends/trtllm/cmake/spdlog.cmake +++ b/backends/trtllm/cmake/spdlog.cmake @@ -11,7 +11,6 @@ endif () fetchcontent_declare( spdlog - GIT_REPOSITORY https://github.com/gabime/spdlog.git - GIT_TAG v1.14.1 + URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz ) fetchcontent_makeavailable(spdlog) From 8e648ce425c5e90beea4a511fc769ddf62ecd247 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 9 Aug 2024 22:45:18 +0200 Subject: [PATCH 23/62] (ffi) fix usage of wrong vector constructor making a capacity fill call --- backends/trtllm/include/ffi.h | 2 -- backends/trtllm/src/ffi.cpp | 58 ++++++++++++++++--------------- backends/trtllm/src/looper.rs | 64 ++++++++++++++++++++++------------- 3 files changed, 72 insertions(+), 52 deletions(-) diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h index f4a998b22a7..449bcd4d739 100644 --- a/backends/trtllm/include/ffi.h +++ b/backends/trtllm/include/ffi.h @@ -64,8 +64,6 @@ namespace huggingface::tgi::backends { std::unique_ptr> PullTokens(); }; - GenerationStep ConvertResponseToGenerationStep(const tle::Response &response); - /*** * * @param engineFolder diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index b9f3d009997..b15a4c40542 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -36,34 +36,38 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit( std::unique_ptr> huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() { const auto responses = TensorRtLlmBackend::PullNewTokens(); - auto steps = std::make_unique>(responses.size()); - std::ranges::copy(std::views::transform(responses, ConvertResponseToGenerationStep), std::back_inserter(*steps)); - return steps; -} -huggingface::tgi::backends::GenerationStep -huggingface::tgi::backends::ConvertResponseToGenerationStep(const tle::Response &response) { - const auto reqId = response.getRequestId(); - if (!response.hasError()) { - const auto result = response.getResult(); - return std::move(GenerationStep{ - reqId, - static_cast(result.outputTokenIds[0][0]), - result.logProbs.value()[0][0], - result.isFinal, - false, - std::string() - }); - } else { - return std::move(GenerationStep{ - reqId, - 0, - 0.0, - true, - true, - std::move(response.getErrorMsg()) - }); - } + auto steps = std::make_unique>(); + steps->reserve(responses.size()); + + SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size()); + + // Transform tle::Response to GenerationStep + std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [&](const Response &r) { + const auto reqId = r.getRequestId(); + if (!r.hasError()) { + const auto result = r.getResult(); + return GenerationStep{ + reqId, + static_cast(result.outputTokenIds[0][0]), + result.logProbs.value()[0][0], + result.isFinal, + false, + std::string() + }; + } else { + return GenerationStep{ + reqId, + 0, + 0.0, + true, + true, + std::move(r.getErrorMsg()) + }; + } + }); + + return steps; } std::unique_ptr diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index c287fa553a0..f070bad6a43 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -8,18 +8,20 @@ use cxx::UniquePtr; use 
hashbrown::HashMap; use log::warn; use tokenizers::{Encoding, Tokenizer}; -use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; use tokio::sync::mpsc::error::SendError; -use tokio::task::{JoinHandle, spawn_blocking}; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; +use tokio::task::{spawn_blocking, JoinHandle}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{error, info, Level, span}; +use tracing::{debug, error, info, span, Level}; -use text_generation_router::{FinishReason, Token}; +use text_generation_router::infer::InferError::{GenerationError, ValidationError}; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; -use text_generation_router::infer::InferError::GenerationError; -use text_generation_router::validation::{Chunk, ValidationError, ValidGenerateRequest}; -use text_generation_router::validation::ValidationError::UnsupportedModality; +use text_generation_router::validation::ValidationError::{ + EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality, +}; +use text_generation_router::validation::{Chunk, ValidGenerateRequest}; +use text_generation_router::{FinishReason, Token}; use crate::errors::TensorRtLlmBackendError; use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; @@ -95,6 +97,8 @@ fn executor_status_poller( if backend.num_responses_ready() > 0 { match backend.pin_mut().pull_tokens() { Ok(responses) => { + debug!("Received {} tokens from the executor", responses.len()); + // worse case scenario is one token for each response: with_capacity(responses.len()) // grouper will group decoded tokens per request to decode multiple tokens let mut grouper: HashMap = @@ -102,33 +106,49 @@ fn executor_status_poller( // Iterate through all the decoded token for step in responses.deref() { - let request_id = step.request_id; - - match in_flights.get(&request_id) { + match in_flights.get(&step.request_id) { Some(ctx) => { - info!("New token for {} -> {}", request_id, step.token_id); + debug!( + "{} -> (token={}, final={})", + step.request_id, step.token_id, step.is_final + ); + // If no error, let's forward to post-processor if !step.has_error { - let req_group = grouper.entry(request_id).or_insert( + let req_group = grouper.entry(step.request_id).or_insert( DecodedTokenContext { tokens: vec![], ctx: ctx.streamer.clone(), // Arc::clone() = cheap }, ); req_group.tokens.push(step.clone()); // Should be ultra cheap - - if step.is_final { - let _ = in_flights.remove(&step.request_id); - } } else { warn!( "Error for request: {} -> {}", - request_id, &step.error_msg + step.request_id, &step.error_msg ); + + // TODO: Send something back to the postprocessor for the client? + } + + // Remove from tracked requests + if step.is_final { + let _ = in_flights.remove(&step.request_id); } } None => { - error!("Got step for untracked request {}", request_id); + if step.has_error { + error!( + "Untracked request {} -> {}", + step.request_id, &step.error_msg + ); + continue; + } else { + error!( + "Got step for untracked request {}", + step.request_id + ); + } } } } @@ -275,18 +295,16 @@ impl TensorRtLlmBackendV2 { fn validate(request: &ValidGenerateRequest) -> InferResult<&String> { if request.top_n_tokens > 1 { - return Err(InferError::ValidationError( - ValidationError::TopNTokensDisabled, - )); + return Err(InferError::ValidationError(TopNTokensDisabled)); } // TODO: Is it really needed? 
How can it be validated before? if request.parameters.grammar.is_some() { - return Err(InferError::ValidationError(ValidationError::Grammar)); + return Err(InferError::ValidationError(Grammar)); } match request.inputs.len() { - 0 => Err(InferError::ValidationError(ValidationError::EmptyInput)), + 0 => Err(InferError::ValidationError(EmptyInput)), 2.. => Err(InferError::GenerationError( "TensorRT-LLM backend don't support multi-chunk".into(), )), From 3d0e90b63112cbc8bfe1ba8a56a26eed6b23b4b3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sat, 10 Aug 2024 00:21:18 +0200 Subject: [PATCH 24/62] (ffi) missing namespace for tle::Response --- backends/trtllm/src/ffi.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index b15a4c40542..5121b75ca99 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -43,7 +43,7 @@ huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() { SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size()); // Transform tle::Response to GenerationStep - std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [&](const Response &r) { + std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [&](const tle::Response &r) { const auto reqId = r.getRequestId(); if (!r.hasError()) { const auto result = r.getResult(); From 483f1729387e5a834f7ea9ba40a255a8dbc45666 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 11 Aug 2024 14:10:12 +0200 Subject: [PATCH 25/62] (ffi) do not use reference capture in lambda as we are not capturing anything --- backends/trtllm/src/ffi.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index 5121b75ca99..35b0a48f8fa 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -43,7 +43,7 @@ huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() { SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size()); // Transform tle::Response to GenerationStep - std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [&](const tle::Response &r) { + std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) { const auto reqId = r.getRequestId(); if (!r.hasError()) { const auto result = r.getResult(); From b1846fb4e6e963a24405088604142b9ca4600b2d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 11 Aug 2024 14:10:28 +0200 Subject: [PATCH 26/62] (backend) refactor & cleanup --- backends/trtllm/src/lib.rs | 9 +- backends/trtllm/src/looper.rs | 242 ++++++++++++++-------------------- 2 files changed, 108 insertions(+), 143 deletions(-) diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index edd8caff154..ca4ca024dc5 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -4,14 +4,17 @@ pub mod errors; mod looper; mod utils; +pub(crate) type RequestId = u64; +pub(crate) type TokenId = u32; + #[cxx::bridge(namespace = "huggingface::tgi::backends")] mod ffi { /// Struct used as shared type between rust and C++ to represent the result /// of a single decoding iteration #[derive(Debug, Clone)] pub struct GenerationStep { - request_id: u64, - token_id: u32, + request_id: RequestId, + token_id: TokenId, log_prob: f32, is_final: bool, has_error: bool, @@ -50,7 +53,7 @@ mod ffi { #[rust_name = "submit"] fn Submit( self: Pin<&mut 
TensorRtLlmBackendImpl>, - tokens: &[u32], + tokens: &[TokenId], max_new_tokens: u32, top_k: i32, top_p: f32, diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index f070bad6a43..99d75b81c75 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -1,11 +1,10 @@ use std::hint; use std::ops::Deref; use std::path::Path; -use std::sync::OnceLock; use async_trait::async_trait; use cxx::UniquePtr; -use hashbrown::HashMap; +use hashbrown::{HashMap, HashSet}; use log::warn; use tokenizers::{Encoding, Tokenizer}; use tokio::sync::mpsc::error::SendError; @@ -13,7 +12,7 @@ use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; use tokio::task::{spawn_blocking, JoinHandle}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{debug, error, info, span, Level}; +use tracing::{debug, debug_span, error, info, info_span, span, Level}; use text_generation_router::infer::InferError::{GenerationError, ValidationError}; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; @@ -26,32 +25,74 @@ use text_generation_router::{FinishReason, Token}; use crate::errors::TensorRtLlmBackendError; use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; use crate::utils::first_line; +use crate::RequestId; -// Value used to poll the state of the generation stream -static POLLING_INTERVAL_US: OnceLock = OnceLock::new(); +type InferResult = Result; -// It's safe to send the backend between threads -unsafe impl Send for TensorRtLlmBackendImpl {} +struct IdentifiableRequest { + request_id: RequestId, + inner: T, +} -type InferResult = Result; +macro_rules! identifiable { + ($id: expr, $inner: expr) => { + IdentifiableRequest { + id: $id, + inner: $inner, + } + }; +} +/// Wrap the TGI server forwarded ValidGenerateRequest with the tokenized view of the prompt struct ValidGenerateRequestWithTokens { encoding: Encoding, inner: ValidGenerateRequest, } +/// Wrap the requests along with the channel used to stream back to the client the decoded tokens +struct GenerationContext { + request: ValidGenerateRequestWithTokens, + start: Instant, + queued: Option, + streamer: UnboundedSender>, +} + +#[derive(Debug, Copy, Clone)] +struct DecodedToken { + id: u32, + log_prob: f32, + is_final: bool, +} + +impl TryFrom for DecodedToken { + type Error = InferError; + + fn try_from(step: GenerationStep) -> Result { + if !step.has_error { + Ok(Self { + id: step.token_id, + log_prob: step.log_prob, + is_final: step.is_final, + }) + } else { + Err(GenerationError(step.error_msg)) + } + } +} + +/// Wraps the decoded token with the channel used to stream back to the client the decoded tokens struct DecodedTokenContext { - tokens: Vec, - ctx: UnboundedSender>, + token: DecodedToken, + channel: UnboundedSender>, } -fn executor_status_poller( +fn executor_status_looper( mut backend: UniquePtr, mut waiting_requests: UnboundedReceiver, - mut post_processor_sender: UnboundedSender, + mut post_processor_sender: UnboundedSender, ) { // Track the tuple (request_id, stream) for each request - let mut in_flights = HashMap::::with_capacity(128); + let mut in_flights = HashMap::::with_capacity(128); // TODO: Does it need a spin-loop? 
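For orientation, the split introduced by this refactor reduces to a small topology: one blocking loop owns the executor handle and the in-flight map, a second blocking loop owns the tokenizer, unbounded channels connect the two, and every request carries its own reply channel. A rough sketch of that shape using only std threads and channels (the Step struct, the token values and the channel payloads are placeholders, not the backend's real types):

    use std::collections::HashMap;
    use std::sync::mpsc::{channel, Receiver, Sender};
    use std::thread;

    // Placeholder for the FFI GenerationStep; only the routing-relevant fields are kept.
    struct Step { request_id: u64, token_id: u32, is_final: bool }

    fn executor_loop(requests: Receiver<(u64, Sender<u32>)>, decoded: Sender<(u32, Sender<u32>)>) {
        // Reply channel of every request currently tracked as in-flight.
        let mut in_flight: HashMap<u64, Sender<u32>> = HashMap::with_capacity(128);
        while let Ok((request_id, reply)) = requests.recv() {
            in_flight.insert(request_id, reply);
            // A real loop would call submit()/pull_tokens() here; fake a single final step.
            let step = Step { request_id, token_id: 42, is_final: true };
            if let Some(reply) = in_flight.get(&step.request_id) {
                let _ = decoded.send((step.token_id, reply.clone()));
            }
            if step.is_final {
                in_flight.remove(&step.request_id);
            }
        }
    }

    fn post_processor_loop(decoded: Receiver<(u32, Sender<u32>)>) {
        // This side would own the tokenizer, so decoding never stalls the executor loop.
        while let Ok((token_id, reply)) = decoded.recv() {
            let _ = reply.send(token_id);
        }
    }

    fn main() {
        let (req_tx, req_rx) = channel();
        let (dec_tx, dec_rx) = channel();
        let executor = thread::spawn(move || executor_loop(req_rx, dec_tx));
        let post_processor = thread::spawn(move || post_processor_loop(dec_rx));

        let (reply_tx, reply_rx) = channel();
        req_tx.send((1, reply_tx)).expect("executor loop is alive");
        println!("client saw token {}", reply_rx.recv().expect("one token is streamed back"));

        drop(req_tx); // closing the request channel winds both loops down in order
        executor.join().unwrap();
        post_processor.join().unwrap();
    }

The actual backend runs these loops on tokio's blocking thread pool and uses tokio unbounded channels, but the ownership story is the same: only the executor loop touches the cxx-backed backend, and only the post-processor touches the tokenizer.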
'executor: loop { @@ -60,7 +101,7 @@ fn executor_status_poller( let awaiting_requests = waiting_requests.len(); for _ in 0..awaiting_requests { // Retrieve all the requests - if let Some(ctx) = waiting_requests.blocking_recv() { + if let Some(mut ctx) = waiting_requests.blocking_recv() { // Submit all the request to the executor and move the context to the in-flight tracker let request = &ctx.request; let generation_params = &request.inner.parameters; @@ -79,13 +120,15 @@ fn executor_status_poller( ) { Ok(request_id) => { // Insert the context linked to the generated request id in the tracker + debug!("[in-flight] Added {}", request_id); + ctx.queued = Instant::now(); in_flights.insert(request_id, ctx); } Err(e) => { // Return to the caller let what = Err(InferError::SchedulingError(e.to_string())); - if let Err(e) = ctx.streamer.send(what) { - error!("Failed to send back through the channel: {}", e); + if let Err(ref e) = ctx.streamer.send(what) { + error!("Failed to send the client", error = e.as_ref()); } } }; @@ -93,83 +136,38 @@ fn executor_status_poller( } }); - if let Err(e) = span!(Level::DEBUG, "[in-flight][poll]").in_scope(|| { + if let Err(ref e) = info_span!("[in-flight][poll]").in_scope(|| { if backend.num_responses_ready() > 0 { - match backend.pin_mut().pull_tokens() { - Ok(responses) => { - debug!("Received {} tokens from the executor", responses.len()); - - // worse case scenario is one token for each response: with_capacity(responses.len()) - // grouper will group decoded tokens per request to decode multiple tokens - let mut grouper: HashMap = - HashMap::with_capacity(responses.len()); - - // Iterate through all the decoded token - for step in responses.deref() { - match in_flights.get(&step.request_id) { - Some(ctx) => { - debug!( - "{} -> (token={}, final={})", - step.request_id, step.token_id, step.is_final - ); - - // If no error, let's forward to post-processor - if !step.has_error { - let req_group = grouper.entry(step.request_id).or_insert( - DecodedTokenContext { - tokens: vec![], - ctx: ctx.streamer.clone(), // Arc::clone() = cheap - }, - ); - req_group.tokens.push(step.clone()); // Should be ultra cheap - } else { - warn!( - "Error for request: {} -> {}", - step.request_id, &step.error_msg - ); - - // TODO: Send something back to the postprocessor for the client? 
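One design point worth spelling out in the submit path above: the per-request streamer channel doubles as the error path, so a request the executor refuses still produces a terminating item for the client instead of a silently dropped stream. A minimal sketch of that pattern, with StreamItem and the error strings standing in for InferStreamResponse and InferError:

    use std::sync::mpsc::{channel, Sender};

    // Stand-ins for the real stream/response types in this sketch.
    #[derive(Debug)]
    enum StreamItem { Token(u32), Error(String) }

    // Try to hand a request to the scheduler; on failure, the same per-request
    // channel that would carry tokens carries the error, so the client's stream
    // ends with an explicit error item instead of going quiet.
    fn submit_or_report(schedule: impl FnOnce() -> Result<u64, String>, reply: &Sender<StreamItem>) -> Option<u64> {
        match schedule() {
            Ok(request_id) => Some(request_id),
            Err(why) => {
                // A failed send only means the client already hung up; nothing else to do.
                let _ = reply.send(StreamItem::Error(why));
                None
            }
        }
    }

    fn main() {
        let (tx, rx) = channel();
        // Failure path: the scheduling error travels to the client like any other item.
        let _ = submit_or_report(|| Err("executor queue full".into()), &tx);
        // Success path: the returned id is what gets tracked in the in-flight map.
        let tracked = submit_or_report(|| Ok(1), &tx);
        let _ = tx.send(StreamItem::Token(42));
        println!("tracked = {tracked:?}");
        while let Ok(item) = rx.try_recv() {
            println!("client received: {item:?}");
        }
    }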
- } - - // Remove from tracked requests - if step.is_final { - let _ = in_flights.remove(&step.request_id); - } - } - None => { - if step.has_error { - error!( - "Untracked request {} -> {}", - step.request_id, &step.error_msg - ); - continue; - } else { - error!( - "Got step for untracked request {}", - step.request_id - ); - } - } - } + let responses = backend + .pin_mut() + .pull_tokens() + .map_err(|e| Err(GenerationError(e.what())))?; + + // Iterate through all the decoded token + for step in responses.deref() { + if let Some(ctx) = in_flights.get(&step.request_id) { + let parcel = DecodedToken::try_from(step).map(|dt| DecodedTokenContext { + token: dt, + channel: ctx.streamer.clone(), + }); + + // Submit the work to the post_processor + let delivered = post_processor_sender.send(parcel); + + // Remove from tracked requests + if step.is_final { + debug!("Removing {}", step.request_id); + let _ = in_flights.remove(&step.request_id); } - grouper - .into_values() - .map(|ctx| post_processor_sender.send(ctx)) - .collect::>>()?; + delivered + } else { + warn!("Untracked request {}", step.request_id,); } - Err(err) => { - error!("Failed to retrieve tokens from the executor: {}", err); - } - } + }?; } - - Ok::<(), SendError>(()) }) { - error!( - "Caught an fatal error in the executor's loop, about to exit. {}", - e - ); + error!("Error in the executor's loop, exiting", error = e.as_ref()); break 'executor; } @@ -180,7 +178,7 @@ fn executor_status_poller( fn post_processor_looper( tokenizer: Tokenizer, - mut decoded_tokens: UnboundedReceiver, + mut decoded_tokens: UnboundedReceiver, ) { 'post_processor: loop { if decoded_tokens.is_closed() { @@ -188,56 +186,14 @@ fn post_processor_looper( break 'post_processor; } - if let Some(ctx) = decoded_tokens.blocking_recv() { - ctx.tokens.iter().for_each(|step| { - let out = match tokenizer.decode(&[step.token_id], true) { - Ok(text) => { - let is_special = tokenizer.get_added_vocabulary().is_special_token(&text); - let token = Token { - id: step.token_id, - text, - logprob: step.log_prob, - special: is_special, - }; - - let response = if !step.is_final { - InferStreamResponse::Intermediate { - token, - top_tokens: vec![], - } - } else { - InferStreamResponse::End { - token, - top_tokens: vec![], - generated_text: GeneratedText { - text: String::from(""), - generated_tokens: 0, - finish_reason: FinishReason::Length, - seed: None, - }, - start: Instant::now(), // Handle start time - queued: Instant::now(), // Handle queued time - } - }; + let mut states = HashMap::with_capacity(128); - Ok(response) - } - Err(e) => Err(GenerationError(e.to_string())), - }; - - if let Err(e) = ctx.ctx.send(out) { - warn!("Failed to send back the decoded tokens: {}", e); - }; - }); + if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() { + let state = states.entry(request_id).or_insert(vec![]); } } } -struct GenerationContext { - request: ValidGenerateRequestWithTokens, - streamer: UnboundedSender>, -} - pub struct TensorRtLlmBackendV2 { tokenizer: Tokenizer, executor_looper: JoinHandle<()>, @@ -277,7 +233,7 @@ impl TensorRtLlmBackendV2 { // Executor looper is responsible for scheduling and pulling requests state at regular interval let executor_looper = spawn_blocking(move || { - executor_status_poller(backend, executor_receiver, post_processor_sender) + executor_status_looper(backend, executor_receiver, post_processor_sender) }); // Post processor looper is responsible from receiving a bunch of tokens, decoding them and sending them back to the user @@ 
-295,22 +251,22 @@ impl TensorRtLlmBackendV2 { fn validate(request: &ValidGenerateRequest) -> InferResult<&String> { if request.top_n_tokens > 1 { - return Err(InferError::ValidationError(TopNTokensDisabled)); + return Err(ValidationError(TopNTokensDisabled)); } // TODO: Is it really needed? How can it be validated before? if request.parameters.grammar.is_some() { - return Err(InferError::ValidationError(Grammar)); + return Err(ValidationError(Grammar)); } match request.inputs.len() { - 0 => Err(InferError::ValidationError(EmptyInput)), - 2.. => Err(InferError::GenerationError( + 0 => Err(ValidationError(EmptyInput)), + 2.. => Err(GenerationError( "TensorRT-LLM backend don't support multi-chunk".into(), )), 1 => match request.inputs.first().expect("Single item-chunk") { Chunk::Text(text) => Ok(text), - Chunk::Image(_) => Err(InferError::ValidationError(UnsupportedModality("image"))), + Chunk::Image(_) => Err(ValidationError(UnsupportedModality("image"))), }, } } @@ -336,7 +292,13 @@ impl Backend for TensorRtLlmBackendV2 { let (streamer, receiver) = unbounded_channel::>(); // Send the context to the executor for scheduling - match self.executor.send(GenerationContext { request, streamer }) { + let start = Instant::now(); + match self.executor.send(GenerationContext { + request, + start, + queued: None, + streamer, + }) { Ok(_) => Ok(UnboundedReceiverStream::new(receiver)), Err(_) => Err(GenerationError( "Failed to submit request to the backend".into(), From 0f50539b77ee8ea4ea896235683d0c4b752e7ff8 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 11 Aug 2024 14:10:51 +0200 Subject: [PATCH 27/62] (Dockerfile.trtllm) delete for now --- Dockerfile.trtllm | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 Dockerfile.trtllm diff --git a/Dockerfile.trtllm b/Dockerfile.trtllm deleted file mode 100644 index 4543ae804ee..00000000000 --- a/Dockerfile.trtllm +++ /dev/null @@ -1,23 +0,0 @@ -# All the tooling for CUDA -FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS cuda-builder - -WORKDIR /usr/src/tgi/backends/trtllm -RUN apt update && apt install -y cmake git git-lfs gcc g++ ninja-build libopenmpi-dev python3-dev python3-pip wget - -COPY . /usr/src/tgi -RUN chmod +x scripts/install_tensorrt.sh && scripts/install_tensorrt.sh -RUN cmake -G Ninja -B build -DTRT_LIB_DIR=/usr/local/tensorrt/lib -DTRT_INCLUDE_DIR=/usr/local/tensorrt/include . 
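The reworked health() above simply ANDs the caller-supplied health bit with the liveness of both looper handles, so a looper thread that returns or panics is reported as unhealthy on the next probe. A small stand-alone illustration of the idea with std threads (the sleeps and the health signature are invented for the example; the backend uses tokio's JoinHandle, which exposes the same is_finished check):

    use std::thread;
    use std::time::Duration;

    // Shape of the check: healthy only while every background loop is still running.
    fn health(current_health: bool, loopers: &[&thread::JoinHandle<()>]) -> bool {
        current_health & loopers.iter().all(|handle| !handle.is_finished())
    }

    fn main() {
        let executor = thread::spawn(|| thread::sleep(Duration::from_millis(200)));
        let post_processor = thread::spawn(|| { /* exits immediately, e.g. after an error */ });

        thread::sleep(Duration::from_millis(50));
        // Typically prints false: the post-processor thread has already returned.
        println!("healthy = {}", health(true, &[&executor, &post_processor]));

        executor.join().unwrap();
        post_processor.join().unwrap();
    }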
-RUN cmake --build build --parallel -t tgi_trtllm_backend_impl - -# All the tooling for Rust -FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef -WORKDIR /usr/src - -# Include CUDA related libraries and tools to the Rust based image -COPY --from=cuda-builder /usr/local/cuda /usr/local/cuda -COPY --from=cuda-builder /usr/local/tensorrt /usr/local/tensorrt -COPY --from=cuda-builder /usr/src/tgi/backends/trtllm/build /usr/local/tgi/trtllm/build -ENV PATH=/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:$LD_LIBRARY_PATH - -RUN apt update && apt install -y cmake git gcc g++ ninja-build libopenmpi3 From b41875c139c1cec598e25740ca0693f3d6385e36 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 26 Aug 2024 08:24:38 +0000 Subject: [PATCH 28/62] (misc) simplify [make_]move_iterator by using c++20 type inference --- backends/trtllm/src/ffi.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index 35b0a48f8fa..872ecb5c9d5 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -28,7 +28,7 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit( float_t repetition_penalty, float_t frequency_penalty, uint64_t seed) { // This will copy all the items from the initial slice - std::vector tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end())); + std::vector tokens_(std::move_iterator(tokens.begin()), std::move_iterator(tokens.end())); return TensorRtLlmBackend::Submit( std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed); } From 42ccf4e77cbbb29a4824c6b1409137712fc08cc8 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 26 Aug 2024 13:38:49 +0000 Subject: [PATCH 29/62] (misc) no need to move for uint32_t items --- backends/trtllm/src/ffi.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index 872ecb5c9d5..54c17bc4149 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -28,7 +28,7 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit( float_t repetition_penalty, float_t frequency_penalty, uint64_t seed) { // This will copy all the items from the initial slice - std::vector tokens_(std::move_iterator(tokens.begin()), std::move_iterator(tokens.end())); + std::vector tokens_(tokens.begin(), tokens.end()); return TensorRtLlmBackend::Submit( std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed); } From fa63db0d078766161a8b82394959c779a89deb9c Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 26 Aug 2024 13:39:20 +0000 Subject: [PATCH 30/62] (scheduler) rework submit/pull logic --- backends/trtllm/src/lib.rs | 10 +- backends/trtllm/src/looper.rs | 183 ++++++++++++++++------------------ 2 files changed, 90 insertions(+), 103 deletions(-) diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index ca4ca024dc5..6d3297d662c 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -4,17 +4,14 @@ pub mod errors; mod looper; mod utils; -pub(crate) type RequestId = u64; -pub(crate) type TokenId = u32; - #[cxx::bridge(namespace = "huggingface::tgi::backends")] mod ffi { /// Struct used as shared type between rust and C++ to represent the result /// of a single decoding iteration #[derive(Debug, Clone)] pub struct GenerationStep { - request_id: RequestId, - token_id: TokenId, + 
request_id: u64, + token_id: u32, log_prob: f32, is_final: bool, has_error: bool, @@ -53,7 +50,7 @@ mod ffi { #[rust_name = "submit"] fn Submit( self: Pin<&mut TensorRtLlmBackendImpl>, - tokens: &[TokenId], + tokens: &[u32], max_new_tokens: u32, top_k: i32, top_p: f32, @@ -68,4 +65,5 @@ mod ffi { self: Pin<&mut TensorRtLlmBackendImpl>, ) -> Result>>; } + } diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 99d75b81c75..4247f338d81 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -3,46 +3,34 @@ use std::ops::Deref; use std::path::Path; use async_trait::async_trait; -use cxx::UniquePtr; -use hashbrown::{HashMap, HashSet}; +use cxx::{UniquePtr}; +use hashbrown::{HashMap}; use log::warn; use tokenizers::{Encoding, Tokenizer}; -use tokio::sync::mpsc::error::SendError; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; use tokio::task::{spawn_blocking, JoinHandle}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{debug, debug_span, error, info, info_span, span, Level}; +use tracing::{debug, error}; use text_generation_router::infer::InferError::{GenerationError, ValidationError}; -use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; +use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; use text_generation_router::validation::ValidationError::{ EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality, }; use text_generation_router::validation::{Chunk, ValidGenerateRequest}; -use text_generation_router::{FinishReason, Token}; use crate::errors::TensorRtLlmBackendError; use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; use crate::utils::first_line; -use crate::RequestId; type InferResult = Result; struct IdentifiableRequest { - request_id: RequestId, + request_id: u64, inner: T, } -macro_rules! identifiable { - ($id: expr, $inner: expr) => { - IdentifiableRequest { - id: $id, - inner: $inner, - } - }; -} - /// Wrap the TGI server forwarded ValidGenerateRequest with the tokenized view of the prompt struct ValidGenerateRequestWithTokens { encoding: Encoding, @@ -52,8 +40,8 @@ struct ValidGenerateRequestWithTokens { /// Wrap the requests along with the channel used to stream back to the client the decoded tokens struct GenerationContext { request: ValidGenerateRequestWithTokens, - start: Instant, - queued: Option, + start: Option, + queued: Instant, streamer: UnboundedSender>, } @@ -64,10 +52,10 @@ struct DecodedToken { is_final: bool, } -impl TryFrom for DecodedToken { +impl<'step> TryFrom<&'step GenerationStep> for DecodedToken { type Error = InferError; - fn try_from(step: GenerationStep) -> Result { + fn try_from(step: &'step GenerationStep) -> Result { if !step.has_error { Ok(Self { id: step.token_id, @@ -75,7 +63,7 @@ impl TryFrom for DecodedToken { is_final: step.is_final, }) } else { - Err(GenerationError(step.error_msg)) + Err(GenerationError(step.error_msg.clone())) } } } @@ -89,86 +77,84 @@ struct DecodedTokenContext { fn executor_status_looper( mut backend: UniquePtr, mut waiting_requests: UnboundedReceiver, - mut post_processor_sender: UnboundedSender, + post_processor_sender: UnboundedSender<(u64, InferResult)>, ) { // Track the tuple (request_id, stream) for each request - let mut in_flights = HashMap::::with_capacity(128); + let mut in_flights = HashMap::::with_capacity(128); // TODO: Does it need a spin-loop? 
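The error handling in this rework hinges on the small TryFrom conversion above: an FFI step either becomes a DecodedToken or an error that is forwarded untouched to the post-processor. A simplified mirror of that conversion, with a plain String in place of InferError::GenerationError:

    // Simplified mirror of the step -> token conversion; String replaces InferError here.
    struct GenerationStep { token_id: u32, log_prob: f32, is_final: bool, has_error: bool, error_msg: String }

    #[derive(Debug)]
    struct DecodedToken { id: u32, log_prob: f32, is_final: bool }

    impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
        type Error = String;

        fn try_from(step: &'step GenerationStep) -> Result<Self, Self::Error> {
            if step.has_error {
                // The error string travels to the post-processor as a Result::Err.
                Err(step.error_msg.clone())
            } else {
                Ok(DecodedToken { id: step.token_id, log_prob: step.log_prob, is_final: step.is_final })
            }
        }
    }

    fn main() {
        let ok = GenerationStep { token_id: 7, log_prob: -0.1, is_final: false, has_error: false, error_msg: String::new() };
        let bad = GenerationStep { token_id: 0, log_prob: 0.0, is_final: true, has_error: true, error_msg: "executor error".into() };
        // Borrowing the step avoids copying anything but the error message.
        println!("{:?}", DecodedToken::try_from(&ok));
        println!("{:?}", DecodedToken::try_from(&bad));
    }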
- 'executor: loop { - span!(Level::DEBUG, "[in-flight][submit]").in_scope(|| { - // Is there any request pending to be scheduled? - let awaiting_requests = waiting_requests.len(); - for _ in 0..awaiting_requests { - // Retrieve all the requests - if let Some(mut ctx) = waiting_requests.blocking_recv() { - // Submit all the request to the executor and move the context to the in-flight tracker - let request = &ctx.request; - let generation_params = &request.inner.parameters; - let stopping_params = &request.inner.stopping_parameters; - - // Submit to the TensorRT-LLM executor for scheduling - match backend.pin_mut().submit( - request.encoding.get_ids(), - stopping_params.max_new_tokens, - generation_params.top_k as i32, - generation_params.top_p, - generation_params.temperature, - generation_params.repetition_penalty, - generation_params.frequency_penalty, - generation_params.seed, - ) { - Ok(request_id) => { - // Insert the context linked to the generated request id in the tracker - debug!("[in-flight] Added {}", request_id); - ctx.queued = Instant::now(); - in_flights.insert(request_id, ctx); + 'scheduler: loop { + // Is there any request pending to be scheduled? + let awaiting_requests = waiting_requests.len(); + for _ in 0..awaiting_requests { + // Retrieve all the requests + if let Some(mut ctx) = waiting_requests.blocking_recv() { + // Submit all the request to the executor and move the context to the in-flight tracker + let request = &ctx.request; + let generation_params = &request.inner.parameters; + let stopping_params = &request.inner.stopping_parameters; + + // Submit to the TensorRT-LLM executor for scheduling + match backend.pin_mut().submit( + request.encoding.get_ids(), + stopping_params.max_new_tokens, + generation_params.top_k as i32, + generation_params.top_p, + generation_params.temperature, + generation_params.repetition_penalty, + generation_params.frequency_penalty, + generation_params.seed, + ) { + Ok(request_id) => { + // Insert the context linked to the generated request id in the tracker + debug!("[in-flight] Added {}", request_id); + ctx.start = Some(Instant::now()); + in_flights.insert(request_id, ctx); + } + Err(e) => { + // Return to the caller + let what = e.to_string(); + error!(error = what.as_str(), "Failed to schedule request"); + + let err = Err(InferError::SchedulingError(what)); + if let Err(_) = ctx.streamer.send(err) { + error!("Failed to send back error to the client"); } - Err(e) => { - // Return to the caller - let what = Err(InferError::SchedulingError(e.to_string())); - if let Err(ref e) = ctx.streamer.send(what) { - error!("Failed to send the client", error = e.as_ref()); + } + }; + } + } + + if backend.num_responses_ready() > 0 { + match backend.pin_mut().pull_tokens() { + Ok(responses) => { + // Iterate through all the decoded token + for step in responses.deref() { + if let Some(ctx) = in_flights.get(&step.request_id) { + + // Remove from tracked requests + let parcel = DecodedToken::try_from(step).map(|dt| DecodedTokenContext { + token: dt, + channel: ctx.streamer.clone(), + }); + + // Submit the work to p:the post_processor + let posted = post_processor_sender.send((step.request_id, parcel)); + + if posted.is_err() || step.is_final { + debug!("Removing {}", step.request_id); + let _ = in_flights.remove(&step.request_id); } + } else { + warn!("Untracked request {}", step.request_id,); } }; } + Err(ref err) => { + error!("Failed to get responses from the executor: {}.", err.what()); + break 'scheduler; + } } - }); - - if let Err(ref e) = 
info_span!("[in-flight][poll]").in_scope(|| { - if backend.num_responses_ready() > 0 { - let responses = backend - .pin_mut() - .pull_tokens() - .map_err(|e| Err(GenerationError(e.what())))?; - - // Iterate through all the decoded token - for step in responses.deref() { - if let Some(ctx) = in_flights.get(&step.request_id) { - let parcel = DecodedToken::try_from(step).map(|dt| DecodedTokenContext { - token: dt, - channel: ctx.streamer.clone(), - }); - - // Submit the work to the post_processor - let delivered = post_processor_sender.send(parcel); - - // Remove from tracked requests - if step.is_final { - debug!("Removing {}", step.request_id); - let _ = in_flights.remove(&step.request_id); - } - - delivered - } else { - warn!("Untracked request {}", step.request_id,); - } - }?; - } - }) { - error!("Error in the executor's loop, exiting", error = e.as_ref()); - break 'executor; } // Hint the CPU we are spin-locking @@ -178,7 +164,7 @@ fn executor_status_looper( fn post_processor_looper( tokenizer: Tokenizer, - mut decoded_tokens: UnboundedReceiver, + mut decoded_tokens: UnboundedReceiver<(u64, InferResult)>, ) { 'post_processor: loop { if decoded_tokens.is_closed() { @@ -186,7 +172,7 @@ fn post_processor_looper( break 'post_processor; } - let mut states = HashMap::with_capacity(128); + let mut states: HashMap> = HashMap::with_capacity(128); if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() { let state = states.entry(request_id).or_insert(vec![]); @@ -194,6 +180,9 @@ fn post_processor_looper( } } + +unsafe impl Send for crate::ffi::TensorRtLlmBackendImpl {} + pub struct TensorRtLlmBackendV2 { tokenizer: Tokenizer, executor_looper: JoinHandle<()>, @@ -292,11 +281,11 @@ impl Backend for TensorRtLlmBackendV2 { let (streamer, receiver) = unbounded_channel::>(); // Send the context to the executor for scheduling - let start = Instant::now(); + let queued = Instant::now(); match self.executor.send(GenerationContext { request, - start, - queued: None, + start: None, + queued, streamer, }) { Ok(_) => Ok(UnboundedReceiverStream::new(receiver)), From 984ae9798f8cb19863282c0145501d329146d5d1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 26 Aug 2024 14:28:44 +0000 Subject: [PATCH 31/62] (post) impl postprocessing --- backends/trtllm/src/looper.rs | 79 +++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 13 deletions(-) diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 4247f338d81..ba10d9ee90e 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -3,22 +3,23 @@ use std::ops::Deref; use std::path::Path; use async_trait::async_trait; -use cxx::{UniquePtr}; -use hashbrown::{HashMap}; +use cxx::UniquePtr; +use hashbrown::HashMap; use log::warn; use tokenizers::{Encoding, Tokenizer}; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; -use tokio::task::{spawn_blocking, JoinHandle}; +use tokio::task::{JoinHandle, spawn_blocking}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error}; +use text_generation_router::{FinishReason, Token}; +use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::infer::InferError::{GenerationError, ValidationError}; -use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; +use text_generation_router::validation::{Chunk, ValidGenerateRequest}; use text_generation_router::validation::ValidationError::{ 
EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality, }; -use text_generation_router::validation::{Chunk, ValidGenerateRequest}; use crate::errors::TensorRtLlmBackendError; use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; @@ -71,6 +72,8 @@ impl<'step> TryFrom<&'step GenerationStep> for DecodedToken { /// Wraps the decoded token with the channel used to stream back to the client the decoded tokens struct DecodedTokenContext { token: DecodedToken, + start: Option, + queued: Instant, channel: UnboundedSender>, } @@ -131,12 +134,14 @@ fn executor_status_looper( // Iterate through all the decoded token for step in responses.deref() { if let Some(ctx) = in_flights.get(&step.request_id) { - // Remove from tracked requests - let parcel = DecodedToken::try_from(step).map(|dt| DecodedTokenContext { - token: dt, - channel: ctx.streamer.clone(), - }); + let parcel = + DecodedToken::try_from(step).map(|dt| DecodedTokenContext { + token: dt, + start: ctx.start, + queued: ctx.queued, + channel: ctx.streamer.clone(), + }); // Submit the work to p:the post_processor let posted = post_processor_sender.send((step.request_id, parcel)); @@ -148,7 +153,7 @@ fn executor_status_looper( } else { warn!("Untracked request {}", step.request_id,); } - }; + } } Err(ref err) => { error!("Failed to get responses from the executor: {}.", err.what()); @@ -176,12 +181,60 @@ fn post_processor_looper( if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() { let state = states.entry(request_id).or_insert(vec![]); + + match decoded { + Ok(ctx) => { + state.push(ctx.token.id); + let out = match tokenizer.decode(&[ctx.token.id], false) { + Ok(text) => { + let is_special = + tokenizer.get_added_vocabulary().is_special_token(&text); + let token = Token { + id: ctx.token.id, + text, + logprob: ctx.token.log_prob, + special: is_special, + }; + + let out = if !ctx.token.is_final { + InferStreamResponse::Intermediate { + token, + top_tokens: vec![], + } + } else { + let text = tokenizer.decode(&state, true); + let generated_text = GeneratedText { + text: text.unwrap(), + generated_tokens: state.len() as u32, + finish_reason: FinishReason::EndOfSequenceToken, + seed: None, + }; + + InferStreamResponse::End { + token, + top_tokens: vec![], + generated_text, + start: ctx.start.unwrap(), + queued: ctx.queued, + } + }; + + Ok(out) + } + Err(err) => Err(GenerationError(err.to_string())), + }; + + if let Err(_) = ctx.channel.send(out) { + warn!("Failed to send decoded token back to the user") + } + } + Err(err) => {} + } } } } - -unsafe impl Send for crate::ffi::TensorRtLlmBackendImpl {} +unsafe impl Send for TensorRtLlmBackendImpl {} pub struct TensorRtLlmBackendV2 { tokenizer: Tokenizer, From b242f45c04d037ba86b8fa2e2b1df164716c2a4a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 3 Sep 2024 21:19:41 +0000 Subject: [PATCH 32/62] (misc) delete backend.rs --- backends/trtllm/src/backend.rs | 330 --------------------------------- 1 file changed, 330 deletions(-) delete mode 100644 backends/trtllm/src/backend.rs diff --git a/backends/trtllm/src/backend.rs b/backends/trtllm/src/backend.rs deleted file mode 100644 index b23aa6c01fe..00000000000 --- a/backends/trtllm/src/backend.rs +++ /dev/null @@ -1,330 +0,0 @@ -use std::future::Future; -use std::path::Path; -use std::pin::{pin, Pin}; -use std::str::FromStr; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, OnceLock}; -use std::task::{Context, Poll}; -use std::time::Duration; - -use async_trait::async_trait; 
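The post-processing loop added in the previous patch follows one recurring shape: decode the newest token id for streaming, accumulate the ids per request, and only when the final step arrives re-decode the whole sequence for the End payload. A compact sketch of that flow, with a fake decode function standing in for tokenizers::Tokenizer (names and the placeholder decoding are illustrative only):

    use std::collections::HashMap;

    // Placeholder decode; the real code calls tokenizers::Tokenizer::decode, once per new
    // token for streaming and once over the full id sequence when the request finishes.
    fn decode(ids: &[u32]) -> String {
        ids.iter().map(|id| format!("<{id}>")).collect()
    }

    enum StreamResponse {
        Intermediate { text: String },
        End { text: String, generated_text: String, generated_tokens: u32 },
    }

    fn on_token(states: &mut HashMap<u64, Vec<u32>>, request_id: u64, id: u32, is_final: bool) -> StreamResponse {
        let state = states.entry(request_id).or_insert_with(Vec::new);
        state.push(id);
        let text = decode(&[id]);
        if is_final {
            // Finished: drop the per-request state and decode the whole sequence once.
            let ids = states.remove(&request_id).expect("state was just inserted");
            StreamResponse::End { text, generated_text: decode(&ids), generated_tokens: ids.len() as u32 }
        } else {
            StreamResponse::Intermediate { text }
        }
    }

    fn main() {
        let mut states = HashMap::new();
        for (id, is_final) in [(1, false), (2, false), (3, true)] {
            match on_token(&mut states, 42, id, is_final) {
                StreamResponse::Intermediate { text } => println!("stream: {text}"),
                StreamResponse::End { generated_text, generated_tokens, .. } =>
                    println!("done: {generated_text} ({generated_tokens} tokens)"),
            }
        }
        assert!(states.is_empty());
    }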
-use cxx::UniquePtr; -use log::{error, warn}; -use tokenizers::Tokenizer; -use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; -use tokio::time::{sleep, Instant}; -use tokio_stream::wrappers::UnboundedReceiverStream; -use tokio_stream::{Stream, StreamExt}; -use tracing::{instrument, span, Level}; - -// use tokio::sync::RwLock; -use parking_lot::RwLock; -use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; -use text_generation_router::validation::ValidationError::UnsupportedModality; -use text_generation_router::validation::{Chunk, ValidGenerateRequest, ValidationError}; -use text_generation_router::{FinishReason, Token}; - -use crate::errors::TensorRtLlmBackendError; -use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; - -// Value used to poll the state of the generation stream -static POLLING_INTERVAL_US: OnceLock = OnceLock::new(); - -type InferResult = Result; - -pub(crate) struct Generation { - executor: Arc>>, - done: Arc, -} - -/// Holds the user provided input to be executed along with a channel allowing -/// to bubble up all the generated tokens for that tokens the to end stream. -pub struct GenerationContext { - sender: UnboundedSender>, - tokenizer: Arc, - tokens: Vec, - done: Arc, - queued: Instant, - start: Option, -} - -impl Stream for Generation { - type Item = usize; - - fn poll_next(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { - let interval = POLLING_INTERVAL_US.get_or_init(|| { - u64::from_str(option_env!("TRTLLM_BACKEND_POLLING_INTERVAL_US").unwrap_or("100")) - .expect("Invalid value provided for envvar POLLING_INTERVAL_US") - }); - - if !self.done.load(Ordering::Relaxed) { - let backend = pin!(self.executor.read()); - let status = match backend.poll(ctx) { - Poll::Ready(executor_r) => { - let ready = executor_r.num_responses_ready(); - if ready == 0 { - Poll::Pending - } else { - Poll::Ready(Some(ready)) - } - } - Poll::Pending => Poll::Pending, - }; - - let waker = ctx.waker().clone(); - tokio::spawn(async { - sleep(Duration::from_micros(*interval)).await; - waker.wake(); - }); - - status - } else { - Poll::Ready(None) // end of stream - } - } - - fn size_hint(&self) -> (usize, Option) { - (1, None) - } -} - -unsafe impl Send for TensorRtLlmBackendImpl {} -unsafe impl Sync for TensorRtLlmBackendImpl {} - -/// Implements the logic to execute generation with TensorRT-LLM executor API in background -pub struct TensorRtLlmBackend { - tokenizer: Arc, - - // Backing the backend behind a RwLock to allow concurrent read access to retrieve - // the number of available tokens (read only) in the Generation stream - backend: Arc>>, -} - -impl TensorRtLlmBackend { - pub fn new + Send + 'static, PP: AsRef + Send + 'static>( - tokenizer: Tokenizer, - engine_folder: P, - executor_worker_path: PP, - ) -> Result { - Ok(TensorRtLlmBackend { - tokenizer: Arc::new(tokenizer), - backend: Arc::new(RwLock::new(create_tensorrt_llm_backend( - engine_folder.as_ref().to_str().unwrap(), - executor_worker_path.as_ref().to_str().unwrap(), - ))), - }) - } - - fn validate(request: &ValidGenerateRequest) -> InferResult<&String> { - if request.top_n_tokens > 1 { - return Err(InferError::ValidationError( - ValidationError::TopNTokensDisabled, - )); - } - - // TODO: Is it really needed? How can it be validated before? 
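For contrast with the new blocking loopers, the backend.rs being deleted here drove generation through a hand-rolled Stream that returned Poll::Pending and re-armed its waker after a fixed delay. A stripped-down illustration of that wake-after-sleep mechanism (it assumes the tokio crate with its timer and macro features enabled, and it is not the removed code itself):

    use std::future::Future;
    use std::pin::Pin;
    use std::task::{Context, Poll};
    use std::time::Duration;

    // Minimal stand-in for the deleted Generation stream: stay Pending and ask to be
    // polled again after a fixed interval, a bounded number of times.
    struct PollEvery { remaining: u32, interval: Duration }

    impl Future for PollEvery {
        type Output = ();

        fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
            if self.remaining == 0 {
                return Poll::Ready(());
            }
            self.remaining -= 1;
            // Nothing ready yet: clone the waker and re-arm it after `interval`, which is
            // what the removed code did between checks of num_responses_ready().
            let waker = cx.waker().clone();
            let interval = self.interval;
            tokio::spawn(async move {
                tokio::time::sleep(interval).await;
                waker.wake();
            });
            Poll::Pending
        }
    }

    #[tokio::main]
    async fn main() {
        PollEvery { remaining: 3, interval: Duration::from_micros(100) }.await;
        println!("polled to completion");
    }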
- if request.parameters.grammar.is_some() { - return Err(InferError::ValidationError(ValidationError::Grammar)); - } - - match request.inputs.len() { - 0 => Err(InferError::ValidationError(ValidationError::EmptyInput)), - 2.. => Err(InferError::GenerationError( - "TensorRT-LLM backend don't support multi-chunk".into(), - )), - 1 => match request.inputs.first().expect("Single item-chunk") { - Chunk::Text(text) => Ok(text), - Chunk::Image(_) => Err(InferError::ValidationError(UnsupportedModality("image"))), - }, - } - } - - fn generate( - &self, - sender: UnboundedSender>, - tokens: Vec, - top_k: u32, - top_p: f32, - temperature: f32, - repetition_penalty: f32, - frequency_penalty: f32, - seed: u64, - ) { - let tokenizer = Arc::clone(&self.tokenizer); - let executor = Arc::clone(&self.backend); - - // Let's push this in async context - tokio::spawn(async move { - // Define the generation state - let mut generation = Generation { - executor: executor.clone(), - done: Arc::new(AtomicBool::new(false)), - }; - - // Define the context over the generation - // TODO(asap): Do we really need so many shared-ownership? - let ctx = Box::new(GenerationContext { - sender: sender.clone(), - tokenizer, - tokens: vec![], - done: Arc::clone(&generation.done), - start: None, - queued: Instant::now(), - }); - - // We are leaking the context on-purpose to avoid the box being dropped while there are - // still computation ongoing - // TODO(asap): Can we achieve the same with an Arc> without the need to go unsafe? - let ctx_ = Box::leak(ctx); - - // Submit the request to the batcher - let request_id = span!(Level::DEBUG, "submit") - .in_scope(|| async { - let mut handle = executor.write().await; - let request_id = handle.pin_mut().submit( - &tokens, - top_k as i32, - top_p, - temperature, - repetition_penalty, - frequency_penalty, - seed, - ); - - request_id - }) - .await; - - while let Some(_) = generation.next().await { - let mut executor_w = executor.write().await; - let executor = executor_w.pin_mut(); - - span!(Level::DEBUG, "decode") - .in_scope(|| async { - unsafe { - executor.stream_tokens( - request_id, - ctx_, - |ctx: *mut GenerationContext, step: GenerationStep| { - let inner_ctx = &mut *ctx; - - // Update the timestamp at which the request started effectively - // Can be a bit off, would need to be before the callback, let's see - inner_ctx.start.get_or_insert(Instant::now()); - inner_ctx.done.store(step.is_final, Ordering::Relaxed); - - // Ensure we are not running into errors - let parcel = if !step.has_error { - // Insert the latest generated token to the tracker - inner_ctx.tokens.push(step.token_id); - - // Decode the token - let text = inner_ctx - .tokenizer - .decode(&[step.token_id], true) - .expect("Failed to decode token"); - - let special = inner_ctx - .tokenizer - .get_added_vocabulary() - .is_special_token(&text); - - // Create the structure holding the token - let token = Token { - id: step.token_id, - text, - logprob: step.log_prob, - special, - }; - - if step.is_final { - let generated_text = inner_ctx - .tokenizer - .decode(&inner_ctx.tokens, true) - .expect("Failed to decode generated_tokens"); - - Ok(InferStreamResponse::End { - token, - top_tokens: vec![], - generated_text: GeneratedText { - text: generated_text, - generated_tokens: inner_ctx.tokens.len() as u32, - finish_reason: FinishReason::EndOfSequenceToken, - seed: None, - }, - start: inner_ctx.start.unwrap_or(Instant::now()), - queued: inner_ctx.queued, - }) - } else { - Ok(InferStreamResponse::Intermediate { - token, - 
top_tokens: vec![], - }) - } - } else { - error!("Error caught while decoding: {}", &step.error_msg); - Err(InferError::GenerationError(step.error_msg)) - }; - - // Send the parcel to the client - inner_ctx - .sender - .send(parcel) - .expect("Failed to sent msg through the channel"); - }, - ); - } - }) - .await; - } - - // "Properly" free the shared context... - // TODO: clean that piece of sh** asap - unsafe { - let _ = Box::from_raw(ctx_); - } - }); - } -} - -#[async_trait] -impl Backend for TensorRtLlmBackend { - #[instrument(skip_all)] - fn schedule( - &self, - request: ValidGenerateRequest, - ) -> InferResult>> { - // Let's add a few more validation - let input = TensorRtLlmBackend::validate(&request)?; - - // Channel to stream the generated token as they come from the worker thread back to the transport layer - let (sender, receiver) = unbounded_channel(); - - // Unpack parameters - let params = &request.parameters; - - // Preprocess the inputs to send to TRTLLM backend - let encoding = self - .tokenizer - .encode(input.as_str(), true) - .map_err(|e| InferError::GenerationError(e.to_string()))?; - - // Generate the response - self.generate( - sender, - Vec::from(encoding.get_ids()), - params.top_k, - params.top_p, - params.temperature, - params.repetition_penalty, - params.frequency_penalty, - params.seed, - ); - - Ok(UnboundedReceiverStream::new(receiver)) - } - - async fn health(&self, _current_health: bool) -> bool { - true - } -} From 507ff666925957b4c3d7ea079eb258b3659f41be Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 25 Sep 2024 10:01:21 +0000 Subject: [PATCH 33/62] (misc) rerun-if-changed all the cmake modules --- backends/trtllm/build.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index 08638262438..af91627004b 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -96,6 +96,10 @@ fn build_ffi_layer(deps_folder: &PathBuf) { .compile("tgi_trtllm_backend"); println!("cargo:rerun-if-changed=CMakeLists.txt"); + println!("cargo:rerun-if-changed=cmake/trtllm.cmake"); + println!("cargo:rerun-if-changed=cmake/json.cmake"); + println!("cargo:rerun-if-changed=cmake/fmt.cmake"); + println!("cargo:rerun-if-changed=cmake/spdlog.cmake"); println!("cargo:rerun-if-changed=include/backend.h"); println!("cargo:rerun-if-changed=lib/backend.cpp"); println!("cargo:rerun-if-changed=include/ffi.h"); From 213acc6e34e1877fde156feed6d619fcba3cf1d7 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 25 Sep 2024 10:08:45 +0000 Subject: [PATCH 34/62] (misc) move to latest trtllm --- backends/trtllm/cmake/trtllm.cmake | 2 +- backends/trtllm/src/looper.rs | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index e59ad4cf3a6..e41a436c178 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -23,7 +23,7 @@ endif () fetchcontent_declare( trtllm GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git - GIT_TAG a681853d3803ee5893307e812530b5e7004bb6e1 + GIT_TAG 32ed92e4491baf2d54682a21d247e1948cca996e GIT_SHALLOW FALSE ) fetchcontent_makeavailable(trtllm) diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index ba10d9ee90e..0428a4dc75e 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -8,18 +8,18 @@ use hashbrown::HashMap; use log::warn; use tokenizers::{Encoding, Tokenizer}; use tokio::sync::mpsc::{unbounded_channel, 
UnboundedReceiver, UnboundedSender}; -use tokio::task::{JoinHandle, spawn_blocking}; +use tokio::task::{spawn_blocking, JoinHandle}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{debug, error}; +use tracing::{debug, error, info}; -use text_generation_router::{FinishReason, Token}; -use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::infer::InferError::{GenerationError, ValidationError}; -use text_generation_router::validation::{Chunk, ValidGenerateRequest}; +use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::validation::ValidationError::{ EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality, }; +use text_generation_router::validation::{Chunk, ValidGenerateRequest}; +use text_generation_router::{FinishReason, Token}; use crate::errors::TensorRtLlmBackendError; use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; @@ -128,6 +128,7 @@ fn executor_status_looper( } } + // info!("Num response ready: {}", backend.num_responses_ready()); if backend.num_responses_ready() > 0 { match backend.pin_mut().pull_tokens() { Ok(responses) => { From 544c9d9dbae034aa71533ed54218241545f9c304 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 10 Oct 2024 12:36:32 +0000 Subject: [PATCH 35/62] (fix): HOPPER_SM_MAJOR is 9 not 8 --- backends/trtllm/include/hardware.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/trtllm/include/hardware.h b/backends/trtllm/include/hardware.h index da0bf4f3c0d..584dd974e1a 100644 --- a/backends/trtllm/include/hardware.h +++ b/backends/trtllm/include/hardware.h @@ -14,7 +14,7 @@ namespace huggingface::hardware::cuda { #define AMPERE_SM_MAJOR 8 -#define HOPPER_SM_MAJOR 8 +#define HOPPER_SM_MAJOR 9 /** * Store information about the version of the CUDA Compute Capabilities detected on the device From 188e4dc64f653ebe6983ed98785f2bcdf59d1c00 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 10 Oct 2024 12:40:32 +0000 Subject: [PATCH 36/62] (misc: build for sm_{75,80,86,89,90} by default --- backends/trtllm/build.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index af91627004b..e21d49ec4fc 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -36,7 +36,7 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf // Build the backend implementation through CMake let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi"); let tensorrt_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt"); - let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("90-real"); // Hopper by default + let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("75-real;80-real;86-real;89-real;90-real"); let mut install_path = PathBuf::from(install_path); if !install_path.is_absolute() { @@ -81,7 +81,12 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf (PathBuf::from(install_path), deps_folder) } -fn build_ffi_layer(deps_folder: &PathBuf) { +fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { + let ndebug = match is_debug { + true => "1", + false => "0" + }; + CFG.include_prefix = "backends/trtllm"; cxx_build::bridge("src/lib.rs") .static_flag(true) @@ -93,6 +98,7 @@ fn build_ffi_layer(deps_folder: &PathBuf) { .include("/usr/local/tensorrt/include") .file("src/ffi.cpp") .std("c++20") + 
.define("NDEBUG", ndebug) .compile("tgi_trtllm_backend"); println!("cargo:rerun-if-changed=CMakeLists.txt"); @@ -119,7 +125,7 @@ fn main() { let (_backend_path, deps_folder) = build_backend(is_debug, opt_level, &out_dir); // Build the FFI layer calling the backend above - build_ffi_layer(&deps_folder); + build_ffi_layer(&deps_folder, is_debug); // Emit linkage search path probe!("ompi", MPI_REQUIRED_VERSION); From ce0cd1fce8e1b142f114592cae6d1d93e41367ce Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 10 Oct 2024 12:40:49 +0000 Subject: [PATCH 37/62] (misc): build with trtllm 0.13.0 --- backends/trtllm/cmake/trtllm.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index e41a436c178..cf72ebac8ec 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -23,7 +23,7 @@ endif () fetchcontent_declare( trtllm GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git - GIT_TAG 32ed92e4491baf2d54682a21d247e1948cca996e + GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc GIT_SHALLOW FALSE ) fetchcontent_makeavailable(trtllm) From eb13d8d1f3f7d8e50773492958477fbd15b52bce Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 10 Oct 2024 12:41:20 +0000 Subject: [PATCH 38/62] (misc): increase verbosity of spdlog --- backends/trtllm/lib/backend.cpp | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index 9c9c5dff554..16f45f5d214 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -8,6 +9,18 @@ #include "hardware.h" void huggingface::tgi::backends::InitializeBackend() { + if(const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")){ + std::string log_level(TRTLLM_LOG_LEVEL_CSTR); + std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + if(log_level == "debug") + spdlog::set_level(spdlog::level::debug); + else + spdlog::set_level(spdlog::level::info); + } + SPDLOG_INFO("Initializing Backend..."); nvmlInit_v2(); initTrtLlmPlugins(); @@ -91,7 +104,13 @@ huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend( [[nodiscard("Returned number of requests needs to be consumed")]] size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const { - return executor.getNumResponsesReady(); + const auto numResponses = executor.getNumResponsesReady(); + +#ifdef NDEBUG + if(numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses); +#endif + + return numResponses; } [[nodiscard("Returned request id needs to be provided back to gather generated tokens")]] @@ -123,10 +142,18 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( const auto maxNewTokensChecked = static_cast( std::min(maxNewTokens, static_cast(maxNumTokens - tokens.size()))); +#ifdef NDEBUG + SPDLOG_INFO( + FMT_STRING("Sampling config: topK={:d}, topP={:d}, temperature={:d}, repetition_penalty={:d}, frequency_penalty={:d}, seed={:d}"), + topK, topP, temperature, repetition_penalty, frequency_penalty, seed + ) + SPDLOG_INFO(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked); +#endif + const auto sampling = GetSamplingConfig(topK, topP, temperature, repetition_penalty, frequency_penalty, seed); return executor.enqueueRequest(tle::Request{tokens, 
maxNewTokensChecked, true, sampling, OUTPUT_CONFIG}); } std::vector huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() { - return std::move(executor.awaitResponses()); + return executor.awaitResponses(); } From c8a99af6c99f48b7efb0cfe8642b2daee27cb276 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 10 Oct 2024 12:41:46 +0000 Subject: [PATCH 39/62] (fix): do not recreate the stateful hashmap at every it --- backends/trtllm/src/looper.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 0428a4dc75e..487c04577c0 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -128,7 +128,6 @@ fn executor_status_looper( } } - // info!("Num response ready: {}", backend.num_responses_ready()); if backend.num_responses_ready() > 0 { match backend.pin_mut().pull_tokens() { Ok(responses) => { @@ -172,20 +171,23 @@ fn post_processor_looper( tokenizer: Tokenizer, mut decoded_tokens: UnboundedReceiver<(u64, InferResult)>, ) { + let mut states: HashMap> = HashMap::with_capacity(128); + 'post_processor: loop { if decoded_tokens.is_closed() { warn!("Post processor IPC is closed, loop will exit now."); break 'post_processor; } - let mut states: HashMap> = HashMap::with_capacity(128); - if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() { - let state = states.entry(request_id).or_insert(vec![]); - match decoded { Ok(ctx) => { - state.push(ctx.token.id); + states.entry(request_id).and_modify(|s| s.push(*&ctx.token.id)).or_insert_with(|| { + let mut state = Vec::with_capacity(128); + state.push(*&ctx.token.id); + state + }); + let out = match tokenizer.decode(&[ctx.token.id], false) { Ok(text) => { let is_special = @@ -203,10 +205,11 @@ fn post_processor_looper( top_tokens: vec![], } } else { - let text = tokenizer.decode(&state, true); + let tokens = states.remove(&request_id).unwrap(); + let text = tokenizer.decode(&tokens, true); let generated_text = GeneratedText { text: text.unwrap(), - generated_tokens: state.len() as u32, + generated_tokens: tokens.len() as u32, finish_reason: FinishReason::EndOfSequenceToken, seed: None, }; From cb69c9a9676679b12537dbc93c686f852eca0e5e Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 10 Oct 2024 12:48:55 +0000 Subject: [PATCH 40/62] (misc): update dependency in trtllm dockerfile --- backends/trtllm/Dockerfile | 4 ++-- backends/trtllm/scripts/install_tensorrt.sh | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/backends/trtllm/Dockerfile b/backends/trtllm/Dockerfile index 79e6e8e9310..21aa9a510ac 100644 --- a/backends/trtllm/Dockerfile +++ b/backends/trtllm/Dockerfile @@ -10,7 +10,7 @@ COPY . . 
RUN cargo chef prepare --recipe-path recipe.json # CUDA dependent dependencies resolver stage -FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 AS cuda-builder +FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu24.04 AS cuda-builder RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ @@ -82,7 +82,7 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$ cd backends/trtllm && \ CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release -FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime +FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu24.04 AS runtime WORKDIR /usr/local/tgi/bin ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" diff --git a/backends/trtllm/scripts/install_tensorrt.sh b/backends/trtllm/scripts/install_tensorrt.sh index e0e2dd17b45..462171b9f11 100755 --- a/backends/trtllm/scripts/install_tensorrt.sh +++ b/backends/trtllm/scripts/install_tensorrt.sh @@ -2,12 +2,12 @@ set -ex -TRT_VER="10.2.0.19" -CUDA_VER="12.5" -CUDNN_VER="9.2.1.18-1" -NCCL_VER="2.22.3-1+cuda12.5" -CUBLAS_VER="12.5.3.2-1" -NVRTC_VER="12.5.82-1" +TRT_VER="10.4.0.26" +CUDA_VER="12.6" +CUDNN_VER="9.5.0.50-1" +NCCL_VER="2.22.3-1+cuda12.6" +CUBLAS_VER="12.6.3.3-1" +NVRTC_VER="12.6.77-1" for i in "$@"; do case $i in @@ -32,8 +32,8 @@ install_ubuntu_requirements() { ARCH=$(uname -m) if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi - curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-keyring_1.0-1_all.deb - dpkg -i cuda-keyring_1.0-1_all.deb + curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH}/cuda-keyring_1.1-1_all.deb + dpkg -i cuda-keyring_1.1-1_all.deb apt-get update if [[ $(apt list --installed | grep libcudnn9) ]]; then From 437c2aa142c365b925d3de990a35049f83f9cee4 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 10 Oct 2024 13:10:05 +0000 Subject: [PATCH 41/62] (misc): update dependency in trtllm dockerfile --- backends/trtllm/scripts/install_tensorrt.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/trtllm/scripts/install_tensorrt.sh b/backends/trtllm/scripts/install_tensorrt.sh index 462171b9f11..f05ef8c99e6 100755 --- a/backends/trtllm/scripts/install_tensorrt.sh +++ b/backends/trtllm/scripts/install_tensorrt.sh @@ -34,6 +34,7 @@ install_ubuntu_requirements() { if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH}/cuda-keyring_1.1-1_all.deb dpkg -i cuda-keyring_1.1-1_all.deb + rm /etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list apt-get update if [[ $(apt list --installed | grep libcudnn9) ]]; then From 0c3ba932ccff20e037f24bc55bd9f62b2210025e Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 10 Oct 2024 14:11:25 +0000 Subject: [PATCH 42/62] (misc): disable logging in release mode --- backends/trtllm/lib/backend.cpp | 14 ++++---------- backends/trtllm/src/ffi.cpp | 2 ++ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index 16f45f5d214..96f5f9f4e38 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -106,7 +106,7 @@ huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend( size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const { const auto numResponses = 
executor.getNumResponsesReady(); -#ifdef NDEBUG +#ifndef NDEBUG if(numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses); #endif @@ -124,13 +124,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( const float_t frequency_penalty, const uint64_t seed ) { -#ifdef NDEBUG - SPDLOG_DEBUG( - FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"), - tokens.size(), - executor.getLatestIterationStats().back().numActiveRequests - ); -#else +#ifndef NDEBUG SPDLOG_DEBUG( FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"), fmt::join(tokens, ", "), @@ -142,7 +136,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( const auto maxNewTokensChecked = static_cast( std::min(maxNewTokens, static_cast(maxNumTokens - tokens.size()))); -#ifdef NDEBUG +#ifndef NDEBUG SPDLOG_INFO( FMT_STRING("Sampling config: topK={:d}, topP={:d}, temperature={:d}, repetition_penalty={:d}, frequency_penalty={:d}, seed={:d}"), topK, topP, temperature, repetition_penalty, frequency_penalty, seed @@ -156,4 +150,4 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( std::vector huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() { return executor.awaitResponses(); -} +} \ No newline at end of file diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index 54c17bc4149..80e74cf7ec1 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -40,7 +40,9 @@ huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() { auto steps = std::make_unique>(); steps->reserve(responses.size()); +#ifndef NDEBUG SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size()); +#endif // Transform tle::Response to GenerationStep std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) { From f9f10a6636352d995a4378fb0f275cab9de15f70 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 10 Oct 2024 14:11:41 +0000 Subject: [PATCH 43/62] (misc): improve trtllm download script robustness --- backends/trtllm/scripts/install_tensorrt.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backends/trtllm/scripts/install_tensorrt.sh b/backends/trtllm/scripts/install_tensorrt.sh index f05ef8c99e6..4c2dc26b6bf 100755 --- a/backends/trtllm/scripts/install_tensorrt.sh +++ b/backends/trtllm/scripts/install_tensorrt.sh @@ -2,7 +2,8 @@ set -ex -TRT_VER="10.4.0.26" +TRT_VER_BASE="10.4.0" +TRT_VER_FULL="${TRT_VER_BASE}.26" CUDA_VER="12.6" CUDNN_VER="9.5.0.50-1" NCCL_VER="2.22.3-1+cuda12.6" @@ -72,7 +73,7 @@ install_centos_requirements() { install_tensorrt() { #PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))') #PARSED_PY_VERSION=$(echo "${PY_VERSION//./}") - TRT_CUDA_VERSION="12.5" + TRT_CUDA_VERSION="12.6" if [ -z "$RELEASE_URL_TRT" ];then ARCH=${TRT_TARGETARCH} @@ -80,12 +81,12 @@ install_tensorrt() { if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi - if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi - RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz + if [ "$ARCH" = "aarch64" ];then 
OS1="Ubuntu22_04" && OS2="Ubuntu-24.04" && OS="ubuntu-24.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi + RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/${TRT_VER_BASE}/tars/TensorRT-${TRT_VER_FULL}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz fi wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar tar -xf /tmp/TensorRT.tar -C /usr/local/ - mv /usr/local/TensorRT-${TRT_VER} /usr/local/tensorrt + mv /usr/local/TensorRT-${TRT_VER_FULL} /usr/local/tensorrt # pip3 install /usr/local/tensorrt/python/tensorrt-*-cp${PARSED_PY_VERSION}-*.whl rm -rf /tmp/TensorRT.tar } From dd94ccc9894331165071d22c447ffe770ad95286 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 10 Oct 2024 16:24:38 +0000 Subject: [PATCH 44/62] (fix): ore fixes for Dockerfile --- backends/trtllm/Dockerfile | 4 ++-- backends/trtllm/build.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/trtllm/Dockerfile b/backends/trtllm/Dockerfile index 21aa9a510ac..6a92b1b1225 100644 --- a/backends/trtllm/Dockerfile +++ b/backends/trtllm/Dockerfile @@ -10,7 +10,7 @@ COPY . . RUN cargo chef prepare --recipe-path recipe.json # CUDA dependent dependencies resolver stage -FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu24.04 AS cuda-builder +FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ @@ -82,7 +82,7 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$ cd backends/trtllm && \ CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release -FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu24.04 AS runtime +FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime WORKDIR /usr/local/tgi/bin ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index e21d49ec4fc..262c7dda83b 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -6,7 +6,7 @@ use std::path::{absolute, PathBuf}; const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST"); -const CUDA_REQUIRED_VERSION: &str = "12.5"; +const CUDA_REQUIRED_VERSION: &str = "12.6"; const MPI_REQUIRED_VERSION: &str = "4.1"; const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX"); const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR"); From 819c9537710d255ad4dcc10d274c61c83b1c1d1a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 17 Oct 2024 13:12:16 +0200 Subject: [PATCH 45/62] misc(cuda): require 12.6 --- backends/trtllm/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 92a6b65a34f..831372cdf99 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -8,6 +8,10 @@ if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug endif () endif () +if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") + cmake_policy(SET CMP0135 NEW) +endif () + project(tgi-trtllm-backend VERSION 1.0.0) set(CMAKE_CXX_STANDARD 20) @@ -22,7 +26,7 @@ set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located") # We are using 
nvidia-ml to query at runtime device information to enable some architecture-specific features -find_package(CUDAToolkit 12.5 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml) +find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml) #### External dependencies #### include(cmake/fmt.cmake) From f20ec288919c5650ec60b56eb0f72ddff7a74372 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 17 Oct 2024 13:12:34 +0200 Subject: [PATCH 46/62] chore(cmake): use correct policy for download_timestamp --- backends/trtllm/cmake/fmt.cmake | 1 + backends/trtllm/cmake/json.cmake | 1 + backends/trtllm/cmake/spdlog.cmake | 1 + backends/trtllm/cmake/trtllm.cmake | 1 + 4 files changed, 4 insertions(+) diff --git a/backends/trtllm/cmake/fmt.cmake b/backends/trtllm/cmake/fmt.cmake index 840280ca8ba..745e11d506e 100644 --- a/backends/trtllm/cmake/fmt.cmake +++ b/backends/trtllm/cmake/fmt.cmake @@ -1,5 +1,6 @@ FetchContent_Declare( fmt URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz + DOWNLOAD_EXTRACT_TIMESTAMP ) FetchContent_MakeAvailable(fmt) diff --git a/backends/trtllm/cmake/json.cmake b/backends/trtllm/cmake/json.cmake index 29e5753b341..0ef9649f13f 100644 --- a/backends/trtllm/cmake/json.cmake +++ b/backends/trtllm/cmake/json.cmake @@ -1,5 +1,6 @@ fetchcontent_declare( json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz + DOWNLOAD_EXTRACT_TIMESTAMP ) fetchcontent_makeavailable(json) diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake index 97a4d449cef..10fb498b858 100644 --- a/backends/trtllm/cmake/spdlog.cmake +++ b/backends/trtllm/cmake/spdlog.cmake @@ -12,5 +12,6 @@ endif () fetchcontent_declare( spdlog URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz + DOWNLOAD_EXTRACT_TIMESTAMP ) fetchcontent_makeavailable(spdlog) diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index cf72ebac8ec..5f1b6c19c01 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -25,6 +25,7 @@ fetchcontent_declare( GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc GIT_SHALLOW FALSE + DOWNLOAD_EXTRACT_TIMESTAMP ) fetchcontent_makeavailable(trtllm) From 629153b44b3a2968bd23bbaf8547700eaf300af4 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 17 Oct 2024 13:13:34 +0200 Subject: [PATCH 47/62] feat(looper): check engine and executorWorker paths exist before creating the backend --- backends/trtllm/src/errors.rs | 5 +++++ backends/trtllm/src/looper.rs | 27 +++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/backends/trtllm/src/errors.rs b/backends/trtllm/src/errors.rs index 8ec6e1afc61..812fd6e30d8 100644 --- a/backends/trtllm/src/errors.rs +++ b/backends/trtllm/src/errors.rs @@ -1,9 +1,14 @@ +use std::path::PathBuf; use thiserror::Error; use text_generation_router::server; #[derive(Debug, Error)] pub enum TensorRtLlmBackendError { + #[error("Provided engine folder {0} doesn't exist")] + EngineFolderDoesntExists(PathBuf), + #[error("Provided executorWorker binary path {0} doesn't exist")] + ExecutorWorkerNotFound(PathBuf), #[error("TensorRT-LLM Runtime error: {0}")] Runtime(String), #[error("Tokenizer error: {0}")] diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 487c04577c0..b6a18ca2cce 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -257,6 +257,33 @@ impl TensorRtLlmBackendV2 { let engine_folder 
= engine_folder.as_ref(); let executor_worker_path = executor_worker_path.as_ref(); + // Ensure the engine folder exists + if !engine_folder.exists() { + let err = + TensorRtLlmBackendError::EngineFolderDoesntExists(engine_folder.to_path_buf()); + + error!( + err, + engine_folder = engine_folder.display(), + executor_worker_path = executor_worker_path.display() + ); + + return Err(err); + } + + // Ensure executor worker binary exists + if !executor_worker_path.exists() { + let err = TensorRtLlmBackendError::ExecutorWorkerNotFound(engine_folder.to_path_buf()); + + error!( + err, + engine_folder = engine_folder.display(), + executor_worker_path = executor_worker_path.display() + ); + + return Err(err); + } + let engine_folder = String::from( engine_folder .to_str() From 027756c52dec937d5f2a32c1063b9dc93e81d54b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 18 Oct 2024 00:07:53 +0200 Subject: [PATCH 48/62] chore(cmake): download timestamp should be before URL --- backends/trtllm/cmake/fmt.cmake | 2 +- backends/trtllm/cmake/json.cmake | 2 +- backends/trtllm/cmake/spdlog.cmake | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/trtllm/cmake/fmt.cmake b/backends/trtllm/cmake/fmt.cmake index 745e11d506e..afd6ea5f090 100644 --- a/backends/trtllm/cmake/fmt.cmake +++ b/backends/trtllm/cmake/fmt.cmake @@ -1,6 +1,6 @@ FetchContent_Declare( fmt - URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz DOWNLOAD_EXTRACT_TIMESTAMP + URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz ) FetchContent_MakeAvailable(fmt) diff --git a/backends/trtllm/cmake/json.cmake b/backends/trtllm/cmake/json.cmake index 0ef9649f13f..67eff2fe606 100644 --- a/backends/trtllm/cmake/json.cmake +++ b/backends/trtllm/cmake/json.cmake @@ -1,6 +1,6 @@ fetchcontent_declare( json - URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz DOWNLOAD_EXTRACT_TIMESTAMP + URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz ) fetchcontent_makeavailable(json) diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake index 10fb498b858..7f529a7d29e 100644 --- a/backends/trtllm/cmake/spdlog.cmake +++ b/backends/trtllm/cmake/spdlog.cmake @@ -11,7 +11,7 @@ endif () fetchcontent_declare( spdlog - URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz DOWNLOAD_EXTRACT_TIMESTAMP + URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz ) fetchcontent_makeavailable(spdlog) From 6687c06a21e76d40bdf14911b235116c269edf4d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 18 Oct 2024 00:09:45 +0200 Subject: [PATCH 49/62] feat(looper): minor optimizations to avoid growing too much the containers --- backends/trtllm/src/looper.rs | 126 ++++++++++++++++++++-------------- backends/trtllm/src/main.rs | 15 ++-- 2 files changed, 82 insertions(+), 59 deletions(-) diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index b6a18ca2cce..beae8e8e13b 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -79,11 +79,13 @@ struct DecodedTokenContext { fn executor_status_looper( mut backend: UniquePtr, + max_inflight_requests: usize, mut waiting_requests: UnboundedReceiver, post_processor_sender: UnboundedSender<(u64, InferResult)>, ) { // Track the tuple (request_id, stream) for each request - let mut in_flights = HashMap::::with_capacity(128); + let mut in_flights = + HashMap::::with_capacity(max_inflight_requests * 2); // TODO: Does it need a spin-loop? 
'scheduler: loop { @@ -169,9 +171,11 @@ fn executor_status_looper( fn post_processor_looper( tokenizer: Tokenizer, + max_num_tokens: usize, + max_inflight_requests: usize, mut decoded_tokens: UnboundedReceiver<(u64, InferResult)>, ) { - let mut states: HashMap> = HashMap::with_capacity(128); + let mut states: HashMap> = HashMap::with_capacity(max_inflight_requests * 2); 'post_processor: loop { if decoded_tokens.is_closed() { @@ -182,11 +186,14 @@ fn post_processor_looper( if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() { match decoded { Ok(ctx) => { - states.entry(request_id).and_modify(|s| s.push(*&ctx.token.id)).or_insert_with(|| { - let mut state = Vec::with_capacity(128); - state.push(*&ctx.token.id); - state - }); + states + .entry(request_id) + .and_modify(|s| s.push(*&ctx.token.id)) + .or_insert_with(|| { + let mut state = Vec::with_capacity(max_num_tokens); + state.push(*&ctx.token.id); + state + }); let out = match tokenizer.decode(&[ctx.token.id], false) { Ok(text) => { @@ -232,12 +239,53 @@ fn post_processor_looper( warn!("Failed to send decoded token back to the user") } } - Err(err) => {} + Err(_err) => { + todo!("what do we do?") + } } } } } +fn ensure_paths_exist, PP: AsRef>( + engine_folder: P, + executor_worker_path: PP, +) -> Result<(String, String), TensorRtLlmBackendError> { + // Retrieve paths as &str for the backend creation + let engine_folder = engine_folder.as_ref(); + let executor_worker_path = executor_worker_path.as_ref(); + + // Ensure the engine folder exists + if !engine_folder.exists() { + let err = TensorRtLlmBackendError::EngineFolderDoesntExists(engine_folder.to_path_buf()); + + error!("Path validation failed: {}", err,); + return Err(err); + } + + // Ensure executor worker binary exists + if !executor_worker_path.exists() { + let err = TensorRtLlmBackendError::ExecutorWorkerNotFound(engine_folder.to_path_buf()); + + error!("Path validation failed: {}", err,); + return Err(err); + } + + let engine_folder = String::from( + engine_folder + .to_str() + .expect("Failed to convert engine_folder to valid UTF-8"), + ); + + let executor_worker_path = String::from( + executor_worker_path + .to_str() + .expect("Failed to convert executor_worker_path to valid UTF-8"), + ); + + Ok((engine_folder, executor_worker_path)) +} + unsafe impl Send for TensorRtLlmBackendImpl {} pub struct TensorRtLlmBackendV2 { @@ -252,49 +300,10 @@ impl TensorRtLlmBackendV2 { tokenizer: Tokenizer, engine_folder: P, executor_worker_path: PP, + max_inflight_requests: usize, ) -> Result { - // Retrieve paths as &str for the backend creation - let engine_folder = engine_folder.as_ref(); - let executor_worker_path = executor_worker_path.as_ref(); - - // Ensure the engine folder exists - if !engine_folder.exists() { - let err = - TensorRtLlmBackendError::EngineFolderDoesntExists(engine_folder.to_path_buf()); - - error!( - err, - engine_folder = engine_folder.display(), - executor_worker_path = executor_worker_path.display() - ); - - return Err(err); - } - - // Ensure executor worker binary exists - if !executor_worker_path.exists() { - let err = TensorRtLlmBackendError::ExecutorWorkerNotFound(engine_folder.to_path_buf()); - - error!( - err, - engine_folder = engine_folder.display(), - executor_worker_path = executor_worker_path.display() - ); - - return Err(err); - } - - let engine_folder = String::from( - engine_folder - .to_str() - .expect("Failed to convert engine_folder to valid UTF-8"), - ); - - let executor_worker_path = String::from( - executor_worker_path - .to_str() - 
.expect("Failed to convert executor_worker_path to valid UTF-8"), - ); + let (engine_folder, executor_worker_path) = + ensure_paths_exist(engine_folder, executor_worker_path)?; // Allocate the IPC layer to communicate with the backend let (executor_sender, executor_receiver) = unbounded_channel(); @@ -306,13 +315,24 @@ impl TensorRtLlmBackendV2 { // Executor looper is responsible for scheduling and pulling requests state at regular interval let executor_looper = spawn_blocking(move || { - executor_status_looper(backend, executor_receiver, post_processor_sender) + executor_status_looper( + backend, + max_inflight_requests, + executor_receiver, + post_processor_sender, + ) }); // Post processor looper is responsible from receiving a bunch of tokens, decoding them and sending them back to the user let tokenizer_ = tokenizer.clone(); - let post_processor_looper = - spawn_blocking(move || post_processor_looper(tokenizer_, post_processor_receiver)); + let post_processor_looper = spawn_blocking(move || { + post_processor_looper( + tokenizer_, + 512, + max_inflight_requests, + post_processor_receiver, + ) + }); Ok(TensorRtLlmBackendV2 { tokenizer, diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index e78134b94d1..92712988652 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -1,17 +1,15 @@ use std::path::{Path, PathBuf}; use clap::Parser; -use hf_hub::{Cache, Repo, RepoType}; use hf_hub::api::tokio::{Api, ApiBuilder}; +use hf_hub::{Cache, Repo, RepoType}; use tokenizers::Tokenizer; use tracing::info; use text_generation_backends_trtllm::errors::TensorRtLlmBackendError; use text_generation_backends_trtllm::TensorRtLlmBackendV2; -use text_generation_router::{HubTokenizerConfig, server}; -use text_generation_router::server::{ - create_post_processor, get_base_tokenizer, -}; +use text_generation_router::server::{create_post_processor, get_base_tokenizer}; +use text_generation_router::{server, HubTokenizerConfig}; /// App Configuration #[derive(Parser, Debug)] @@ -282,7 +280,12 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { .expect("Failed to retrieve tokenizer implementation"); info!("Successfully retrieved tokenizer {}", &tokenizer_name); - let backend = TensorRtLlmBackendV2::new(tokenizer, model_id, executor_worker)?; + let backend = TensorRtLlmBackendV2::new( + tokenizer, + model_id, + executor_worker, + max_concurrent_requests, + )?; info!("Successfully created backend"); From 62f33d7ecdc13103d911c66066c3829fdda3c7c5 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 09:25:13 +0200 Subject: [PATCH 50/62] chore(trtllm): move dockerfile to right place --- backends/trtllm/Dockerfile => Dockerfile_trtllm | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename backends/trtllm/Dockerfile => Dockerfile_trtllm (100%) diff --git a/backends/trtllm/Dockerfile b/Dockerfile_trtllm similarity index 100% rename from backends/trtllm/Dockerfile rename to Dockerfile_trtllm From e3bce407be7ffcd15afc5dc3837653670da06647 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 09:25:31 +0200 Subject: [PATCH 51/62] chore(trtllm): disable tokenizer parallelism by default --- Dockerfile_trtllm | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index 6a92b1b1225..b5bd91d8666 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -86,6 +86,7 @@ FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime WORKDIR /usr/local/tgi/bin ENV 
LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" +ENV TOKENIZERS_PARALLELISM=false ENV OMPI_MCA_plm_rsh_agent="" COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi From 85c03e33a91cc3b2df47da693d780abd6e459a56 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 09:38:51 +0200 Subject: [PATCH 52/62] chore(trtllm): fmt --- backends/trtllm/build.rs | 2 +- backends/trtllm/src/lib.rs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index 262c7dda83b..985019260b8 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -84,7 +84,7 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { let ndebug = match is_debug { true => "1", - false => "0" + false => "0", }; CFG.include_prefix = "backends/trtllm"; diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index 6d3297d662c..edd8caff154 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -65,5 +65,4 @@ mod ffi { self: Pin<&mut TensorRtLlmBackendImpl>, ) -> Result>>; } - } From fb00f985ae0a82ebbc8683fad8f94dbe73c58f3f Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 12:31:24 +0200 Subject: [PATCH 53/62] chore(trtllm): post-rebase commit --- Cargo.lock | 68 ++++++++++--------------------------- backends/trtllm/src/main.rs | 4 +-- 2 files changed, 19 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5e85e384869..c1251832983 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2706,9 +2706,9 @@ dependencies = [ [[package]] name = "opentelemetry" -version = "0.23.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b69a91d4893e713e06f724597ad630f1fa76057a5e1026c0ca67054a9032a76" +checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96" dependencies = [ "futures-core", "futures-sink", @@ -2819,19 +2819,17 @@ dependencies = [ [[package]] name = "opentelemetry_sdk" -version = "0.23.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae312d58eaa90a82d2e627fd86e075cf5230b3f11794e2ed74199ebbe572d4fd" +checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df" dependencies = [ "async-trait", "futures-channel", "futures-executor", "futures-util", "glob", - "lazy_static", "once_cell", - "opentelemetry 0.23.0", - "ordered-float 4.3.0", + "opentelemetry 0.24.0", "percent-encoding", "rand", "thiserror", @@ -4185,16 +4183,17 @@ dependencies = [ "cmake", "cxx", "cxx-build", + "hashbrown 0.14.5", + "hf-hub", "log", - "parking_lot", "pkg-config", "text-generation-router", "thiserror", - "tokenizers 0.19.1", + "tokenizers", "tokio", "tokio-stream", "tracing", - "tracing-opentelemetry 0.24.0", + "tracing-opentelemetry 0.25.0", "tracing-subscriber", ] @@ -4212,7 +4211,7 @@ dependencies = [ "tabled", "text-generation-client", "thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tracing", "tracing-subscriber", @@ -4292,7 +4291,7 @@ dependencies = [ "serde_json", "sysinfo", "thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tokio-stream", "tower-http", @@ -4341,7 +4340,7 @@ dependencies = [ "slotmap", "text-generation-router", "thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tokio-stream", "tonic 0.10.2", @@ -4392,7 +4391,7 @@ dependencies = [ "slotmap", "text-generation-router", 
"thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tokio-stream", "tonic 0.10.2", @@ -4514,39 +4513,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tokenizers" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd" -dependencies = [ - "aho-corasick", - "derive_builder", - "esaxx-rs", - "getrandom", - "hf-hub", - "indicatif", - "itertools 0.12.1", - "lazy_static", - "log", - "macro_rules_attribute", - "monostate", - "onig", - "paste", - "rand", - "rayon", - "rayon-cond", - "regex", - "regex-syntax 0.8.5", - "serde", - "serde_json", - "spm_precompiled", - "thiserror", - "unicode-normalization-alignments", - "unicode-segmentation", - "unicode_categories", -] - [[package]] name = "tokenizers" version = "0.20.0" @@ -4933,14 +4899,14 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f68803492bf28ab40aeccaecc7021096bd256baf7ca77c3d425d89b35a7be4e4" +checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b" dependencies = [ "js-sys", "once_cell", - "opentelemetry 0.23.0", - "opentelemetry_sdk 0.23.0", + "opentelemetry 0.24.0", + "opentelemetry_sdk 0.24.1", "smallvec", "tracing", "tracing-core", diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index 92712988652..3573fe4136c 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -9,6 +9,7 @@ use tracing::info; use text_generation_backends_trtllm::errors::TensorRtLlmBackendError; use text_generation_backends_trtllm::TensorRtLlmBackendV2; use text_generation_router::server::{create_post_processor, get_base_tokenizer}; +use text_generation_router::usage_stats::UsageStatsLevel; use text_generation_router::{server, HubTokenizerConfig}; /// App Configuration @@ -312,8 +313,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { messages_api_enabled, true, max_client_batch_size, - false, - false, + UsageStatsLevel::Off, ) .await?; Ok(()) From 31747163e726541920f11068583dd92236bafa6b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 14:10:23 +0200 Subject: [PATCH 54/62] chore(trtllm): remove unused method --- backends/trtllm/include/backend.h | 6 ------ backends/trtllm/lib/backend.cpp | 24 +++++++----------------- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h index 83e862c558e..2864021ed32 100644 --- a/backends/trtllm/include/backend.h +++ b/backends/trtllm/include/backend.h @@ -72,12 +72,6 @@ namespace huggingface::tgi::backends { const std::filesystem::path &executorWorker ); - /** - * Query the executor for the number of token available for pulling - * @return - */ - [[nodiscard]] size_t NumResponsesReady() const; - /** * Submit a new generation task to the executor * @param tokens diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index 96f5f9f4e38..e2e0cbeab57 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -9,13 +9,13 @@ #include "hardware.h" void huggingface::tgi::backends::InitializeBackend() { - if(const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")){ + if (const auto TRTLLM_LOG_LEVEL_CSTR = 
std::getenv("TRTLLM_LOG_LEVEL")) { std::string log_level(TRTLLM_LOG_LEVEL_CSTR); - std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { + std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { return std::tolower(c); }); - if(log_level == "debug") + if (log_level == "debug") spdlog::set_level(spdlog::level::debug); else spdlog::set_level(spdlog::level::info); @@ -102,17 +102,6 @@ huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend( SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref()); } -[[nodiscard("Returned number of requests needs to be consumed")]] -size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const { - const auto numResponses = executor.getNumResponsesReady(); - -#ifndef NDEBUG - if(numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses); -#endif - - return numResponses; -} - [[nodiscard("Returned request id needs to be provided back to gather generated tokens")]] tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( const std::vector &tokens, @@ -138,10 +127,11 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( #ifndef NDEBUG SPDLOG_INFO( - FMT_STRING("Sampling config: topK={:d}, topP={:d}, temperature={:d}, repetition_penalty={:d}, frequency_penalty={:d}, seed={:d}"), - topK, topP, temperature, repetition_penalty, frequency_penalty, seed + FMT_STRING( + "Sampling config: topK={:d}, topP={:d}, temperature={:d}, repetition_penalty={:d}, frequency_penalty={:d}, seed={:d}"), + topK, topP, temperature, repetition_penalty, frequency_penalty, seed ) - SPDLOG_INFO(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked); + SPDLOG_INFO(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked); #endif const auto sampling = GetSamplingConfig(topK, topP, temperature, repetition_penalty, frequency_penalty, seed); From e6da212431dd19196d519376b34372e293fe3647 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 14:51:58 +0200 Subject: [PATCH 55/62] feat(trtllm): cache maxNumTokens to avoid calling JSON everytime --- backends/trtllm/include/backend.h | 19 +++++++++++++------ backends/trtllm/lib/backend.cpp | 31 ++++++++++++++----------------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h index 2864021ed32..7e6b8ab979c 100644 --- a/backends/trtllm/include/backend.h +++ b/backends/trtllm/include/backend.h @@ -24,6 +24,10 @@ namespace huggingface::tgi::backends { using TokenId = tle::TokenIdType; const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false); + constexpr auto FMT_EXECUTOR_STATS = FMT_STRING( + "Submitting inference [{}] to the executor ({:d} already in-flight)"); + constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING( + "Sampling: topK={:d}, topP={:.1f}, temperature={:.1f}, repetition_penalty={:.1f}, frequency_penalty={:.1f}, seed={:d}"); /** * Initialize all the components required by TRTLLM. 
@@ -50,12 +54,12 @@ namespace huggingface::tgi::backends { * @return */ tle::SamplingConfig GetSamplingConfig( - const uint32_t topK, - const float_t topP, - const float_t temperature, - const float_t repetition_penalty, - const float_t frequency_penalty, - const uint64_t seed + uint32_t topK, + float_t topP, + float_t temperature, + float_t repetition_penalty, + float_t frequency_penalty, + uint64_t seed ) noexcept; /** @@ -66,6 +70,9 @@ namespace huggingface::tgi::backends { const json config; tle::Executor executor; + /** Frequently accessed variables cached here **/ + uint32_t maxNumTokens; + public: explicit TensorRtLlmBackend( const std::filesystem::path &engineFolder, diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index e2e0cbeab57..269b381f97f 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -75,6 +75,7 @@ tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig( const float_t repetition_penalty, const float_t frequency_penalty, const uint64_t seed) noexcept { + return tle::SamplingConfig( 1, // TGI only use a single beam topK, @@ -100,6 +101,9 @@ huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend( executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, GetExecutorConfig(config, executorWorker.string())) { SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref()); + + // Cache variables + maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get(); } [[nodiscard("Returned request id needs to be provided back to gather generated tokens")]] @@ -113,29 +117,22 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( const float_t frequency_penalty, const uint64_t seed ) { + const auto maxNewTokensChecked = std::min(maxNewTokens, static_cast(maxNumTokens - tokens.size())); #ifndef NDEBUG - SPDLOG_DEBUG( - FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"), - fmt::join(tokens, ", "), - executor.getLatestIterationStats().front().numActiveRequests - ); -#endif + { + const auto &iterations = executor.getLatestIterationStats(); + const auto &lastIteration = iterations.front(); + SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests); - const auto maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get(); - const auto maxNewTokensChecked = static_cast( - std::min(maxNewTokens, static_cast(maxNumTokens - tokens.size()))); -#ifndef NDEBUG - SPDLOG_INFO( - FMT_STRING( - "Sampling config: topK={:d}, topP={:d}, temperature={:d}, repetition_penalty={:d}, frequency_penalty={:d}, seed={:d}"), - topK, topP, temperature, repetition_penalty, frequency_penalty, seed - ) - SPDLOG_INFO(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked); + SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetition_penalty, frequency_penalty, seed); + SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked); + } #endif const auto sampling = GetSamplingConfig(topK, topP, temperature, repetition_penalty, frequency_penalty, seed); - return executor.enqueueRequest(tle::Request{tokens, maxNewTokensChecked, true, sampling, OUTPUT_CONFIG}); + const auto maxNewTokensChecked_ = static_cast(maxNewTokensChecked); + return executor.enqueueRequest(tle::Request{tokens, maxNewTokensChecked_, true, sampling, OUTPUT_CONFIG}); } std::vector huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() { From 
1a3da05f342277f363501a0dadca370ed93c7cea Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 14:57:19 +0200 Subject: [PATCH 56/62] misc(router): remove SchedulingError --- backends/trtllm/src/looper.rs | 3 ++- router/src/infer/mod.rs | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index beae8e8e13b..1411a8ea698 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -8,6 +8,7 @@ use hashbrown::HashMap; use log::warn; use tokenizers::{Encoding, Tokenizer}; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; +use tokio::sync::TryAcquireError; use tokio::task::{spawn_blocking, JoinHandle}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; @@ -121,7 +122,7 @@ fn executor_status_looper( let what = e.to_string(); error!(error = what.as_str(), "Failed to schedule request"); - let err = Err(InferError::SchedulingError(what)); + let err = Err(InferError::Overloaded(TryAcquireError::NoPermits)); if let Err(_) = ctx.streamer.send(err) { error!("Failed to send back error to the client"); } diff --git a/router/src/infer/mod.rs b/router/src/infer/mod.rs index 39b6f8cff55..896f4f4318f 100644 --- a/router/src/infer/mod.rs +++ b/router/src/infer/mod.rs @@ -357,8 +357,6 @@ pub enum InferError { ToolError(String), #[error("Stream event serialization error")] StreamSerializationError(String), - #[error("Scheduling error: {0}")] - SchedulingError(String), } impl InferError { @@ -373,7 +371,6 @@ impl InferError { InferError::MissingTemplateVariable(_) => "missing_template_variable", InferError::ToolError(_) => "tool_error", InferError::StreamSerializationError(_) => "stream_serialization_error", - InferError::SchedulingError(_) => "schedling" } } } From 8d1c3c8ad445c540fad2f642be127f3c85bdb96c Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 15:06:54 +0200 Subject: [PATCH 57/62] feat(trtllm): do not tokenize twice --- backends/trtllm/src/looper.rs | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 1411a8ea698..d97fa69fa24 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -33,15 +33,9 @@ struct IdentifiableRequest { inner: T, } -/// Wrap the TGI server forwarded ValidGenerateRequest with the tokenized view of the prompt -struct ValidGenerateRequestWithTokens { - encoding: Encoding, - inner: ValidGenerateRequest, -} - /// Wrap the requests along with the channel used to stream back to the client the decoded tokens struct GenerationContext { - request: ValidGenerateRequestWithTokens, + request: ValidGenerateRequest, start: Option, queued: Instant, streamer: UnboundedSender>, @@ -97,12 +91,13 @@ fn executor_status_looper( if let Some(mut ctx) = waiting_requests.blocking_recv() { // Submit all the request to the executor and move the context to the in-flight tracker let request = &ctx.request; - let generation_params = &request.inner.parameters; - let stopping_params = &request.inner.stopping_parameters; + let generation_params = &request.parameters; + let stopping_params = &request.stopping_parameters; + let input_ids = request.input_ids.as_deref(); // Submit to the TensorRT-LLM executor for scheduling match backend.pin_mut().submit( - request.encoding.get_ids(), + &input_ids.unwrap(), // This is checked beforehand in validate() stopping_params.max_new_tokens, 
generation_params.top_k as i32, generation_params.top_p, @@ -343,7 +338,11 @@ impl TensorRtLlmBackendV2 { }) } - fn validate(request: &ValidGenerateRequest) -> InferResult<&String> { + fn validate(request: &ValidGenerateRequest) -> InferResult<()> { + if request.input_ids.is_none() { + return Err(ValidationError(UnsupportedModality("No token provided"))); + } + if request.top_n_tokens > 1 { return Err(ValidationError(TopNTokensDisabled)); } @@ -359,7 +358,7 @@ impl TensorRtLlmBackendV2 { "TensorRT-LLM backend don't support multi-chunk".into(), )), 1 => match request.inputs.first().expect("Single item-chunk") { - Chunk::Text(text) => Ok(text), + Chunk::Text(text) => Ok(()), Chunk::Image(_) => Err(ValidationError(UnsupportedModality("image"))), }, } @@ -372,15 +371,7 @@ impl Backend for TensorRtLlmBackendV2 { &self, inner: ValidGenerateRequest, ) -> Result>, InferError> { - let prompt = Self::validate(&inner)?; - - // We encode the prompt in every request context/thread - let encoding = self - .tokenizer - .encode(prompt.as_str(), true) - .map_err(|e| GenerationError(format!("Tokenization failed {}", e.to_string())))?; - - let request = ValidGenerateRequestWithTokens { encoding, inner }; + Self::validate(&inner)?; // Open-up the stream to send tokens let (streamer, receiver) = unbounded_channel::>(); @@ -388,7 +379,7 @@ impl Backend for TensorRtLlmBackendV2 { // Send the context to the executor for scheduling let queued = Instant::now(); match self.executor.send(GenerationContext { - request, + request: inner, start: None, queued, streamer, From f5b9ee368af1ae37833ac7b02c6cb7bff5e7e33d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 17:03:35 +0200 Subject: [PATCH 58/62] Revert "chore(trtllm): remove unused method" This reverts commit 31747163 --- backends/trtllm/include/backend.h | 6 ++++++ backends/trtllm/lib/backend.cpp | 14 ++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h index 7e6b8ab979c..5b2963a8cdc 100644 --- a/backends/trtllm/include/backend.h +++ b/backends/trtllm/include/backend.h @@ -79,6 +79,12 @@ namespace huggingface::tgi::backends { const std::filesystem::path &executorWorker ); + /** + * Query the executor for the number of token available for pulling + * @return + */ + [[nodiscard]] size_t NumResponsesReady() const; + /** * Submit a new generation task to the executor * @param tokens diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index 269b381f97f..72a75e2a84d 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -106,6 +106,17 @@ huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend( maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get(); } +[[nodiscard("Returned number of requests needs to be consumed")]] +size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const { + const auto numResponses = executor.getNumResponsesReady(); + +#ifndef NDEBUG + if(numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses); +#endif + + return numResponses; +} + [[nodiscard("Returned request id needs to be provided back to gather generated tokens")]] tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( const std::vector &tokens, @@ -122,9 +133,8 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( { const auto &iterations = executor.getLatestIterationStats(); const auto &lastIteration = 
iterations.front(); - SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests); - + SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests); SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetition_penalty, frequency_penalty, seed); SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked); } From d73401ac7397440496019874176b02d11652f3c0 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 21:44:28 +0200 Subject: [PATCH 59/62] chore(rebase): fix invalid references --- backends/trtllm/src/looper.rs | 12 ++++-------- backends/trtllm/src/main.rs | 27 +++++---------------------- 2 files changed, 9 insertions(+), 30 deletions(-) diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index d97fa69fa24..95ba16a9396 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -5,14 +5,13 @@ use std::path::Path; use async_trait::async_trait; use cxx::UniquePtr; use hashbrown::HashMap; -use log::warn; -use tokenizers::{Encoding, Tokenizer}; +use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; use tokio::sync::TryAcquireError; use tokio::task::{spawn_blocking, JoinHandle}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{debug, error, info}; +use tracing::{debug, error, warn}; use text_generation_router::infer::InferError::{GenerationError, ValidationError}; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; @@ -285,7 +284,6 @@ fn ensure_paths_exist, PP: AsRef>( unsafe impl Send for TensorRtLlmBackendImpl {} pub struct TensorRtLlmBackendV2 { - tokenizer: Tokenizer, executor_looper: JoinHandle<()>, post_processor_looper: JoinHandle<()>, executor: UnboundedSender, @@ -320,10 +318,9 @@ impl TensorRtLlmBackendV2 { }); // Post processor looper is responsible from receiving a bunch of tokens, decoding them and sending them back to the user - let tokenizer_ = tokenizer.clone(); let post_processor_looper = spawn_blocking(move || { post_processor_looper( - tokenizer_, + tokenizer, 512, max_inflight_requests, post_processor_receiver, @@ -331,7 +328,6 @@ impl TensorRtLlmBackendV2 { }); Ok(TensorRtLlmBackendV2 { - tokenizer, executor_looper, post_processor_looper, executor: executor_sender, @@ -358,7 +354,7 @@ impl TensorRtLlmBackendV2 { "TensorRT-LLM backend don't support multi-chunk".into(), )), 1 => match request.inputs.first().expect("Single item-chunk") { - Chunk::Text(text) => Ok(()), + Chunk::Text(_) => Ok(()), Chunk::Image(_) => Err(ValidationError(UnsupportedModality("image"))), }, } diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index 3573fe4136c..ec54ccce722 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -8,7 +8,7 @@ use tracing::info; use text_generation_backends_trtllm::errors::TensorRtLlmBackendError; use text_generation_backends_trtllm::TensorRtLlmBackendV2; -use text_generation_router::server::{create_post_processor, get_base_tokenizer}; +use text_generation_router::server::get_base_tokenizer; use text_generation_router::usage_stats::UsageStatsLevel; use text_generation_router::{server, HubTokenizerConfig}; @@ -125,10 +125,10 @@ async fn get_tokenizer( // Load tokenizer and model info let ( tokenizer_filename, - config_filename, + _config_filename, tokenizer_config_filename, - preprocessor_config_filename, - processor_config_filename, + 
_preprocessor_config_filename, + _processor_config_filename, ) = match api { Type::None => ( Some(local_path.join("tokenizer.json")), @@ -184,25 +184,8 @@ async fn get_tokenizer( } else { tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) }; - let tokenizer_config = tokenizer_config.unwrap_or_else(|| { - tracing::warn!("Could not find tokenizer config locally and no API specified"); - HubTokenizerConfig::default() - }); - tokenizer_filename.and_then(|filename| { - let mut tokenizer = Tokenizer::from_file(filename).ok(); - if let Some(tokenizer) = &mut tokenizer { - if let Some(class) = &tokenizer_config.tokenizer_class { - if class == "LlamaTokenizer" || class == "LlamaTokenizerFast"{ - if let Ok(post_processor) = create_post_processor(tokenizer, &tokenizer_config) { - tracing::info!("Overriding LlamaTokenizer with TemplateProcessing to follow python override defined in https://github.com/huggingface/transformers/blob/4aa17d00690b7f82c95bb2949ea57e22c35b4336/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205"); - tokenizer.with_post_processor(post_processor); - } - } - } - } - tokenizer - }) + tokenizer_filename.and_then(|filename| Tokenizer::from_file(filename).ok()) } #[tokio::main] From 18b473b019c7e3279f515570fe66ef4b2ef9fa10 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 22 Oct 2024 09:51:50 +0200 Subject: [PATCH 60/62] chore(router): add python dependency --- Dockerfile_trtllm | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index b5bd91d8666..3185ea800ed 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -26,6 +26,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ ninja-build \ pkg-config \ python3 \ + python3-dev \ python3-setuptools \ tar \ wget @@ -83,6 +84,9 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$ CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime +RUN apt update && apt install -y python3 && \ + rm -rf /var/lib/{apt,dpkg,cache,log}/ + WORKDIR /usr/local/tgi/bin ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" From b4b6322ede3202030760088e51b8dd65a66d6f81 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 25 Oct 2024 07:10:34 +0200 Subject: [PATCH 61/62] Lint. --- backends/trtllm/lib/backend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index 72a75e2a84d..f369e1b74b4 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -147,4 +147,4 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( std::vector huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() { return executor.awaitResponses(); -} \ No newline at end of file +} From 4463856cc78bd1df4488b442bec59dba18ad15f5 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 25 Oct 2024 07:14:41 +0200 Subject: [PATCH 62/62] Fix bad rebase --- backends/trtllm/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index c13c95b9e76..6a247fc1d52 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -295,7 +295,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { None, true, max_client_batch_size, - UsageStatsLevel::Off, + usage_stats, ) .await?; Ok(())
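
The sketches that follow are illustrative only and are not part of the patch series above; they are minimal Rust examples, with invented names where noted, of behaviours touched by some of the patches. Patch 35 fixes HOPPER_SM_MAJOR from 8 to 9: Ampere parts report compute capability 8.x (A100 is 8.0, A10/A40 are 8.6) while Hopper parts report 9.x (H100 is 9.0), so defining HOPPER_SM_MAJOR as 8 conflates the two generations. A hedged sketch of that mapping, with a hypothetical arch_name helper:

    const AMPERE_SM_MAJOR: u32 = 8;
    const HOPPER_SM_MAJOR: u32 = 9;

    // Hypothetical helper: label a device from its compute-capability major version.
    fn arch_name(sm_major: u32) -> &'static str {
        match sm_major {
            m if m >= HOPPER_SM_MAJOR => "hopper-or-newer",
            AMPERE_SM_MAJOR => "ampere",
            7 => "volta/turing",
            _ => "pre-volta",
        }
    }

    fn main() {
        assert_eq!(arch_name(9), "hopper-or-newer"); // H100
        assert_eq!(arch_name(8), "ampere");          // A100 / A10
        println!("sm major 9 -> {}", arch_name(9));
    }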
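
Patch 38 makes the C++ backend read TRTLLM_LOG_LEVEL and switch spdlog to debug when the value equals "debug" (case-insensitively), defaulting to info otherwise. A Rust sketch of the same env-driven selection, purely as an analogue; the real code path is the C++ InitializeBackend shown above:

    use std::env;

    // Map TRTLLM_LOG_LEVEL to an effective level, defaulting to "info".
    fn resolve_log_level() -> &'static str {
        match env::var("TRTLLM_LOG_LEVEL").map(|v| v.to_lowercase()) {
            Ok(v) if v == "debug" => "debug",
            _ => "info",
        }
    }

    fn main() {
        println!("effective log level: {}", resolve_log_level());
    }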
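
Patches 39 and 49 fix and then tune the post-processor state: the HashMap holding per-request token ids was being recreated on every loop iteration, so earlier tokens of an in-flight request were lost; the map now lives outside the loop, is keyed by request id through the entry API, and is pre-sized from max_inflight_requests. A self-contained sketch of that pattern, using std channels in place of the tokio unbounded channel used by the real looper:

    use std::collections::HashMap;
    use std::sync::mpsc;

    fn main() {
        let (tx, rx) = mpsc::channel::<(u64, u32)>(); // (request_id, token_id)

        // Two requests interleaving their tokens on the same channel.
        for &(id, tok) in &[(1, 10), (2, 77), (1, 11), (1, 12), (2, 78)] {
            tx.send((id, tok)).unwrap();
        }
        drop(tx);

        // The map must outlive the receive loop; a map created inside the loop
        // would start empty for every message and drop previously seen tokens.
        let mut states: HashMap<u64, Vec<u32>> = HashMap::with_capacity(128);

        while let Ok((request_id, token_id)) = rx.recv() {
            states
                .entry(request_id)
                .and_modify(|s| s.push(token_id))
                .or_insert_with(|| {
                    let mut s = Vec::with_capacity(128); // pre-size to limit reallocations
                    s.push(token_id);
                    s
                });
        }

        assert_eq!(states[&1], vec![10, 11, 12]);
        assert_eq!(states[&2], vec![77, 78]);
    }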
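
Patches 47 through 49 settle the V2 backend around two blocking loops joined by channels: an executor loop that tracks in-flight requests by id and forwards raw generation steps, and a post-processor loop that turns token ids into text and streams it back to the right client. Below is a condensed, std-only sketch of that shape; the real backend uses tokio unbounded channels, spawn_blocking and the cxx-wrapped TensorRtLlmBackendImpl, and fake_executor_step is a stand-in invented for the example:

    use std::collections::HashMap;
    use std::sync::mpsc::{channel, Receiver, Sender};
    use std::thread;

    struct Request {
        id: u64,
        prompt: Vec<u32>,
        reply: Sender<String>,
    }

    // Stand-in for the executor: "generates" by echoing the prompt tokens.
    fn fake_executor_step(prompt: &[u32]) -> Vec<u32> {
        prompt.to_vec()
    }

    fn executor_loop(requests: Receiver<Request>, decoded: Sender<(u64, u32, Sender<String>)>) {
        // Mirrors the in_flights map: generation steps are keyed by request id,
        // so the loop must remember which client channel each id belongs to.
        let mut in_flight: HashMap<u64, Sender<String>> = HashMap::new();
        for req in requests {
            in_flight.insert(req.id, req.reply.clone());
            for tok in fake_executor_step(&req.prompt) {
                let reply = in_flight[&req.id].clone();
                decoded.send((req.id, tok, reply)).unwrap();
            }
        }
    }

    fn post_processor_loop(decoded: Receiver<(u64, u32, Sender<String>)>) {
        for (request_id, token_id, reply) in decoded {
            // The real loop calls tokenizer.decode(); here we only stringify.
            let _ = reply.send(format!("req {request_id}: token {token_id}"));
        }
    }

    fn main() {
        let (req_tx, req_rx) = channel();
        let (dec_tx, dec_rx) = channel();
        let executor = thread::spawn(move || executor_loop(req_rx, dec_tx));
        let post_processor = thread::spawn(move || post_processor_loop(dec_rx));

        let (reply_tx, reply_rx) = channel();
        req_tx
            .send(Request { id: 1, prompt: vec![5, 6, 7], reply: reply_tx })
            .unwrap();
        drop(req_tx);

        for line in reply_rx {
            println!("{line}");
        }
        executor.join().unwrap();
        post_processor.join().unwrap();
    }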
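
Patch 55 caches max_num_tokens from the engine's build_config once at backend construction instead of re-parsing the JSON on every Submit, and clamps the requested max_new_tokens so that prompt plus generation stays within that budget. The clamping arithmetic as a hedged sketch; checked_max_new_tokens is an invented name, and the real code additionally casts the result to the executor's SizeType32:

    // Illustrative only: clamp a request's max_new_tokens against an engine-wide
    // token budget read once from the engine config (build_config.max_num_tokens).
    fn checked_max_new_tokens(max_num_tokens: usize, prompt_len: usize, max_new_tokens: u32) -> u32 {
        let remaining = max_num_tokens.saturating_sub(prompt_len);
        (max_new_tokens as usize).min(remaining) as u32
    }

    fn main() {
        // 4096-token engine budget with a 4000-token prompt: only 96 new tokens fit.
        assert_eq!(checked_max_new_tokens(4096, 4000, 512), 96);
        // Plenty of headroom: the caller's value is kept as-is.
        assert_eq!(checked_max_new_tokens(4096, 128, 512), 512);
        println!("ok");
    }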