diff --git a/.devcontainer/Dockerfile_trtllm b/.devcontainer/Dockerfile_trtllm index 21b7114ce03..239a7bf8c2c 100644 --- a/.devcontainer/Dockerfile_trtllm +++ b/.devcontainer/Dockerfile_trtllm @@ -72,4 +72,4 @@ RUN cargo install cargo-chef COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi -ENV MPI_HOME=/usr/local/mpi \ No newline at end of file +ENV MPI_HOME=/usr/local/mpi diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index 0a0f6e6bf70..d9c1aa15e6b 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -44,7 +44,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf } let mut config = cmake::Config::new("."); - config.uses_cxx11() + config + .uses_cxx11() .generator("Ninja") .profile(match is_debug { true => "Debug", @@ -57,12 +58,12 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path); - // Allow to override which Python to use ... - if let Some(python3) = option_env!("Python3_EXECUTABLE") { - config.define("Python3_EXECUTABLE", python3); - } + // Allow to override which Python to use ... + if let Some(python3) = option_env!("Python3_EXECUTABLE") { + config.define("Python3_EXECUTABLE", python3); + } - config.build(); + config.build(); // Additional transitive CMake dependencies let deps_folder = out_dir.join("build").join("_deps"); diff --git a/backends/trtllm/csrc/backend.hpp b/backends/trtllm/csrc/backend.hpp index f49c437a2b2..40b44a842b3 100644 --- a/backends/trtllm/csrc/backend.hpp +++ b/backends/trtllm/csrc/backend.hpp @@ -228,4 +228,4 @@ struct fmt::formatter : f } }; -#endif \ No newline at end of file +#endif diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp index de2333afe37..d0342d4bb38 100644 --- a/backends/trtllm/csrc/ffi.hpp +++ b/backends/trtllm/csrc/ffi.hpp @@ -159,4 +159,4 @@ namespace huggingface::tgi::backends::trtllm { ); } } -#endif \ No newline at end of file +#endif diff --git a/backends/trtllm/csrc/hardware.hpp b/backends/trtllm/csrc/hardware.hpp index 8e5fa696dbb..abfb4afd51d 100644 --- a/backends/trtllm/csrc/hardware.hpp +++ b/backends/trtllm/csrc/hardware.hpp @@ -78,4 +78,4 @@ namespace huggingface::tgi::hardware::cuda { [[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); } }; } -#endif \ No newline at end of file +#endif diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 3addd95f96f..969046d184b 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -133,13 +133,16 @@ fn executor_status_looper( Ok(decoded_token) => { post_process_decoded_token(&tokenizer, ctx, decoded_token) } - Err(err) => Err(err) + Err(err) => Err(err), }; // Attempt to send back the response to the client if let Err(_) = ctx.streamer.send(response) { // Client has dropped, remove from tracked requests - debug!("Client dropped - removing request {} from tracked requests", step.request_id); + debug!( + "Client dropped - removing request {} from tracked requests", + step.request_id + ); backend.as_mut().cancel(step.request_id); let _ = in_flights.remove(&step.request_id); } @@ -160,11 +163,14 @@ fn executor_status_looper( } } -fn post_process_decoded_token(tokenizer: &Tokenizer, ctx: &mut GenerationContext, decoded_token: DecodedToken) -> InferResult { +fn post_process_decoded_token( + tokenizer: 
&Tokenizer, + ctx: &mut GenerationContext, + decoded_token: DecodedToken, +) -> InferResult { match tokenizer.decode(&[decoded_token.id], false) { Ok(text) => { - let is_special = - tokenizer.get_added_vocabulary().is_special_token(&text); + let is_special = tokenizer.get_added_vocabulary().is_special_token(&text); let token = Token { id: decoded_token.id, text, @@ -186,7 +192,7 @@ fn post_process_decoded_token(tokenizer: &Tokenizer, ctx: &mut GenerationContext let generated_text = GeneratedText { text: text.unwrap(), generated_tokens: ctx.tokens.len() as u32, - finish_reason: FinishReason::EndOfSequenceToken, // TODO : Map FinishReason + finish_reason: FinishReason::EndOfSequenceToken, // TODO : Map FinishReason seed: None, }; @@ -248,7 +254,6 @@ unsafe impl Send for TensorRtLlmBackendImpl {} pub struct TensorRtLlmBackendV2(UnboundedSender); - impl TensorRtLlmBackendV2 { pub fn new + Send, PP: AsRef + Send>( tokenizer: Tokenizer, @@ -268,12 +273,7 @@ impl TensorRtLlmBackendV2 { // Executor looper is responsible for scheduling and pulling requests state at regular interval spawn_blocking(move || { - executor_status_looper( - max_inflight_requests, - tokenizer, - backend, - executor_receiver, - ) + executor_status_looper(max_inflight_requests, tokenizer, backend, executor_receiver) }); Ok(TensorRtLlmBackendV2(executor_sender)) diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index 9c76bafa662..8b137891791 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -1,345 +1 @@ -use std::path::{Path, PathBuf}; -use clap::Parser; -use hf_hub::api::tokio::{Api, ApiBuilder}; -use hf_hub::{Cache, Repo, RepoType}; -use tracing::info; - -use text_generation_backends_trtllm::errors::TensorRtLlmBackendError; -use text_generation_backends_trtllm::TensorRtLlmBackendV2; -use text_generation_router::usage_stats::UsageStatsLevel; -use text_generation_router::{server, HubTokenizerConfig, Tokenizer}; -use text_generation_router::server::{get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer}; - -/// App Configuration -#[derive(Parser, Debug)] -#[clap(author, version, about, long_about = None)] -struct Args { - #[clap(default_value = "128", long, env)] - max_concurrent_requests: usize, - #[clap(default_value = "2", long, env)] - max_best_of: usize, - #[clap(default_value = "4", long, env)] - max_stop_sequences: usize, - #[clap(default_value = "5", long, env)] - max_top_n_tokens: u32, - #[clap(default_value = "1024", long, env)] - max_input_tokens: usize, - #[clap(default_value = "2048", long, env)] - max_total_tokens: usize, - #[clap(default_value = "4096", long, env)] - max_batch_prefill_tokens: u32, - #[clap(long, env)] - max_batch_total_tokens: Option, - #[clap(default_value = "0.0.0.0", long, env)] - hostname: String, - #[clap(default_value = "3000", long, short, env)] - port: u16, - #[clap(long, env, required = true)] - tokenizer_name: String, - #[clap(long, env)] - tokenizer_config_path: Option, - #[clap(long, env)] - revision: Option, - #[clap(long, env)] - model_id: String, - #[clap(default_value = "2", long, env)] - validation_workers: usize, - #[clap(long, env)] - json_output: bool, - #[clap(long, env)] - otlp_endpoint: Option, - #[clap(default_value = "text-generation-inference.router", long, env)] - otlp_service_name: String, - #[clap(long, env)] - cors_allow_origin: Option>, - #[clap(default_value = "4", long, env)] - max_client_batch_size: usize, - #[clap(long, env)] - auth_token: Option, - #[clap(long, env, help = "Path to the TensorRT-LLM 
Orchestrator worker")] - executor_worker: PathBuf, - #[clap(default_value = "on", long, env)] - usage_stats: UsageStatsLevel, - #[clap(default_value = "2000000", long, env)] - payload_limit: usize, -} - -async fn get_tokenizer( - tokenizer_name: &str, - tokenizer_config_path: Option<&str>, - revision: Option<&str>, -) -> Option { - // Parse Huggingface hub token - let authorization_token = std::env::var("HF_TOKEN") - .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) - .ok(); - - // Tokenizer instance - let local_path = Path::new(tokenizer_name); - - // Shared API builder initialization - let api_builder = || { - let mut builder = ApiBuilder::new() - .with_progress(false) - .with_token(authorization_token); - - if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") { - builder = builder.with_cache_dir(cache_dir.into()); - } - - builder - }; - - // Decide if we need to use the API based on the revision and local path - let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir(); - - // Initialize API if needed - #[derive(Clone)] - enum Type { - Api(Api), - Cache(Cache), - None, - } - let api = if use_api { - if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) { - let cache = std::env::var("HUGGINGFACE_HUB_CACHE") - .map_err(|_| ()) - .map(|cache_dir| Cache::new(cache_dir.into())) - .unwrap_or_else(|_| Cache::default()); - tracing::warn!("Offline mode active using cache defaults"); - Type::Cache(cache) - } else { - tracing::info!("Using the Hugging Face API"); - match api_builder().build() { - Ok(api) => Type::Api(api), - Err(_) => { - tracing::warn!("Unable to build the Hugging Face API"); - Type::None - } - } - } - } else { - Type::None - }; - - // Load tokenizer and model info - let ( - config_filename, - _tokenizer_config_filename, - _preprocessor_config_filename, - _processor_config_filename, - _model_info - ) = match api { - Type::None => ( - Some(local_path.join("config.json")), - Some(local_path.join("tokenizer_config.json")), - Some(local_path.join("preprocessor_config.json")), - Some(local_path.join("processor_config.json")), - None - ), - Type::Api(api) => { - let api_repo = api.repo(Repo::with_revision( - tokenizer_name.to_string(), - RepoType::Model, - revision.unwrap_or_else(|| "main").to_string(), - )); - - - let config_filename = api_repo.get("config.json").await.ok(); - let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok(); - let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok(); - let processor_config_filename = api_repo.get("processor_config.json").await.ok(); - - let model_info = if let Some(model_info) = get_hub_model_info(&api_repo).await { - Some(model_info) - } else { - tracing::warn!("Could not retrieve model info from the Hugging Face hub."); - None - }; - ( - config_filename, - tokenizer_config_filename, - preprocessor_config_filename, - processor_config_filename, - model_info, - ) - } - Type::Cache(cache) => { - let repo = cache.repo(Repo::with_revision( - tokenizer_name.to_string(), - RepoType::Model, - revision.clone().unwrap_or_else(|| "main").to_string(), - )); - ( - repo.get("config.json"), - repo.get("tokenizer_config.json"), - repo.get("preprocessor_config.json"), - repo.get("processor_config.json"), - None - ) - } - }; - - // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. 
- // let tokenizer_config: Option = if let Some(filename) = tokenizer_config_path - // { - // HubTokenizerConfig::from_file(filename) - // } else { - // tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) - // }; - - // let tokenizer_config = tokenizer_config.unwrap_or_else(|| { - // tracing::warn!("Could not find tokenizer config locally and no API specified"); - // HubTokenizerConfig::default() - // }); - - let tokenizer: Tokenizer = { - use pyo3::prelude::*; - pyo3::Python::with_gil(|py| -> PyResult<()> { - py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), false)?; - Ok(()) - }) - .inspect_err(|err| { - tracing::error!("Failed to import python tokenizer {err}"); - }) - .or_else(|err| { - let out = legacy_tokenizer_handle(config_filename.as_ref()); - out.ok_or(err) - }) - .expect("We cannot load a tokenizer"); - let filename = "out/tokenizer.json"; - if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) { - Tokenizer::Rust(tok) - } else { - Tokenizer::Python { - tokenizer_name: tokenizer_name.to_string(), - revision: revision.map(|revision| revision.to_string()), - trust_remote_code: false, - } - } - }; - - Some(tokenizer) -} - -#[tokio::main] -async fn main() -> Result<(), TensorRtLlmBackendError> { - // Get args - let args = Args::parse(); - // Pattern match configuration - let Args { - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_top_n_tokens, - max_input_tokens, - max_total_tokens, - max_batch_prefill_tokens, - max_batch_total_tokens, - hostname, - port, - tokenizer_name, - tokenizer_config_path, - revision, - model_id, - validation_workers, - json_output, - otlp_endpoint, - otlp_service_name, - cors_allow_origin, - max_client_batch_size, - auth_token, - executor_worker, - usage_stats, - payload_limit, - } = args; - - // Launch Tokio runtime - text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output); - - // Validate args - if max_input_tokens >= max_total_tokens { - return Err(TensorRtLlmBackendError::ArgumentValidation( - "`max_input_tokens` must be < `max_total_tokens`".to_string(), - )); - } - if max_input_tokens as u32 > max_batch_prefill_tokens { - return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}"))); - } - - if validation_workers == 0 { - return Err(TensorRtLlmBackendError::ArgumentValidation( - "`validation_workers` must be > 0".to_string(), - )); - } - - if let Some(ref max_batch_total_tokens) = max_batch_total_tokens { - if max_batch_prefill_tokens > *max_batch_total_tokens { - return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); - } - if max_total_tokens as u32 > *max_batch_total_tokens { - return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}"))); - } - } - - if !executor_worker.exists() { - return Err(TensorRtLlmBackendError::ArgumentValidation(format!( - "`executor_work` specified path doesn't exists: {}", - executor_worker.display() - ))); - } - - // Create the backend - match get_tokenizer( - &tokenizer_name, - tokenizer_config_path.as_deref(), - revision.as_deref(), - ) - .await - .expect("Failed to retrieve tokenizer implementation") { - Tokenizer::Python { .. 
} => { - Err(TensorRtLlmBackendError::Tokenizer("Failed to retrieve Rust based tokenizer".to_string())) - } - Tokenizer::Rust(tokenizer) => { - info!("Successfully retrieved tokenizer {}", &tokenizer_name); - let backend = TensorRtLlmBackendV2::new( - tokenizer, - model_id, - executor_worker, - max_concurrent_requests, - )?; - - info!("Successfully created backend"); - - // Run server - server::run( - backend, - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_top_n_tokens, - max_input_tokens, - max_total_tokens, - validation_workers, - auth_token, - tokenizer_name, - tokenizer_config_path, - revision, - false, - hostname, - port, - cors_allow_origin, - false, - None, - None, - true, - max_client_batch_size, - usage_stats, - payload_limit, - ).await?; - Ok(()) - } - } - -} diff --git a/backends/trtllm/tests/test_backend.cpp b/backends/trtllm/tests/test_backend.cpp index ae097405bc4..14d92b75434 100644 --- a/backends/trtllm/tests/test_backend.cpp +++ b/backends/trtllm/tests/test_backend.cpp @@ -149,4 +149,4 @@ TEST_CASE("sampling_params_t to tle::SamplingConfig", "[backend_t]") REQUIRE(config.getTemperature().has_value()); REQUIRE_THAT(*config.getTemperature(), Catch::Matchers::WithinAbs(params.temperature, 1e-6f)); -} \ No newline at end of file +} diff --git a/backends/trtllm/tests/test_hardware.cpp b/backends/trtllm/tests/test_hardware.cpp index 4cb7b562087..e14f1f357f4 100644 --- a/backends/trtllm/tests/test_hardware.cpp +++ b/backends/trtllm/tests/test_hardware.cpp @@ -79,4 +79,4 @@ TEST_CASE("is_at_least") { REQUIRE(HOPPER_CAPABILITIES.is_at_least(AMPERE)); REQUIRE(HOPPER_CAPABILITIES.is_at_least(ADA_LOVELACE)); REQUIRE(HOPPER_CAPABILITIES.is_at_least(HOPPER)); -} \ No newline at end of file +} diff --git a/docs/source/backends/trtllm.md b/docs/source/backends/trtllm.md index 8eb37180c7d..be6416b15e5 100644 --- a/docs/source/backends/trtllm.md +++ b/docs/source/backends/trtllm.md @@ -17,7 +17,7 @@ supported. You can use [Optimum-NVIDIA](https://github.com/huggingface/optimum-nvidia) to compile engines for the models you want to use. -```bash +```bash MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" # Install huggingface_cli @@ -32,7 +32,7 @@ mkdir -p /tmp/models/$MODEL_NAME # Create a directory to store the compiled engine mkdir -p /tmp/engines/$MODEL_NAME -# Download the model +# Download the model HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME # Compile the engine using Optimum-NVIDIA @@ -69,7 +69,7 @@ docker run \ -e MODEL=$MODEL_NAME \ -e PORT=3000 \ -e HF_TOKEN='hf_XXX' \ - -v /tmp/engines/$MODEL_NAME:/data \ + -v /tmp/engines/$MODEL_NAME:/data \ ghcr.io/huggingface/text-generation-inference:latest-trtllm \ --executor-worker executorWorker \ --model-id /data/$MODEL_NAME @@ -78,4 +78,4 @@ docker run \ ## Development To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) located in -`.devcontainer` directory. \ No newline at end of file +`.devcontainer` directory. diff --git a/docs/source/multi_backend_support.md b/docs/source/multi_backend_support.md index 5899e4b77d4..c4df15bc2ca 100644 --- a/docs/source/multi_backend_support.md +++ b/docs/source/multi_backend_support.md @@ -1,13 +1,13 @@ # Multi-backend support TGI (Text Generation Inference) offers flexibility by supporting multiple backends for serving large language models (LLMs). 
-With multi-backend support, you can choose the backend that best suits your needs, -whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with +With multi-backend support, you can choose the backend that best suits your needs, +whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with TGI remains consistent across backends, allowing you to switch between them seamlessly. **Supported backends:** -* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option +* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option within TGI. Developed in-house, it boasts numerous optimizations and is used in production by various projects, including those by Hugging Face. -* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference. - It utilizes specialized optimizations and custom kernels for enhanced performance. - However, it requires a model-specific compilation step for each GPU architecture. \ No newline at end of file +* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference. + It utilizes specialized optimizations and custom kernels for enhanced performance. + However, it requires a model-specific compilation step for each GPU architecture. diff --git a/server/text_generation_server/models/mllama_causal_lm.py b/server/text_generation_server/models/mllama_causal_lm.py index f212312f2bb..3a726f9f4e2 100644 --- a/server/text_generation_server/models/mllama_causal_lm.py +++ b/server/text_generation_server/models/mllama_causal_lm.py @@ -161,8 +161,8 @@ def from_pb_processor( dtype: torch.dtype, device: torch.device, ) -> "VlmCausalLMBatch": - batch_tokenized_inputs, image_inputs, _video_inputs = cls.batch_tokenized_inputs( - pb.requests, tokenizer, processor, config + batch_tokenized_inputs, image_inputs, _video_inputs = ( + cls.batch_tokenized_inputs(pb.requests, tokenizer, processor, config) ) batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) # XXX: <|image|> token is actually out of bounds and bugs out the logit processors.
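
A note on the `Python3_EXECUTABLE` override kept in `backends/trtllm/build.rs` above: `option_env!` expands to an `Option<&'static str>` while the code containing it is being compiled, whereas `std::env::var` is a plain runtime lookup. The standalone sketch below only illustrates that difference; it is not part of the patch, and the printed messages are hypothetical.

```rust
fn main() {
    // Compile-time lookup: Option<&'static str>, resolved while this code is compiled.
    if let Some(python3) = option_env!("Python3_EXECUTABLE") {
        println!("CMake would be configured with -DPython3_EXECUTABLE={python3}");
    } else {
        println!("No compile-time override; CMake would fall back to its own Python discovery");
    }

    // Runtime lookup, shown only for contrast with the macro above.
    match std::env::var("Python3_EXECUTABLE") {
        Ok(python3) => println!("Runtime value: {python3}"),
        Err(_) => println!("Python3_EXECUTABLE not set at runtime"),
    }
}
```

Under that assumption, exporting the variable on the cargo invocation, for example `Python3_EXECUTABLE=/usr/bin/python3 cargo build ...` (the exact invocation depends on the workspace), should be enough for the CMake configure step in `build.rs` to pick up the override.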
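
The `executor_status_looper` hunk above reformats the branch that handles a disconnected client; the underlying pattern is that a failed `send` on the per-request response channel means the receiving half was dropped, so the request can be cancelled and forgotten. The sketch below shows only that pattern, using `std::sync::mpsc` and simplified types; `forward_step` and the `String` payload are hypothetical stand-ins, and the real code additionally calls `backend.cancel(request_id)` on the TensorRT-LLM executor.

```rust
use std::collections::HashMap;
use std::sync::mpsc::{channel, Sender};

// A send error on the channel means the receiver no longer exists,
// so the request is removed from the in-flight map.
fn forward_step(
    request_id: u64,
    response: String,
    in_flights: &mut HashMap<u64, Sender<String>>,
) {
    let client_dropped = match in_flights.get(&request_id) {
        Some(streamer) => streamer.send(response).is_err(),
        None => false,
    };
    if client_dropped {
        // In the real looper this is also where the executor request is cancelled.
        in_flights.remove(&request_id);
    }
}

fn main() {
    let mut in_flights: HashMap<u64, Sender<String>> = HashMap::new();
    let (tx, rx) = channel();
    in_flights.insert(1, tx);

    drop(rx); // simulate the client going away
    forward_step(1, "token".to_string(), &mut in_flights);
    assert!(in_flights.is_empty());
}
```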
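
The `hardware.hpp` and `test_hardware.cpp` hunks only add missing trailing newlines, but the `is_at_least` helper they touch reduces to a lexicographic comparison on the `(major, minor)` compute capability pair. The sketch below restates that semantics in Rust purely for illustration (the real implementation is the C++ in `csrc/hardware.hpp`); the struct name and constants are simplified stand-ins, using the usual CUDA values 8.0 for Ampere, 8.9 for Ada Lovelace and 9.0 for Hopper.

```rust
// Illustrative only: "at least X" on CUDA compute capabilities is a
// lexicographic comparison of the (major, minor) pair.
#[derive(Clone, Copy, PartialEq, Eq)]
struct ComputeCapability {
    major: u32,
    minor: u32,
}

impl ComputeCapability {
    fn is_at_least(self, other: ComputeCapability) -> bool {
        (self.major, self.minor) >= (other.major, other.minor)
    }
}

const AMPERE: ComputeCapability = ComputeCapability { major: 8, minor: 0 };
const ADA_LOVELACE: ComputeCapability = ComputeCapability { major: 8, minor: 9 };
const HOPPER: ComputeCapability = ComputeCapability { major: 9, minor: 0 };

fn main() {
    // Mirrors the assertions exercised in test_hardware.cpp.
    assert!(HOPPER.is_at_least(AMPERE));
    assert!(HOPPER.is_at_least(ADA_LOVELACE));
    assert!(HOPPER.is_at_least(HOPPER));
    println!("capability checks passed");
}
```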