diff --git a/.devcontainer/Dockerfile_trtllm b/.devcontainer/Dockerfile_trtllm index 21b7114ce03..239a7bf8c2c 100644 --- a/.devcontainer/Dockerfile_trtllm +++ b/.devcontainer/Dockerfile_trtllm @@ -72,4 +72,4 @@ RUN cargo install cargo-chef COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi -ENV MPI_HOME=/usr/local/mpi \ No newline at end of file +ENV MPI_HOME=/usr/local/mpi diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index 0a0f6e6bf70..d9c1aa15e6b 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -44,7 +44,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf } let mut config = cmake::Config::new("."); - config.uses_cxx11() + config + .uses_cxx11() .generator("Ninja") .profile(match is_debug { true => "Debug", @@ -57,12 +58,12 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path); - // Allow to override which Python to use ... - if let Some(python3) = option_env!("Python3_EXECUTABLE") { - config.define("Python3_EXECUTABLE", python3); - } + // Allow to override which Python to use ... + if let Some(python3) = option_env!("Python3_EXECUTABLE") { + config.define("Python3_EXECUTABLE", python3); + } - config.build(); + config.build(); // Additional transitive CMake dependencies let deps_folder = out_dir.join("build").join("_deps"); diff --git a/backends/trtllm/csrc/backend.hpp b/backends/trtllm/csrc/backend.hpp index f49c437a2b2..40b44a842b3 100644 --- a/backends/trtllm/csrc/backend.hpp +++ b/backends/trtllm/csrc/backend.hpp @@ -228,4 +228,4 @@ struct fmt::formatter : f } }; -#endif \ No newline at end of file +#endif diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp index de2333afe37..d0342d4bb38 100644 --- a/backends/trtllm/csrc/ffi.hpp +++ b/backends/trtllm/csrc/ffi.hpp @@ -159,4 +159,4 @@ namespace huggingface::tgi::backends::trtllm { ); } } -#endif \ No newline at end of file +#endif diff --git a/backends/trtllm/csrc/hardware.hpp b/backends/trtllm/csrc/hardware.hpp index 8e5fa696dbb..abfb4afd51d 100644 --- a/backends/trtllm/csrc/hardware.hpp +++ b/backends/trtllm/csrc/hardware.hpp @@ -78,4 +78,4 @@ namespace huggingface::tgi::hardware::cuda { [[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); } }; } -#endif \ No newline at end of file +#endif diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 3addd95f96f..969046d184b 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -133,13 +133,16 @@ fn executor_status_looper( Ok(decoded_token) => { post_process_decoded_token(&tokenizer, ctx, decoded_token) } - Err(err) => Err(err) + Err(err) => Err(err), }; // Attempt to send back the response to the client if let Err(_) = ctx.streamer.send(response) { // Client has dropped, remove from tracked requests - debug!("Client dropped - removing request {} from tracked requests", step.request_id); + debug!( + "Client dropped - removing request {} from tracked requests", + step.request_id + ); backend.as_mut().cancel(step.request_id); let _ = in_flights.remove(&step.request_id); } @@ -160,11 +163,14 @@ fn executor_status_looper( } } -fn post_process_decoded_token(tokenizer: &Tokenizer, ctx: &mut GenerationContext, decoded_token: DecodedToken) -> InferResult { +fn post_process_decoded_token( + tokenizer: 
&Tokenizer, + ctx: &mut GenerationContext, + decoded_token: DecodedToken, +) -> InferResult { match tokenizer.decode(&[decoded_token.id], false) { Ok(text) => { - let is_special = - tokenizer.get_added_vocabulary().is_special_token(&text); + let is_special = tokenizer.get_added_vocabulary().is_special_token(&text); let token = Token { id: decoded_token.id, text, @@ -186,7 +192,7 @@ fn post_process_decoded_token(tokenizer: &Tokenizer, ctx: &mut GenerationContext let generated_text = GeneratedText { text: text.unwrap(), generated_tokens: ctx.tokens.len() as u32, - finish_reason: FinishReason::EndOfSequenceToken, // TODO : Map FinishReason + finish_reason: FinishReason::EndOfSequenceToken, // TODO : Map FinishReason seed: None, }; @@ -248,7 +254,6 @@ unsafe impl Send for TensorRtLlmBackendImpl {} pub struct TensorRtLlmBackendV2(UnboundedSender); - impl TensorRtLlmBackendV2 { pub fn new + Send, PP: AsRef + Send>( tokenizer: Tokenizer, @@ -268,12 +273,7 @@ impl TensorRtLlmBackendV2 { // Executor looper is responsible for scheduling and pulling requests state at regular interval spawn_blocking(move || { - executor_status_looper( - max_inflight_requests, - tokenizer, - backend, - executor_receiver, - ) + executor_status_looper(max_inflight_requests, tokenizer, backend, executor_receiver) }); Ok(TensorRtLlmBackendV2(executor_sender)) diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index 9c76bafa662..8b137891791 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -1,345 +1 @@ -use std::path::{Path, PathBuf}; -use clap::Parser; -use hf_hub::api::tokio::{Api, ApiBuilder}; -use hf_hub::{Cache, Repo, RepoType}; -use tracing::info; - -use text_generation_backends_trtllm::errors::TensorRtLlmBackendError; -use text_generation_backends_trtllm::TensorRtLlmBackendV2; -use text_generation_router::usage_stats::UsageStatsLevel; -use text_generation_router::{server, HubTokenizerConfig, Tokenizer}; -use text_generation_router::server::{get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer}; - -/// App Configuration -#[derive(Parser, Debug)] -#[clap(author, version, about, long_about = None)] -struct Args { - #[clap(default_value = "128", long, env)] - max_concurrent_requests: usize, - #[clap(default_value = "2", long, env)] - max_best_of: usize, - #[clap(default_value = "4", long, env)] - max_stop_sequences: usize, - #[clap(default_value = "5", long, env)] - max_top_n_tokens: u32, - #[clap(default_value = "1024", long, env)] - max_input_tokens: usize, - #[clap(default_value = "2048", long, env)] - max_total_tokens: usize, - #[clap(default_value = "4096", long, env)] - max_batch_prefill_tokens: u32, - #[clap(long, env)] - max_batch_total_tokens: Option, - #[clap(default_value = "0.0.0.0", long, env)] - hostname: String, - #[clap(default_value = "3000", long, short, env)] - port: u16, - #[clap(long, env, required = true)] - tokenizer_name: String, - #[clap(long, env)] - tokenizer_config_path: Option, - #[clap(long, env)] - revision: Option, - #[clap(long, env)] - model_id: String, - #[clap(default_value = "2", long, env)] - validation_workers: usize, - #[clap(long, env)] - json_output: bool, - #[clap(long, env)] - otlp_endpoint: Option, - #[clap(default_value = "text-generation-inference.router", long, env)] - otlp_service_name: String, - #[clap(long, env)] - cors_allow_origin: Option>, - #[clap(default_value = "4", long, env)] - max_client_batch_size: usize, - #[clap(long, env)] - auth_token: Option, - #[clap(long, env, help = "Path to the TensorRT-LLM 
Orchestrator worker")] - executor_worker: PathBuf, - #[clap(default_value = "on", long, env)] - usage_stats: UsageStatsLevel, - #[clap(default_value = "2000000", long, env)] - payload_limit: usize, -} - -async fn get_tokenizer( - tokenizer_name: &str, - tokenizer_config_path: Option<&str>, - revision: Option<&str>, -) -> Option { - // Parse Huggingface hub token - let authorization_token = std::env::var("HF_TOKEN") - .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) - .ok(); - - // Tokenizer instance - let local_path = Path::new(tokenizer_name); - - // Shared API builder initialization - let api_builder = || { - let mut builder = ApiBuilder::new() - .with_progress(false) - .with_token(authorization_token); - - if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") { - builder = builder.with_cache_dir(cache_dir.into()); - } - - builder - }; - - // Decide if we need to use the API based on the revision and local path - let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir(); - - // Initialize API if needed - #[derive(Clone)] - enum Type { - Api(Api), - Cache(Cache), - None, - } - let api = if use_api { - if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) { - let cache = std::env::var("HUGGINGFACE_HUB_CACHE") - .map_err(|_| ()) - .map(|cache_dir| Cache::new(cache_dir.into())) - .unwrap_or_else(|_| Cache::default()); - tracing::warn!("Offline mode active using cache defaults"); - Type::Cache(cache) - } else { - tracing::info!("Using the Hugging Face API"); - match api_builder().build() { - Ok(api) => Type::Api(api), - Err(_) => { - tracing::warn!("Unable to build the Hugging Face API"); - Type::None - } - } - } - } else { - Type::None - }; - - // Load tokenizer and model info - let ( - config_filename, - _tokenizer_config_filename, - _preprocessor_config_filename, - _processor_config_filename, - _model_info - ) = match api { - Type::None => ( - Some(local_path.join("config.json")), - Some(local_path.join("tokenizer_config.json")), - Some(local_path.join("preprocessor_config.json")), - Some(local_path.join("processor_config.json")), - None - ), - Type::Api(api) => { - let api_repo = api.repo(Repo::with_revision( - tokenizer_name.to_string(), - RepoType::Model, - revision.unwrap_or_else(|| "main").to_string(), - )); - - - let config_filename = api_repo.get("config.json").await.ok(); - let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok(); - let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok(); - let processor_config_filename = api_repo.get("processor_config.json").await.ok(); - - let model_info = if let Some(model_info) = get_hub_model_info(&api_repo).await { - Some(model_info) - } else { - tracing::warn!("Could not retrieve model info from the Hugging Face hub."); - None - }; - ( - config_filename, - tokenizer_config_filename, - preprocessor_config_filename, - processor_config_filename, - model_info, - ) - } - Type::Cache(cache) => { - let repo = cache.repo(Repo::with_revision( - tokenizer_name.to_string(), - RepoType::Model, - revision.clone().unwrap_or_else(|| "main").to_string(), - )); - ( - repo.get("config.json"), - repo.get("tokenizer_config.json"), - repo.get("preprocessor_config.json"), - repo.get("processor_config.json"), - None - ) - } - }; - - // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. 
- // let tokenizer_config: Option = if let Some(filename) = tokenizer_config_path - // { - // HubTokenizerConfig::from_file(filename) - // } else { - // tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) - // }; - - // let tokenizer_config = tokenizer_config.unwrap_or_else(|| { - // tracing::warn!("Could not find tokenizer config locally and no API specified"); - // HubTokenizerConfig::default() - // }); - - let tokenizer: Tokenizer = { - use pyo3::prelude::*; - pyo3::Python::with_gil(|py| -> PyResult<()> { - py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), false)?; - Ok(()) - }) - .inspect_err(|err| { - tracing::error!("Failed to import python tokenizer {err}"); - }) - .or_else(|err| { - let out = legacy_tokenizer_handle(config_filename.as_ref()); - out.ok_or(err) - }) - .expect("We cannot load a tokenizer"); - let filename = "out/tokenizer.json"; - if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) { - Tokenizer::Rust(tok) - } else { - Tokenizer::Python { - tokenizer_name: tokenizer_name.to_string(), - revision: revision.map(|revision| revision.to_string()), - trust_remote_code: false, - } - } - }; - - Some(tokenizer) -} - -#[tokio::main] -async fn main() -> Result<(), TensorRtLlmBackendError> { - // Get args - let args = Args::parse(); - // Pattern match configuration - let Args { - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_top_n_tokens, - max_input_tokens, - max_total_tokens, - max_batch_prefill_tokens, - max_batch_total_tokens, - hostname, - port, - tokenizer_name, - tokenizer_config_path, - revision, - model_id, - validation_workers, - json_output, - otlp_endpoint, - otlp_service_name, - cors_allow_origin, - max_client_batch_size, - auth_token, - executor_worker, - usage_stats, - payload_limit, - } = args; - - // Launch Tokio runtime - text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output); - - // Validate args - if max_input_tokens >= max_total_tokens { - return Err(TensorRtLlmBackendError::ArgumentValidation( - "`max_input_tokens` must be < `max_total_tokens`".to_string(), - )); - } - if max_input_tokens as u32 > max_batch_prefill_tokens { - return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}"))); - } - - if validation_workers == 0 { - return Err(TensorRtLlmBackendError::ArgumentValidation( - "`validation_workers` must be > 0".to_string(), - )); - } - - if let Some(ref max_batch_total_tokens) = max_batch_total_tokens { - if max_batch_prefill_tokens > *max_batch_total_tokens { - return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); - } - if max_total_tokens as u32 > *max_batch_total_tokens { - return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}"))); - } - } - - if !executor_worker.exists() { - return Err(TensorRtLlmBackendError::ArgumentValidation(format!( - "`executor_work` specified path doesn't exists: {}", - executor_worker.display() - ))); - } - - // Create the backend - match get_tokenizer( - &tokenizer_name, - tokenizer_config_path.as_deref(), - revision.as_deref(), - ) - .await - .expect("Failed to retrieve tokenizer implementation") { - Tokenizer::Python { .. 
} => { - Err(TensorRtLlmBackendError::Tokenizer("Failed to retrieve Rust based tokenizer".to_string())) - } - Tokenizer::Rust(tokenizer) => { - info!("Successfully retrieved tokenizer {}", &tokenizer_name); - let backend = TensorRtLlmBackendV2::new( - tokenizer, - model_id, - executor_worker, - max_concurrent_requests, - )?; - - info!("Successfully created backend"); - - // Run server - server::run( - backend, - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_top_n_tokens, - max_input_tokens, - max_total_tokens, - validation_workers, - auth_token, - tokenizer_name, - tokenizer_config_path, - revision, - false, - hostname, - port, - cors_allow_origin, - false, - None, - None, - true, - max_client_batch_size, - usage_stats, - payload_limit, - ).await?; - Ok(()) - } - } - -} diff --git a/backends/trtllm/tests/test_backend.cpp b/backends/trtllm/tests/test_backend.cpp index ae097405bc4..14d92b75434 100644 --- a/backends/trtllm/tests/test_backend.cpp +++ b/backends/trtllm/tests/test_backend.cpp @@ -149,4 +149,4 @@ TEST_CASE("sampling_params_t to tle::SamplingConfig", "[backend_t]") REQUIRE(config.getTemperature().has_value()); REQUIRE_THAT(*config.getTemperature(), Catch::Matchers::WithinAbs(params.temperature, 1e-6f)); -} \ No newline at end of file +} diff --git a/backends/trtllm/tests/test_hardware.cpp b/backends/trtllm/tests/test_hardware.cpp index 4cb7b562087..e14f1f357f4 100644 --- a/backends/trtllm/tests/test_hardware.cpp +++ b/backends/trtllm/tests/test_hardware.cpp @@ -79,4 +79,4 @@ TEST_CASE("is_at_least") { REQUIRE(HOPPER_CAPABILITIES.is_at_least(AMPERE)); REQUIRE(HOPPER_CAPABILITIES.is_at_least(ADA_LOVELACE)); REQUIRE(HOPPER_CAPABILITIES.is_at_least(HOPPER)); -} \ No newline at end of file +} diff --git a/docs/source/backends/trtllm.md b/docs/source/backends/trtllm.md index 8eb37180c7d..be6416b15e5 100644 --- a/docs/source/backends/trtllm.md +++ b/docs/source/backends/trtllm.md @@ -17,7 +17,7 @@ supported. You can use [Optimum-NVIDIA](https://github.com/huggingface/optimum-nvidia) to compile engines for the models you want to use. -```bash +```bash MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" # Install huggingface_cli @@ -32,7 +32,7 @@ mkdir -p /tmp/models/$MODEL_NAME # Create a directory to store the compiled engine mkdir -p /tmp/engines/$MODEL_NAME -# Download the model +# Download the model HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME # Compile the engine using Optimum-NVIDIA @@ -69,7 +69,7 @@ docker run \ -e MODEL=$MODEL_NAME \ -e PORT=3000 \ -e HF_TOKEN='hf_XXX' \ - -v /tmp/engines/$MODEL_NAME:/data \ + -v /tmp/engines/$MODEL_NAME:/data \ ghcr.io/huggingface/text-generation-inference:latest-trtllm \ --executor-worker executorWorker \ --model-id /data/$MODEL_NAME @@ -78,4 +78,4 @@ docker run \ ## Development To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) located in -`.devcontainer` directory. \ No newline at end of file +`.devcontainer` directory. diff --git a/docs/source/multi_backend_support.md b/docs/source/multi_backend_support.md index 5899e4b77d4..c4df15bc2ca 100644 --- a/docs/source/multi_backend_support.md +++ b/docs/source/multi_backend_support.md @@ -1,13 +1,13 @@ # Multi-backend support TGI (Text Generation Inference) offers flexibility by supporting multiple backends for serving large language models (LLMs). 
-With multi-backend support, you can choose the backend that best suits your needs, -whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with +With multi-backend support, you can choose the backend that best suits your needs, +whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with TGI remains consistent across backends, allowing you to switch between them seamlessly. **Supported backends:** -* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option +* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option within TGI. Developed in-house, it boasts numerous optimizations and is used in production by various projects, including those by Hugging Face. -* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference. - It utilizes specialized optimizations and custom kernels for enhanced performance. - However, it requires a model-specific compilation step for each GPU architecture. \ No newline at end of file +* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference. + It utilizes specialized optimizations and custom kernels for enhanced performance. + However, it requires a model-specific compilation step for each GPU architecture. diff --git a/server/text_generation_server/models/mllama_causal_lm.py b/server/text_generation_server/models/mllama_causal_lm.py index f212312f2bb..3a726f9f4e2 100644 --- a/server/text_generation_server/models/mllama_causal_lm.py +++ b/server/text_generation_server/models/mllama_causal_lm.py @@ -161,8 +161,8 @@ def from_pb_processor( dtype: torch.dtype, device: torch.device, ) -> "VlmCausalLMBatch": - batch_tokenized_inputs, image_inputs, _video_inputs = cls.batch_tokenized_inputs( - pb.requests, tokenizer, processor, config + batch_tokenized_inputs, image_inputs, _video_inputs = ( + cls.batch_tokenized_inputs(pb.requests, tokenizer, processor, config) ) batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) # XXX: <|image|> token is actually out of bounds and bugs out the logit processors.
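
A note on the `Python3_EXECUTABLE` override kept in `backends/trtllm/build.rs` above: `option_env!` expands to an `Option<&'static str>` while the code containing it is being compiled, whereas `std::env::var` is a plain runtime lookup. The standalone sketch below only illustrates that difference; it is not part of the patch, and the printed messages are hypothetical.

```rust
fn main() {
    // Compile-time lookup: Option<&'static str>, resolved while this code is compiled.
    if let Some(python3) = option_env!("Python3_EXECUTABLE") {
        println!("CMake would be configured with -DPython3_EXECUTABLE={python3}");
    } else {
        println!("No compile-time override; CMake would fall back to its own Python discovery");
    }

    // Runtime lookup, shown only for contrast with the macro above.
    match std::env::var("Python3_EXECUTABLE") {
        Ok(python3) => println!("Runtime value: {python3}"),
        Err(_) => println!("Python3_EXECUTABLE not set at runtime"),
    }
}
```

Under that assumption, exporting the variable on the cargo invocation, for example `Python3_EXECUTABLE=/usr/bin/python3 cargo build ...` (the exact invocation depends on the workspace), should be enough for the CMake configure step in `build.rs` to pick up the override.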
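
The `executor_status_looper` hunk above reformats the branch that handles a disconnected client; the underlying pattern is that a failed `send` on the per-request response channel means the receiving half was dropped, so the request can be cancelled and forgotten. The sketch below shows only that pattern, using `std::sync::mpsc` and simplified types; `forward_step` and the `String` payload are hypothetical stand-ins, and the real code additionally calls `backend.cancel(request_id)` on the TensorRT-LLM executor.

```rust
use std::collections::HashMap;
use std::sync::mpsc::{channel, Sender};

// A send error on the channel means the receiver no longer exists,
// so the request is removed from the in-flight map.
fn forward_step(
    request_id: u64,
    response: String,
    in_flights: &mut HashMap<u64, Sender<String>>,
) {
    let client_dropped = match in_flights.get(&request_id) {
        Some(streamer) => streamer.send(response).is_err(),
        None => false,
    };
    if client_dropped {
        // In the real looper this is also where the executor request is cancelled.
        in_flights.remove(&request_id);
    }
}

fn main() {
    let mut in_flights: HashMap<u64, Sender<String>> = HashMap::new();
    let (tx, rx) = channel();
    in_flights.insert(1, tx);

    drop(rx); // simulate the client going away
    forward_step(1, "token".to_string(), &mut in_flights);
    assert!(in_flights.is_empty());
}
```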
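
The `hardware.hpp` and `test_hardware.cpp` hunks only add missing trailing newlines, but the `is_at_least` helper they touch reduces to a lexicographic comparison on the `(major, minor)` compute capability pair. The sketch below restates that semantics in Rust purely for illustration (the real implementation is the C++ in `csrc/hardware.hpp`); the struct name and constants are simplified stand-ins, using the usual CUDA values 8.0 for Ampere, 8.9 for Ada Lovelace and 9.0 for Hopper.

```rust
// Illustrative only: "at least X" on CUDA compute capabilities is a
// lexicographic comparison of the (major, minor) pair.
#[derive(Clone, Copy, PartialEq, Eq)]
struct ComputeCapability {
    major: u32,
    minor: u32,
}

impl ComputeCapability {
    fn is_at_least(self, other: ComputeCapability) -> bool {
        (self.major, self.minor) >= (other.major, other.minor)
    }
}

const AMPERE: ComputeCapability = ComputeCapability { major: 8, minor: 0 };
const ADA_LOVELACE: ComputeCapability = ComputeCapability { major: 8, minor: 9 };
const HOPPER: ComputeCapability = ComputeCapability { major: 9, minor: 0 };

fn main() {
    // Mirrors the assertions exercised in test_hardware.cpp.
    assert!(HOPPER.is_at_least(AMPERE));
    assert!(HOPPER.is_at_least(ADA_LOVELACE));
    assert!(HOPPER.is_at_least(HOPPER));
    println!("capability checks passed");
}
```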