diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/.clang-format b/cpp/tensorrt_llm/cortex.tensorrt-llm/.clang-format
new file mode 100644
index 000000000..dc69733e1
--- /dev/null
+++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/.clang-format
@@ -0,0 +1,83 @@
+# Google C/C++ Code Style settings
+# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+# Author: Kehan Xue, kehan.xue (at) gmail.com
+
+Language: Cpp
+BasedOnStyle: Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: None
+AlignOperands: Align
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: Empty
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: Never # To avoid conflict, set this "Never" and each "if statement" should include brace when coding
+AllowShortLambdasOnASingleLine: Inline
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BreakBeforeBraces: Custom
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: false
+  AfterStruct: false
+  AfterControlStatement: Never
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  BeforeLambdaBody: false
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+BreakBeforeBinaryOperators: None
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+ColumnLimit: 80
+CompactNamespaces: false
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false # Make sure the * or & align on the left
+EmptyLineBeforeAccessModifier: LogicalBlock
+FixNamespaceComments: true
+IncludeBlocks: Preserve
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth: 2
+KeepEmptyLinesAtTheStartOfBlocks: true
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PointerAlignment: Left
+ReflowComments: false
+# SeparateDefinitionBlocks: Always # Only support since clang-format 14
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInCStyleCastParentheses: false
+SpacesInContainerLiterals: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: c++11
+TabWidth: 4
+UseTab: Never
\ No newline at end of file
diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc
index fd5dcbf94..ba407f5d5 100644
--- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc
+++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc
@@ -2,141 +2,169 @@
 #include "models/chat_completion_request.h"
 #include "nlohmann/json.hpp"
-#include "src/models/load_model_request.h"
-#include "tensorrt_llm/runtime/generationInput.h"
-#include "tensorrt_llm/runtime/generationOutput.h"
-#include "tensorrt_llm/runtime/samplingConfig.h"
-#include "utils/tensorrt-llm_utils.h" -#include "json/writer.h" -#include "cpp-tiktoken/encoding.h" //include to use tiktoken +#include #include +#include #include #include #include #include #include -#include #include -#include +#include "cpp-tiktoken/encoding.h" //include to use tiktoken +#include "json/writer.h" +#include "src/models/load_model_request.h" +#include "tensorrt_llm/runtime/generationInput.h" +#include "tensorrt_llm/runtime/generationOutput.h" +#include "tensorrt_llm/runtime/samplingConfig.h" +#include "utils/tensorrt-llm_utils.h" using json = nlohmann::json; using namespace tensorrtllm; namespace { - constexpr const int k200OK = 200; - constexpr const int k400BadRequest = 400; - constexpr const int k409Conflict = 409; - constexpr const int k500InternalServerError = 500; - - // https://nvidia.github.io/TensorRT-LLM/_cpp_gen/runtime.html#generationinput-h - // stopWordsList - // 'im', '_' , 'end', '', '<|im_end|>' - const std::vector kOpenhermesStopWords = {321, 28730, 416, 2, 32000, 3, 4, 5, -1, -1}; - const std::string kOhUserPrompt = "<|im_end|>\n<|im_start|>user\n"; - const std::string kOhAiPrompt = "<|im_end|>\n<|im_start|>assistant\n"; - const std::string kOhSystemPrompt = "<|im_start|>system\n"; - const std::unordered_map kOpenhermesTemplate = {{"<|im_end|>", 32000} , {"<|im_start|>", 32001}}; - - // '[', 'INST', ']', '[INST]', ''[, '/' , 'INST',']', '[/INST]', '' - const std::vector kMistral_V0_3_StopWords - = {29560, 17057, 29561, 3, 29560, 29516, 17057, 29561, 4, 2, 3, 4, 8, 9, 10, -1, -1, -1, -1, -1}; - - enum class MistralTemplate: int32_t { - kBos = 1, - kEos = 2, - kBeginInst = 3, - kEndInst = 4 - }; +constexpr const int k200OK = 200; +constexpr const int k400BadRequest = 400; +constexpr const int k409Conflict = 409; +constexpr const int k500InternalServerError = 500; + +// https://nvidia.github.io/TensorRT-LLM/_cpp_gen/runtime.html#generationinput-h +// stopWordsList +// 'im', '_' , 'end', '', '<|im_end|>' +const std::vector kOpenhermesStopWords = {321, 28730, 416, 2, 32000, + 3, 4, 5, -1, -1}; +const std::string kOhUserPrompt = "<|im_end|>\n<|im_start|>user\n"; +const std::string kOhAiPrompt = "<|im_end|>\n<|im_start|>assistant\n"; +const std::string kOhSystemPrompt = "<|im_start|>system\n"; +const std::unordered_map kOpenhermesTemplate = { + {"<|im_end|>", 32000}, + {"<|im_start|>", 32001}}; + +// '[', 'INST', ']', '[INST]', ''[, '/' , 'INST',']', '[/INST]', '' +const std::vector kMistral_V0_3_StopWords = { + 29560, 17057, 29561, 3, 29560, 29516, 17057, 29561, 4, 2, + 3, 4, 8, 9, 10, -1, -1, -1, -1, -1}; + +enum class MistralTemplate : int32_t { + kBos = 1, + kEos = 2, + kBeginInst = 3, + kEndInst = 4 +}; - enum class Llama3Template: int32_t{ - kBeginOfText = 128000, - kEndOfText = 128001, - kEndOfTurn = 128009, - kStartHeaderId = 128006, - kEndHeaderId = 128007, - kParagraph = 271 - }; +enum class Llama3Template : int32_t { + kBeginOfText = 128000, + kEndOfText = 128001, + kEndOfTurn = 128009, + kStartHeaderId = 128006, + kEndHeaderId = 128007, + kParagraph = 271 +}; - // "<|end_of_text|>", "<|eot_id|>" - const std::vector Llama3StopWords = {128001, 128009, 1, 2}; +// "<|end_of_text|>", "<|eot_id|>" +const std::vector Llama3StopWords = {128001, 128009, 1, 2}; - // TODO(sang) This is fragile, just a temporary solution. Maybe can use a config file or model architect, etc... 
- bool IsOpenhermes(const std::string& s) { - if (s.find("mistral") != std::string::npos || s.find("Mistral") != std::string::npos) { - return false; - } - return true; +// TODO(sang) This is fragile, just a temporary solution. Maybe can use a config file or model architect, etc... +bool IsOpenhermes(const std::string& s) { + if (s.find("mistral") != std::string::npos || + s.find("Mistral") != std::string::npos) { + return false; } - ModelType GetModelType(const std::string& s){ - if (s.find("Llama3") != std::string::npos || s.find("llama3") != std::string::npos) { - return ModelType::kLlama3; - } - else if (s.find("mistral") != std::string::npos || s.find("Mistral") != std::string::npos) - { - return ModelType::kMistral; - } - else{ - return ModelType::kOpenHermes; - } - + return true; +} +ModelType GetModelType(const std::string& s) { + if (s.find("Llama3") != std::string::npos || + s.find("llama3") != std::string::npos) { + return ModelType::kLlama3; + } else if (s.find("mistral") != std::string::npos || + s.find("Mistral") != std::string::npos) { + return ModelType::kMistral; + } else { + return ModelType::kOpenHermes; } } +} // namespace TensorrtllmEngine::~TensorrtllmEngine() {} void RemoveId(std::vector& vec, int id) { vec.erase(std::remove(vec.begin(), vec.end(), id), vec.end()); } -bool HandleMatch(std::string const& rew_text, - std::shared_ptr infer_state, - std::function cb, - ModelType model_type) { +bool HandleMatch(std::string const& rew_text, + std::shared_ptr infer_state, + std::function cb, + ModelType model_type) { if (infer_state->IsComplete(model_type)) { return false; } if (infer_state->stop_word_match_len == 0) { - if ((model_type == ModelType::kOpenHermes && rew_text.find('<') != std::string::npos) || - (model_type != ModelType::kOpenHermes && rew_text.find('[') != std::string::npos)) { - infer_state->stop_word_match_len++; // Move to next state + if ((model_type == ModelType::kOpenHermes && + rew_text.find('<') != std::string::npos) || + (model_type != ModelType::kOpenHermes && + rew_text.find('[') != std::string::npos)) { + infer_state->stop_word_match_len++; // Move to next state return true; } - } else if (rew_text == infer_state->GetSequence(model_type, infer_state->stop_word_match_len)) { - infer_state->stop_word_match_len++; // Move to next state + } else if (rew_text == infer_state->GetSequence( + model_type, infer_state->stop_word_match_len)) { + infer_state->stop_word_match_len++; // Move to next state return true; - } else if (infer_state->stop_word_match_len > 0 && rew_text == infer_state->GetSequence(model_type, 0u)) { - infer_state->stop_word_match_len = 1; // Restart from first match if sequence breaks but matches start + } else if (infer_state->stop_word_match_len > 0 && + rew_text == infer_state->GetSequence(model_type, 0u)) { + infer_state->stop_word_match_len = + 1; // Restart from first match if sequence breaks but matches start return true; } else { infer_state->Reset(); - return false; // Reset to start if sequence breaks + return false; // Reset to start if sequence breaks } return false; } -GenerationInput::TensorPtr TensorrtllmEngine::GetTensorSingleStopWordList(int stopToken) { - std::vector stop_words_tokens = {stopToken, -1, 1, -1}; // Extend with -1 for increased length - return gpt_session->getBufferManager().copyFrom(stop_words_tokens, ITensor::makeShape({1, 2, 2}), MemoryType::kGPU); +GenerationInput::TensorPtr TensorrtllmEngine::GetTensorSingleStopWordList( + int stopToken) { + std::vector stop_words_tokens = { + stopToken, -1, 1, 
-1}; // Extend with -1 for increased length + return gpt_session->getBufferManager().copyFrom( + stop_words_tokens, ITensor::makeShape({1, 2, 2}), MemoryType::kGPU); } GenerationInput::TensorPtr TensorrtllmEngine::GetTensorChatMLStopWordList() { - if(model_type_ == ModelType::kOpenHermes) { - return gpt_session->getBufferManager().copyFrom(kOpenhermesStopWords, ITensor::makeShape({1, 2, static_cast(kOpenhermesStopWords.size()/2)}), MemoryType::kGPU); + if (model_type_ == ModelType::kOpenHermes) { + return gpt_session->getBufferManager().copyFrom( + kOpenhermesStopWords, + ITensor::makeShape( + {1, 2, static_cast(kOpenhermesStopWords.size() / 2)}), + MemoryType::kGPU); } else if (model_type_ == ModelType::kMistral) { - return gpt_session->getBufferManager().copyFrom(kMistral_V0_3_StopWords, ITensor::makeShape({1, 2, static_cast(kMistral_V0_3_StopWords.size()/2)}), MemoryType::kGPU); - } - else{ - return gpt_session->getBufferManager().copyFrom(Llama3StopWords, ITensor::makeShape({1, 2, static_cast(Llama3StopWords.size()/2)}), MemoryType::kGPU); + return gpt_session->getBufferManager().copyFrom( + kMistral_V0_3_StopWords, + ITensor::makeShape( + {1, 2, static_cast(kMistral_V0_3_StopWords.size() / 2)}), + MemoryType::kGPU); + } else { + return gpt_session->getBufferManager().copyFrom( + Llama3StopWords, + ITensor::makeShape( + {1, 2, static_cast(Llama3StopWords.size() / 2)}), + MemoryType::kGPU); } } -GenerationInput TensorrtllmEngine::CreateGenerationInput(std::vector input_ids_host) { +GenerationInput TensorrtllmEngine::CreateGenerationInput( + std::vector input_ids_host) { int input_len = input_ids_host.size(); std::vector input_lengths_host(batch_size_, input_len); - GenerationInput::TensorPtr input_lengths - = gpt_session->getBufferManager().copyFrom(input_lengths_host, ITensor::makeShape({batch_size_}), MemoryType::kGPU); - GenerationInput::TensorPtr input_ids = gpt_session->getBufferManager().copyFrom( - input_ids_host, ITensor::makeShape({batch_size_, input_len}), MemoryType::kGPU); - GenerationInput generation_input{0, 0, input_ids, input_lengths, model_config_->usePackedInput()}; + GenerationInput::TensorPtr input_lengths = + gpt_session->getBufferManager().copyFrom( + input_lengths_host, ITensor::makeShape({batch_size_}), + MemoryType::kGPU); + GenerationInput::TensorPtr input_ids = + gpt_session->getBufferManager().copyFrom( + input_ids_host, ITensor::makeShape({batch_size_, input_len}), + MemoryType::kGPU); + GenerationInput generation_input{0, 0, input_ids, input_lengths, + model_config_->usePackedInput()}; generation_input.stopWordsList = GetTensorChatMLStopWordList(); LOG_INFO << "Create generation input successfully"; @@ -144,10 +172,11 @@ GenerationInput TensorrtllmEngine::CreateGenerationInput(std::vector in } GenerationOutput TensorrtllmEngine::CreateGenerationOutput() { - GenerationOutput generation_output { - gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), - gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32) - }; + GenerationOutput generation_output{ + gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, + nvinfer1::DataType::kINT32), + gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, + nvinfer1::DataType::kINT32)}; LOG_INFO << "Create generation output successfully"; return generation_output; } @@ -156,65 +185,77 @@ void InferenceThread( std::shared_ptr infer_state, std::vector input_ids_host, std::function&& callback, - TensorrtllmEngine* self, - SamplingConfig 
sampling_config, - int input_len, + TensorrtllmEngine* self, SamplingConfig sampling_config, int input_len, int outputLen, ModelType model_type) { // Input preparation LOG_INFO << "Inference thread started"; - GenerationInput generation_input = self->CreateGenerationInput(input_ids_host); + GenerationInput generation_input = + self->CreateGenerationInput(input_ids_host); GenerationOutput generation_output = self->CreateGenerationOutput(); // Define the callback to stream each generated token - generation_output.onTokenGenerated = [&infer_state, input_len, outputLen, self, &generation_output, model_type]( - GenerationOutput::TensorPtr const& output_ids, SizeType32 step, bool finished) { - // LOG_INFO << "Generating tokenizer in thread"; + generation_output + .onTokenGenerated = [&infer_state, input_len, outputLen, self, + &generation_output, model_type]( + GenerationOutput::TensorPtr const& output_ids, + SizeType32 step, bool finished) { + // LOG_INFO << "Generating tokenizer in thread"; // Assuming the shape of output_ids tensor is (1, 1, 160), where 160 is the number of tokens - int output_length = output_ids->getShape().d[2]; // Get the length of output IDs based on the tensor shape + int output_length = + output_ids->getShape() + .d[2]; // Get the length of output IDs based on the tensor shape // Copy output IDs from GPU to host for printing std::vector output_idsHost(output_length); - self->gpt_session->getBufferManager().copy(*output_ids, output_idsHost.data(), MemoryType::kCPU); + self->gpt_session->getBufferManager().copy( + *output_ids, output_idsHost.data(), MemoryType::kCPU); // Find the last non-zero value in the output IDs starting from the end of the input sequence - std::vector output_idsHostDecode(output_idsHost.begin() + input_len, output_idsHost.end()); + std::vector output_idsHostDecode(output_idsHost.begin() + input_len, + output_idsHost.end()); RemoveId(output_idsHostDecode, 0); - if(model_type == ModelType::kOpenHermes) { - for(auto const& [_, v]: kOpenhermesTemplate) { + if (model_type == ModelType::kOpenHermes) { + for (auto const& [_, v] : kOpenhermesTemplate) { RemoveId(output_idsHostDecode, v); } } else if (model_type == ModelType::kMistral) { - RemoveId(output_idsHostDecode, static_cast(MistralTemplate::kBeginInst)); - RemoveId(output_idsHostDecode, static_cast(MistralTemplate::kEndInst)); - } - else if(model_type == ModelType::kLlama3){ - RemoveId(output_idsHostDecode, static_cast(Llama3Template::kEndOfText)); - RemoveId(output_idsHostDecode, static_cast(Llama3Template::kEndOfTurn)); + RemoveId(output_idsHostDecode, + static_cast(MistralTemplate::kBeginInst)); + RemoveId(output_idsHostDecode, + static_cast(MistralTemplate::kEndInst)); + } else if (model_type == ModelType::kLlama3) { + RemoveId(output_idsHostDecode, + static_cast(Llama3Template::kEndOfText)); + RemoveId(output_idsHostDecode, + static_cast(Llama3Template::kEndOfTurn)); } std::string text = self->cortex_tokenizer->Decode(output_idsHostDecode); if (infer_state->prev_pos >= 0 && infer_state->prev_pos < text.size()) { // Valid prev_pos, proceed with slicing the string from prev_pos to the end std::string string_tok(text.begin() + infer_state->prev_pos, text.end()); - std::lock_guard guard(infer_state->queue_mutex); // Protect access with a lock + std::lock_guard guard( + infer_state->queue_mutex); // Protect access with a lock infer_state->texts_to_stream.push(string_tok); ++infer_state->token_gen_count; - } - else if (infer_state->prev_pos >= text.size()) { + } else if (infer_state->prev_pos >= 
text.size()) { infer_state->prev_pos = text.size(); } infer_state->prev_pos = text.size(); if (finished) { - std::lock_guard guard(infer_state->queue_mutex); // Protect access with a lock + std::lock_guard guard( + infer_state->queue_mutex); // Protect access with a lock infer_state->texts_to_stream.push("[DONE]"); - LOG_INFO << "Cortex.tensorrtllm generated " << infer_state->token_gen_count << " tokens"; + LOG_INFO << "Cortex.tensorrtllm generated " + << infer_state->token_gen_count << " tokens"; return; } return; }; // The rest of the logic inside the `chat_completion` remains unchanged... // After finishing the setup, call the inference logic - self->gpt_session->generate(generation_output, generation_input, sampling_config); + self->gpt_session->generate(generation_output, generation_input, + sampling_config); } inline std::string GetModelId(const Json::Value& json_body) { @@ -236,7 +277,8 @@ inline std::string GetModelId(const Json::Value& json_body) { return {}; } -bool TensorrtllmEngine::CheckModelLoaded(std::function& callback) { +bool TensorrtllmEngine::CheckModelLoaded( + std::function& callback) { if (!model_loaded_) { LOG_WARN << "Model is not loaded yet"; Json::Value json_resp; @@ -253,28 +295,34 @@ bool TensorrtllmEngine::CheckModelLoaded(std::function TensorrtllmEngine::EncodeHeaderLlama3(const std::string& role){ +std::vector TensorrtllmEngine::EncodeHeaderLlama3( + const std::string& role) { std::vector tokens = {}; - tokens.push_back(static_cast(Llama3Template::kStartHeaderId)); // <|start_header_id|> + tokens.push_back(static_cast( + Llama3Template::kStartHeaderId)); // <|start_header_id|> auto new_tokens = cortex_tokenizer->Encode(role); tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); - tokens.push_back(static_cast(Llama3Template::kEndHeaderId)); // <|end_header_id|> - tokens.push_back(static_cast(Llama3Template::kParagraph)); // \n\n + tokens.push_back( + static_cast(Llama3Template::kEndHeaderId)); // <|end_header_id|> + tokens.push_back(static_cast(Llama3Template::kParagraph)); // \n\n return tokens; } -std::vector TensorrtllmEngine::EncodeMessageLlama3( const std::string& role, const std::string& content) { - std::vector tokens = EncodeHeaderLlama3( role); +std::vector TensorrtllmEngine::EncodeMessageLlama3( + const std::string& role, const std::string& content) { + std::vector tokens = EncodeHeaderLlama3(role); auto new_tokens = cortex_tokenizer->Encode(content); tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); - tokens.push_back(static_cast(Llama3Template::kEndOfTurn)); // <|eot_id|> + tokens.push_back( + static_cast(Llama3Template::kEndOfTurn)); // <|eot_id|> return tokens; } //######################### //### ENGINE END POINTS ### //######################### - -void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_body, std::function&& callback) { +void TensorrtllmEngine::HandleChatCompletion( + std::shared_ptr json_body, + std::function&& callback) { inferences::ChatCompletionRequest request = inferences::fromJson(json_body); std::string formatted_input = pre_prompt_; nlohmann::json data; @@ -286,10 +334,10 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_b // tokens for Mistral v0.3 // TODO(sang): too much hard code here, need to refactor it soon std::vector tokens; - if (model_type_ == ModelType::kLlama3){ - tokens.push_back(static_cast(Llama3Template::kBeginOfText)); // <|begin_of_text|> - } - else if (model_type_ == ModelType::kMistral){ + if (model_type_ == ModelType::kLlama3) { + 
tokens.push_back(static_cast( + Llama3Template::kBeginOfText)); // <|begin_of_text|> + } else if (model_type_ == ModelType::kMistral) { tokens = {static_cast(MistralTemplate::kBos)}; } // Format the input from user @@ -297,61 +345,57 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_b for (auto const& message : messages) { std::string input_role = message["role"].asString(); std::string role; - if (model_type_ == ModelType::kLlama3){ + if (model_type_ == ModelType::kLlama3) { std::string content = message["content"].asString(); auto new_tokens = EncodeMessageLlama3(input_role, content); tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); - } - else{ + } else { if (input_role == "user") { - role = user_prompt_; - std::string content = message["content"].asString(); - formatted_input += role + content; - if(model_type_ == ModelType::kMistral) { - auto new_tokens = cortex_tokenizer->Encode(content); - new_tokens.insert(new_tokens.begin(), static_cast(MistralTemplate::kBeginInst)); - new_tokens.push_back(static_cast(MistralTemplate::kEndInst)); - tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); - } - } - else if (input_role == "assistant") { - role = ai_prompt_; - std::string content = message["content"].asString(); - formatted_input += role + content; - if(model_type_ == ModelType::kMistral) { - auto new_tokens = cortex_tokenizer->Encode(content); - if(msg_count == messages.size() - 1) { - new_tokens.push_back(static_cast(MistralTemplate::kEos)); - } - tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); + role = user_prompt_; + std::string content = message["content"].asString(); + formatted_input += role + content; + if (model_type_ == ModelType::kMistral) { + auto new_tokens = cortex_tokenizer->Encode(content); + new_tokens.insert(new_tokens.begin(), + static_cast(MistralTemplate::kBeginInst)); + new_tokens.push_back(static_cast(MistralTemplate::kEndInst)); + tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); + } + } else if (input_role == "assistant") { + role = ai_prompt_; + std::string content = message["content"].asString(); + formatted_input += role + content; + if (model_type_ == ModelType::kMistral) { + auto new_tokens = cortex_tokenizer->Encode(content); + if (msg_count == messages.size() - 1) { + new_tokens.push_back(static_cast(MistralTemplate::kEos)); } - } - else if (input_role == "system") { - role = system_prompt_; - std::string content = message["content"].asString(); - formatted_input = role + content + formatted_input; - } - else { - role = input_role; - std::string content = message["content"].asString(); - formatted_input += role + content; + tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); + } + } else if (input_role == "system") { + role = system_prompt_; + std::string content = message["content"].asString(); + formatted_input = role + content + formatted_input; + } else { + role = input_role; + std::string content = message["content"].asString(); + formatted_input += role + content; } } msg_count++; } formatted_input += ai_prompt_; - std::shared_ptr infer_state = std::make_shared(); + std::shared_ptr infer_state = + std::make_shared(); std::vector input_ids_host; - - if(model_type_ == ModelType::kOpenHermes ) { + if (model_type_ == ModelType::kOpenHermes) { input_ids_host = cortex_tokenizer->Encode(formatted_input); - } else if( model_type_ == ModelType::kMistral) { + } else if (model_type_ == ModelType::kMistral) { input_ids_host = tokens; - } - else if (model_type_ == 
ModelType::kLlama3){ + } else if (model_type_ == ModelType::kLlama3) { auto footer_tokens = EncodeHeaderLlama3("assistant"); tokens.insert(tokens.end(), footer_tokens.begin(), footer_tokens.end()); input_ids_host = tokens; @@ -370,24 +414,32 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_b sampling_config.repetitionPenalty = std::vector{request.frequency_penalty}; // Input preparation - std::thread inference_thread(InferenceThread, infer_state, input_ids_host, callback, this, sampling_config, input_len, outputLen, model_type_); - inference_thread.detach(); // Detach the thread to allow it to run independently + std::thread inference_thread(InferenceThread, infer_state, input_ids_host, + callback, this, sampling_config, input_len, + outputLen, model_type_); + inference_thread + .detach(); // Detach the thread to allow it to run independently q_->runTaskInQueue([this, cb = std::move(callback), infer_state]() { LOG_INFO << "Preparing to run inference task queue..."; - while (true) { // Continuously check if the queue is not empty - std::unique_lock lock(infer_state->queue_mutex); // Lock the queue for exclusive access + while (true) { // Continuously check if the queue is not empty + std::unique_lock lock( + infer_state->queue_mutex); // Lock the queue for exclusive access if (!infer_state->texts_to_stream.empty()) { std::string rew_text = infer_state->texts_to_stream.front(); infer_state->texts_to_stream.pop(); - if (HandleMatch(rew_text, infer_state, cb, model_type_ ) && rew_text != "[DONE]") { - continue; + if (HandleMatch(rew_text, infer_state, cb, model_type_) && + rew_text != "[DONE]") { + continue; }; if (rew_text == "[DONE]") { - const std::string str - = "data: " + tensorrtllm_utils::CreateReturnJson(tensorrtllm_utils::GenerateRandomString(20), model_id_, "", "stop") - + "\n\n" + "data: [DONE]" + "\n\n"; + const std::string str = + "data: " + + tensorrtllm_utils::CreateReturnJson( + tensorrtllm_utils::GenerateRandomString(20), model_id_, "", + "stop") + + "\n\n" + "data: [DONE]" + "\n\n"; infer_state->is_finished = true; @@ -401,12 +453,16 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_b cb(std::move(status), std::move(resp_data)); break; } - const std::string text_to_stream - = "data: " + tensorrtllm_utils::CreateReturnJson(tensorrtllm_utils::GenerateRandomString(20), model_id_, rew_text) + "\n\n"; - - lock.unlock(); // Unlock as soon as possible + const std::string text_to_stream = + "data: " + + tensorrtllm_utils::CreateReturnJson( + tensorrtllm_utils::GenerateRandomString(20), model_id_, + rew_text) + + "\n\n"; + + lock.unlock(); // Unlock as soon as possible // std::cout << rew_text; - + Json::Value resp_data; resp_data["data"] = text_to_stream; Json::Value status; @@ -427,96 +483,109 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_b return; }; -void TensorrtllmEngine::LoadModel(std::shared_ptr json_body, std::function&& callback) { - model::LoadModelRequest request = model::fromJson(json_body); - std::filesystem::path model_dir = request.model_path; - model_type_ = GetModelType(request.model_path); - - int ctx_len = request.ctx_len; - // We only support 2 models for now, it is ugly but it works :( - if(model_type_ == ModelType::kOpenHermes) { - user_prompt_ = request.user_prompt.empty() ? kOhUserPrompt : request.user_prompt; - ai_prompt_ = request.ai_prompt.empty() ? kOhAiPrompt : request.ai_prompt; - system_prompt_ = request.system_prompt.empty() ? 
kOhSystemPrompt : request.system_prompt; - } - model_id_ = GetModelId(*json_body); +void TensorrtllmEngine::LoadModel( + std::shared_ptr json_body, + std::function&& callback) { + model::LoadModelRequest request = model::fromJson(json_body); + std::filesystem::path model_dir = request.model_path; + model_type_ = GetModelType(request.model_path); + + int ctx_len = request.ctx_len; + // We only support 2 models for now, it is ugly but it works :( + if (model_type_ == ModelType::kOpenHermes) { + user_prompt_ = + request.user_prompt.empty() ? kOhUserPrompt : request.user_prompt; + ai_prompt_ = request.ai_prompt.empty() ? kOhAiPrompt : request.ai_prompt; + system_prompt_ = + request.system_prompt.empty() ? kOhSystemPrompt : request.system_prompt; + } + model_id_ = GetModelId(*json_body); - logger_ = std::make_shared(); - logger_->setLevel(nvinfer1::ILogger::Severity::kINFO); - initTrtLlmPlugins(logger_.get()); + logger_ = std::make_shared(); + logger_->setLevel(nvinfer1::ILogger::Severity::kINFO); + initTrtLlmPlugins(logger_.get()); - std::filesystem::path tokenizer_model_name = model_dir / "tokenizer.model"; - if(model_type_ == ModelType::kLlama3){ - cortex_tokenizer = std::make_unique(tokenizer_model_name.string()); - } - else{ - cortex_tokenizer = std::make_unique(tokenizer_model_name.string()); - } - - LOG_INFO << "Loaded tokenizer from " << tokenizer_model_name.string(); - - std::filesystem::path json_file_name = model_dir / "config.json"; - auto json = GptJsonConfig::parse(json_file_name); - auto config = json.getModelConfig(); - model_config_ = std::make_unique(config); - auto world_config = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); - LOG_INFO << "Loaded config from " << json_file_name.string(); - // auto dtype = model_config->getDataType(); - - // Currently doing fixed session config - session_config_.maxBatchSize = batch_size_; - session_config_.maxBeamWidth = 1; // Fixed for simplicity - session_config_.maxSequenceLength = ctx_len; - session_config_.cudaGraphMode = true; // Fixed for simplicity - - // Init gpt_session - auto model_path = model_dir / json.engineFilename(world_config, model_id_); + std::filesystem::path tokenizer_model_name = model_dir / "tokenizer.model"; + if (model_type_ == ModelType::kLlama3) { + cortex_tokenizer = + std::make_unique(tokenizer_model_name.string()); + } else { + cortex_tokenizer = + std::make_unique(tokenizer_model_name.string()); + } + + LOG_INFO << "Loaded tokenizer from " << tokenizer_model_name.string(); + + std::filesystem::path json_file_name = model_dir / "config.json"; + auto json = GptJsonConfig::parse(json_file_name); + auto config = json.getModelConfig(); + model_config_ = std::make_unique(config); + auto world_config = WorldConfig::mpi(1, json.getTensorParallelism(), + json.getPipelineParallelism()); + LOG_INFO << "Loaded config from " << json_file_name.string(); + // auto dtype = model_config->getDataType(); + + // Currently doing fixed session config + session_config_.maxBatchSize = batch_size_; + session_config_.maxBeamWidth = 1; // Fixed for simplicity + session_config_.maxSequenceLength = ctx_len; + session_config_.cudaGraphMode = true; // Fixed for simplicity + + // Init gpt_session + auto model_path = model_dir / json.engineFilename(world_config, model_id_); + try { + gpt_session = std::make_unique(session_config_, *model_config_, + world_config, + model_path.string(), logger_); + } catch (const std::exception& e) { + LOG_ERROR << "Failed to load model: " << e.what(); + LOG_INFO << "Retry once 
with smaller maxSequenceLength"; + gpt_session.reset(); + // Retry again with smaller maxSequenceLength once + session_config_.maxSequenceLength /= 2; try { - gpt_session = std::make_unique(session_config_, *model_config_, world_config, model_path.string(), logger_); - } catch(const std::exception& e) { + gpt_session = std::make_unique(session_config_, + *model_config_, world_config, + model_path.string(), logger_); + } catch (const std::exception& e) { LOG_ERROR << "Failed to load model: " << e.what(); - LOG_INFO << "Retry once with smaller maxSequenceLength"; gpt_session.reset(); - // Retry again with smaller maxSequenceLength once - session_config_.maxSequenceLength /= 2; - try { - gpt_session = std::make_unique(session_config_, *model_config_, world_config, model_path.string(), logger_); - } catch(const std::exception& e) { - LOG_ERROR << "Failed to load model: " << e.what(); - gpt_session.reset(); - cortex_tokenizer.reset(); - q_.reset(); - model_config_.reset(); - logger_.reset(); - Json::Value json_resp; - json_resp["message"] = "Failed to load model"; - Json::Value status; - status["is_done"] = false; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - callback(std::move(status), std::move(json_resp)); - return; - } + cortex_tokenizer.reset(); + q_.reset(); + model_config_.reset(); + logger_.reset(); + Json::Value json_resp; + json_resp["message"] = "Failed to load model"; + Json::Value status; + status["is_done"] = false; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = k500InternalServerError; + callback(std::move(status), std::move(json_resp)); + return; } + } - model_loaded_ = true; - if (q_ == nullptr) { - q_ = std::make_unique(1, model_id_); - } + model_loaded_ = true; + if (q_ == nullptr) { + q_ = std::make_unique(1, model_id_); + } - // Model loaded successfully - LOG_INFO << "Model " << model_id_ << " loaded successfully from path " << model_path.string(); - Json::Value json_resp; - json_resp["message"] = "Model loaded successfully"; - Json::Value status_resp; - status_resp["status_code"] = k200OK; - callback(std::move(status_resp), std::move(json_resp)); - start_time_ = std::chrono::system_clock::now().time_since_epoch() / - std::chrono::milliseconds(1); + // Model loaded successfully + LOG_INFO << "Model " << model_id_ << " loaded successfully from path " + << model_path.string(); + Json::Value json_resp; + json_resp["message"] = "Model loaded successfully"; + Json::Value status_resp; + status_resp["status_code"] = k200OK; + callback(std::move(status_resp), std::move(json_resp)); + start_time_ = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); }; -void TensorrtllmEngine::UnloadModel(std::shared_ptr json_body, std::function&& callback) { +void TensorrtllmEngine::UnloadModel( + std::shared_ptr json_body, + std::function&& callback) { if (!CheckModelLoaded(callback)) { LOG_WARN << "Model was not loaded"; Json::Value json_resp; @@ -526,7 +595,7 @@ void TensorrtllmEngine::UnloadModel(std::shared_ptr json_body, std: callback(std::move(status), std::move(json_resp)); return; } - + gpt_session.reset(); cortex_tokenizer.reset(); q_.reset(); @@ -545,7 +614,9 @@ void TensorrtllmEngine::UnloadModel(std::shared_ptr json_body, std: LOG_INFO << "Model unloaded sucessfully"; } -void TensorrtllmEngine::HandleEmbedding( std::shared_ptr json_body, std::function&& callback) { +void TensorrtllmEngine::HandleEmbedding( + std::shared_ptr json_body, + 
std::function&& callback) { LOG_WARN << "Engine does not support embedding yet"; Json::Value json_resp; json_resp["message"] = "Engine does not support embedding yet"; @@ -554,7 +625,9 @@ void TensorrtllmEngine::HandleEmbedding( std::shared_ptr json_body, callback(std::move(status), std::move(json_resp)); } -void TensorrtllmEngine::GetModelStatus(std::shared_ptr json_body, std::function&& callback) { +void TensorrtllmEngine::GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) { LOG_WARN << "Engine does not support get model status method yet"; Json::Value json_resp; json_resp["message"] = "Engine does not support get model status method yet"; diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h index 171eda9bb..93645ff14 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h @@ -7,62 +7,56 @@ #include #include +#include #include "NvInfer.h" #include "base/cortex-common/enginei.h" +#include "cpp-tiktoken/emdedded_resource_reader.h" //include to use tiktoken +#include "cpp-tiktoken/encoding.h" //include to use tiktoken #include "models/chat_completion_request.h" #include "models/load_model_request.h" #include "sentencepiece_processor.h" -#include "cpp-tiktoken/encoding.h" //include to use tiktoken -#include "cpp-tiktoken/emdedded_resource_reader.h" //include to use tiktoken #include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/generationInput.h" #include "tensorrt_llm/runtime/generationOutput.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" -#include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/gptSession.h" +#include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/samplingConfig.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include "trantor/utils/ConcurrentTaskQueue.h" #include "trantor/utils/Logger.h" -#include - using namespace tensorrt_llm::runtime; // This class is file source reader from https://github.com/gh-markt/cpp-tiktoken/blob/master/ut/tests.cpp class TFilePathResourceReader : public IResourceReader { -public: - TFilePathResourceReader(const std::string& path) - : path_(path) - { + public: + TFilePathResourceReader(const std::string& path) : path_(path) {} + + std::vector readLines() override { + std::ifstream file(path_); + if (!file.is_open()) { + throw std::runtime_error("Embedded resource '" + path_ + "' not found."); } - std::vector readLines() override { - std::ifstream file(path_); - if (!file.is_open()) { - throw std::runtime_error("Embedded resource '" + path_ + "' not found."); - } + std::string line; + std::vector lines; + while (std::getline(file, line)) { + lines.push_back(line); + } - std::string line; - std::vector lines; - while (std::getline(file, line)) { - lines.push_back(line); - } + return lines; + } - return lines; - } -private: - std::string path_; + private: + std::string path_; }; class Tokenizer { public: - Tokenizer() { - } + Tokenizer() {} - virtual std::string DecodeWithSpace(const int id) { - return ""; - } + virtual std::string DecodeWithSpace(const int id) { return ""; } virtual std::string Decode(const std::vector ids) = 0; @@ -73,11 +67,12 @@ class SentencePieceTokenizer : public Tokenizer { private: sentencepiece::SentencePieceProcessor processor; - void ReplaceSubstring(std::string& base, const std::string& from, const std::string& to) { + void ReplaceSubstring(std::string& 
base, const std::string& from, + const std::string& to) { size_t start_pos = 0; while ((start_pos = base.find(from, start_pos)) != std::string::npos) { - base.replace(start_pos, from.length(), to); - start_pos += to.length(); + base.replace(start_pos, from.length(), to); + start_pos += to.length(); } } @@ -115,7 +110,8 @@ class TiktokenTokenizer : public Tokenizer { public: TiktokenTokenizer(const std::string& model_path) : Tokenizer() { TFilePathResourceReader reader(model_path); - encoder = GptEncoding::get_encoding_llama3(LanguageModel::CL100K_BASE, &reader); + encoder = + GptEncoding::get_encoding_llama3(LanguageModel::CL100K_BASE, &reader); LOG_INFO << "Successully loaded the tokenizer"; } @@ -129,26 +125,24 @@ class TiktokenTokenizer : public Tokenizer { return ids; } }; - enum class ModelType { - kOpenHermes, kLlama3, kMistral -}; +enum class ModelType { kOpenHermes, kLlama3, kMistral }; struct InferenceState { int prev_pos{0}; bool is_finished; std::queue texts_to_stream; - std::mutex queue_mutex; // Mutex to protect access to textsToStream + std::mutex queue_mutex; // Mutex to protect access to textsToStream size_t stop_word_match_len = 0; - std::vector sequence_openhermes = {"<", "|", "im", "_", "end", "|", ">"}; + std::vector sequence_openhermes = {"<", "|", "im", "_", + "end", "|", ">"}; std::vector sequence_mistral = {"[", "INST", "]"}; int token_gen_count = 0; - void Reset() { - stop_word_match_len = 0; - } + void Reset() { stop_word_match_len = 0; } bool IsComplete(ModelType model_type) const { - if(model_type == ModelType::kOpenHermes || model_type == ModelType::kLlama3) { + if (model_type == ModelType::kOpenHermes || + model_type == ModelType::kLlama3) { return stop_word_match_len >= sequence_openhermes.size(); } else { return stop_word_match_len >= sequence_mistral.size(); @@ -156,18 +150,17 @@ struct InferenceState { } const std::string& GetSequence(ModelType model_type, size_t index) { - if(model_type == ModelType::kOpenHermes || model_type == ModelType::kLlama3) { + if (model_type == ModelType::kOpenHermes || + model_type == ModelType::kLlama3) { return sequence_openhermes[index]; } else { return sequence_mistral[index]; } - } }; namespace tensorrtllm { - class TensorrtllmEngine : public EngineI { public: ~TensorrtllmEngine() final; @@ -188,7 +181,8 @@ class TensorrtllmEngine : public EngineI { std::shared_ptr json_body, std::function&& callback) final; virtual std::vector EncodeHeaderLlama3(const std::string& role); - virtual std::vector EncodeMessageLlama3( const std::string& role, const std::string& content); + virtual std::vector EncodeMessageLlama3(const std::string& role, + const std::string& content); // API to get running models. void GetModels( std::shared_ptr json_body, @@ -221,4 +215,4 @@ class TensorrtllmEngine : public EngineI { ModelType model_type_ = ModelType::kOpenHermes; }; -} // namespace inferences +} // namespace tensorrtllm
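
For readers puzzling over the flat integer vectors such as kOpenhermesStopWords in the patch: TensorRT-LLM's stopWordsList expects, per batch entry, a {1, 2, N} tensor whose first row is the concatenation of all stop-sequence token ids and whose second row holds the cumulative end offset of each sequence, with both rows padded by -1, which is why GetTensorChatMLStopWordList copies the buffer with ITensor::makeShape({1, 2, size / 2}). The standalone sketch below (helper name hypothetical, not part of the patch) builds that layout and reproduces kOpenhermesStopWords exactly.

// Sketch of the flattened stop-words layout used by the stopWordsList tensors above.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int32_t> BuildStopWordsBuffer(
    const std::vector<std::vector<int32_t>>& stop_sequences) {
  std::vector<int32_t> tokens;   // row 0: concatenated token ids
  std::vector<int32_t> offsets;  // row 1: cumulative end offset of each sequence
  for (const auto& seq : stop_sequences) {
    tokens.insert(tokens.end(), seq.begin(), seq.end());
    offsets.push_back(static_cast<int32_t>(tokens.size()));
  }
  // Pad the offset row with -1 so both rows have the same length N.
  while (offsets.size() < tokens.size()) {
    offsets.push_back(-1);
  }
  std::vector<int32_t> buffer = tokens;
  buffer.insert(buffer.end(), offsets.begin(), offsets.end());
  return buffer;  // Interpreted as a tensor of shape {1, 2, buffer.size() / 2}.
}

int main() {
  // Stop sequences 'im' '_' 'end', '</s>', '<|im_end|>' as token ids.
  auto buffer = BuildStopWordsBuffer({{321, 28730, 416}, {2}, {32000}});
  for (int32_t v : buffer) {
    std::cout << v << ' ';  // 321 28730 416 2 32000 3 4 5 -1 -1
  }
  std::cout << '\n';
}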
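
The Llama 3 prompt framing performed by EncodeHeaderLlama3 and EncodeMessageLlama3 can likewise be summarized as a small standalone sketch. FrameLlama3Message and the stub encoder below are hypothetical illustrations; only the special-token ids (taken from the Llama3Template enum in the diff) come from the patch. A complete request additionally starts with 128000 (<|begin_of_text|>) and ends with a bare "assistant" header, as done in the kLlama3 branch of HandleChatCompletion.

// Sketch of the per-message token framing implied by EncodeMessageLlama3.
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

using Encoder = std::function<std::vector<int32_t>(const std::string&)>;

std::vector<int32_t> FrameLlama3Message(const Encoder& encode,
                                        const std::string& role,
                                        const std::string& content) {
  std::vector<int32_t> ids = {128006};  // <|start_header_id|>
  auto role_ids = encode(role);
  ids.insert(ids.end(), role_ids.begin(), role_ids.end());
  ids.push_back(128007);  // <|end_header_id|>
  ids.push_back(271);     // "\n\n"
  auto content_ids = encode(content);
  ids.insert(ids.end(), content_ids.begin(), content_ids.end());
  ids.push_back(128009);  // <|eot_id|>
  return ids;
}

int main() {
  // Hypothetical stand-in encoder (one dummy id per character); the engine
  // itself encodes with the tiktoken-based cortex_tokenizer.
  Encoder stub = [](const std::string& s) {
    return std::vector<int32_t>(s.size(), 1);
  };
  for (int32_t id : FrameLlama3Message(stub, "user", "hi")) {
    std::cout << id << ' ';
  }
  std::cout << '\n';
}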