diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/.clang-format b/cpp/tensorrt_llm/cortex.tensorrt-llm/.clang-format
new file mode 100644
index 000000000..dc69733e1
--- /dev/null
+++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/.clang-format
@@ -0,0 +1,83 @@
+# Google C/C++ Code Style settings
+# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+# Author: Kehan Xue, kehan.xue (at) gmail.com
+
+Language: Cpp
+BasedOnStyle: Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: None
+AlignOperands: Align
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: Empty
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: Never # To avoid conflict, set this "Never" and each "if statement" should include brace when coding
+AllowShortLambdasOnASingleLine: Inline
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BreakBeforeBraces: Custom
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: false
+  AfterStruct: false
+  AfterControlStatement: Never
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  BeforeLambdaBody: false
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+BreakBeforeBinaryOperators: None
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+ColumnLimit: 80
+CompactNamespaces: false
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false # Make sure the * or & align on the left
+EmptyLineBeforeAccessModifier: LogicalBlock
+FixNamespaceComments: true
+IncludeBlocks: Preserve
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth: 2
+KeepEmptyLinesAtTheStartOfBlocks: true
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PointerAlignment: Left
+ReflowComments: false
+# SeparateDefinitionBlocks: Always # Only support since clang-format 14
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInCStyleCastParentheses: false
+SpacesInContainerLiterals: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: c++11
+TabWidth: 4
+UseTab: Never
\ No newline at end of file
diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc
index fd5dcbf94..ba407f5d5 100644
--- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc
+++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc
@@ -2,141 +2,169 @@
 #include "models/chat_completion_request.h"
 #include "nlohmann/json.hpp"
-#include "src/models/load_model_request.h"
-#include "tensorrt_llm/runtime/generationInput.h"
-#include "tensorrt_llm/runtime/generationOutput.h"
-#include "tensorrt_llm/runtime/samplingConfig.h"
-#include "utils/tensorrt-llm_utils.h" -#include "json/writer.h" -#include "cpp-tiktoken/encoding.h" //include to use tiktoken +#include #include +#include #include #include #include #include #include -#include #include -#include +#include "cpp-tiktoken/encoding.h" //include to use tiktoken +#include "json/writer.h" +#include "src/models/load_model_request.h" +#include "tensorrt_llm/runtime/generationInput.h" +#include "tensorrt_llm/runtime/generationOutput.h" +#include "tensorrt_llm/runtime/samplingConfig.h" +#include "utils/tensorrt-llm_utils.h" using json = nlohmann::json; using namespace tensorrtllm; namespace { - constexpr const int k200OK = 200; - constexpr const int k400BadRequest = 400; - constexpr const int k409Conflict = 409; - constexpr const int k500InternalServerError = 500; - - // https://nvidia.github.io/TensorRT-LLM/_cpp_gen/runtime.html#generationinput-h - // stopWordsList - // 'im', '_' , 'end', '', '<|im_end|>' - const std::vector kOpenhermesStopWords = {321, 28730, 416, 2, 32000, 3, 4, 5, -1, -1}; - const std::string kOhUserPrompt = "<|im_end|>\n<|im_start|>user\n"; - const std::string kOhAiPrompt = "<|im_end|>\n<|im_start|>assistant\n"; - const std::string kOhSystemPrompt = "<|im_start|>system\n"; - const std::unordered_map kOpenhermesTemplate = {{"<|im_end|>", 32000} , {"<|im_start|>", 32001}}; - - // '[', 'INST', ']', '[INST]', ''[, '/' , 'INST',']', '[/INST]', '' - const std::vector kMistral_V0_3_StopWords - = {29560, 17057, 29561, 3, 29560, 29516, 17057, 29561, 4, 2, 3, 4, 8, 9, 10, -1, -1, -1, -1, -1}; - - enum class MistralTemplate: int32_t { - kBos = 1, - kEos = 2, - kBeginInst = 3, - kEndInst = 4 - }; +constexpr const int k200OK = 200; +constexpr const int k400BadRequest = 400; +constexpr const int k409Conflict = 409; +constexpr const int k500InternalServerError = 500; + +// https://nvidia.github.io/TensorRT-LLM/_cpp_gen/runtime.html#generationinput-h +// stopWordsList +// 'im', '_' , 'end', '', '<|im_end|>' +const std::vector kOpenhermesStopWords = {321, 28730, 416, 2, 32000, + 3, 4, 5, -1, -1}; +const std::string kOhUserPrompt = "<|im_end|>\n<|im_start|>user\n"; +const std::string kOhAiPrompt = "<|im_end|>\n<|im_start|>assistant\n"; +const std::string kOhSystemPrompt = "<|im_start|>system\n"; +const std::unordered_map kOpenhermesTemplate = { + {"<|im_end|>", 32000}, + {"<|im_start|>", 32001}}; + +// '[', 'INST', ']', '[INST]', ''[, '/' , 'INST',']', '[/INST]', '' +const std::vector kMistral_V0_3_StopWords = { + 29560, 17057, 29561, 3, 29560, 29516, 17057, 29561, 4, 2, + 3, 4, 8, 9, 10, -1, -1, -1, -1, -1}; + +enum class MistralTemplate : int32_t { + kBos = 1, + kEos = 2, + kBeginInst = 3, + kEndInst = 4 +}; - enum class Llama3Template: int32_t{ - kBeginOfText = 128000, - kEndOfText = 128001, - kEndOfTurn = 128009, - kStartHeaderId = 128006, - kEndHeaderId = 128007, - kParagraph = 271 - }; +enum class Llama3Template : int32_t { + kBeginOfText = 128000, + kEndOfText = 128001, + kEndOfTurn = 128009, + kStartHeaderId = 128006, + kEndHeaderId = 128007, + kParagraph = 271 +}; - // "<|end_of_text|>", "<|eot_id|>" - const std::vector Llama3StopWords = {128001, 128009, 1, 2}; +// "<|end_of_text|>", "<|eot_id|>" +const std::vector Llama3StopWords = {128001, 128009, 1, 2}; - // TODO(sang) This is fragile, just a temporary solution. Maybe can use a config file or model architect, etc... 
- bool IsOpenhermes(const std::string& s) { - if (s.find("mistral") != std::string::npos || s.find("Mistral") != std::string::npos) { - return false; - } - return true; +// TODO(sang) This is fragile, just a temporary solution. Maybe can use a config file or model architect, etc... +bool IsOpenhermes(const std::string& s) { + if (s.find("mistral") != std::string::npos || + s.find("Mistral") != std::string::npos) { + return false; } - ModelType GetModelType(const std::string& s){ - if (s.find("Llama3") != std::string::npos || s.find("llama3") != std::string::npos) { - return ModelType::kLlama3; - } - else if (s.find("mistral") != std::string::npos || s.find("Mistral") != std::string::npos) - { - return ModelType::kMistral; - } - else{ - return ModelType::kOpenHermes; - } - + return true; +} +ModelType GetModelType(const std::string& s) { + if (s.find("Llama3") != std::string::npos || + s.find("llama3") != std::string::npos) { + return ModelType::kLlama3; + } else if (s.find("mistral") != std::string::npos || + s.find("Mistral") != std::string::npos) { + return ModelType::kMistral; + } else { + return ModelType::kOpenHermes; } } +} // namespace TensorrtllmEngine::~TensorrtllmEngine() {} void RemoveId(std::vector& vec, int id) { vec.erase(std::remove(vec.begin(), vec.end(), id), vec.end()); } -bool HandleMatch(std::string const& rew_text, - std::shared_ptr infer_state, - std::function cb, - ModelType model_type) { +bool HandleMatch(std::string const& rew_text, + std::shared_ptr infer_state, + std::function cb, + ModelType model_type) { if (infer_state->IsComplete(model_type)) { return false; } if (infer_state->stop_word_match_len == 0) { - if ((model_type == ModelType::kOpenHermes && rew_text.find('<') != std::string::npos) || - (model_type != ModelType::kOpenHermes && rew_text.find('[') != std::string::npos)) { - infer_state->stop_word_match_len++; // Move to next state + if ((model_type == ModelType::kOpenHermes && + rew_text.find('<') != std::string::npos) || + (model_type != ModelType::kOpenHermes && + rew_text.find('[') != std::string::npos)) { + infer_state->stop_word_match_len++; // Move to next state return true; } - } else if (rew_text == infer_state->GetSequence(model_type, infer_state->stop_word_match_len)) { - infer_state->stop_word_match_len++; // Move to next state + } else if (rew_text == infer_state->GetSequence( + model_type, infer_state->stop_word_match_len)) { + infer_state->stop_word_match_len++; // Move to next state return true; - } else if (infer_state->stop_word_match_len > 0 && rew_text == infer_state->GetSequence(model_type, 0u)) { - infer_state->stop_word_match_len = 1; // Restart from first match if sequence breaks but matches start + } else if (infer_state->stop_word_match_len > 0 && + rew_text == infer_state->GetSequence(model_type, 0u)) { + infer_state->stop_word_match_len = + 1; // Restart from first match if sequence breaks but matches start return true; } else { infer_state->Reset(); - return false; // Reset to start if sequence breaks + return false; // Reset to start if sequence breaks } return false; } -GenerationInput::TensorPtr TensorrtllmEngine::GetTensorSingleStopWordList(int stopToken) { - std::vector stop_words_tokens = {stopToken, -1, 1, -1}; // Extend with -1 for increased length - return gpt_session->getBufferManager().copyFrom(stop_words_tokens, ITensor::makeShape({1, 2, 2}), MemoryType::kGPU); +GenerationInput::TensorPtr TensorrtllmEngine::GetTensorSingleStopWordList( + int stopToken) { + std::vector stop_words_tokens = { + stopToken, -1, 1, 
-1}; // Extend with -1 for increased length + return gpt_session->getBufferManager().copyFrom( + stop_words_tokens, ITensor::makeShape({1, 2, 2}), MemoryType::kGPU); } GenerationInput::TensorPtr TensorrtllmEngine::GetTensorChatMLStopWordList() { - if(model_type_ == ModelType::kOpenHermes) { - return gpt_session->getBufferManager().copyFrom(kOpenhermesStopWords, ITensor::makeShape({1, 2, static_cast(kOpenhermesStopWords.size()/2)}), MemoryType::kGPU); + if (model_type_ == ModelType::kOpenHermes) { + return gpt_session->getBufferManager().copyFrom( + kOpenhermesStopWords, + ITensor::makeShape( + {1, 2, static_cast(kOpenhermesStopWords.size() / 2)}), + MemoryType::kGPU); } else if (model_type_ == ModelType::kMistral) { - return gpt_session->getBufferManager().copyFrom(kMistral_V0_3_StopWords, ITensor::makeShape({1, 2, static_cast(kMistral_V0_3_StopWords.size()/2)}), MemoryType::kGPU); - } - else{ - return gpt_session->getBufferManager().copyFrom(Llama3StopWords, ITensor::makeShape({1, 2, static_cast(Llama3StopWords.size()/2)}), MemoryType::kGPU); + return gpt_session->getBufferManager().copyFrom( + kMistral_V0_3_StopWords, + ITensor::makeShape( + {1, 2, static_cast(kMistral_V0_3_StopWords.size() / 2)}), + MemoryType::kGPU); + } else { + return gpt_session->getBufferManager().copyFrom( + Llama3StopWords, + ITensor::makeShape( + {1, 2, static_cast(Llama3StopWords.size() / 2)}), + MemoryType::kGPU); } } -GenerationInput TensorrtllmEngine::CreateGenerationInput(std::vector input_ids_host) { +GenerationInput TensorrtllmEngine::CreateGenerationInput( + std::vector input_ids_host) { int input_len = input_ids_host.size(); std::vector input_lengths_host(batch_size_, input_len); - GenerationInput::TensorPtr input_lengths - = gpt_session->getBufferManager().copyFrom(input_lengths_host, ITensor::makeShape({batch_size_}), MemoryType::kGPU); - GenerationInput::TensorPtr input_ids = gpt_session->getBufferManager().copyFrom( - input_ids_host, ITensor::makeShape({batch_size_, input_len}), MemoryType::kGPU); - GenerationInput generation_input{0, 0, input_ids, input_lengths, model_config_->usePackedInput()}; + GenerationInput::TensorPtr input_lengths = + gpt_session->getBufferManager().copyFrom( + input_lengths_host, ITensor::makeShape({batch_size_}), + MemoryType::kGPU); + GenerationInput::TensorPtr input_ids = + gpt_session->getBufferManager().copyFrom( + input_ids_host, ITensor::makeShape({batch_size_, input_len}), + MemoryType::kGPU); + GenerationInput generation_input{0, 0, input_ids, input_lengths, + model_config_->usePackedInput()}; generation_input.stopWordsList = GetTensorChatMLStopWordList(); LOG_INFO << "Create generation input successfully"; @@ -144,10 +172,11 @@ GenerationInput TensorrtllmEngine::CreateGenerationInput(std::vector in } GenerationOutput TensorrtllmEngine::CreateGenerationOutput() { - GenerationOutput generation_output { - gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), - gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32) - }; + GenerationOutput generation_output{ + gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, + nvinfer1::DataType::kINT32), + gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, + nvinfer1::DataType::kINT32)}; LOG_INFO << "Create generation output successfully"; return generation_output; } @@ -156,65 +185,77 @@ void InferenceThread( std::shared_ptr infer_state, std::vector input_ids_host, std::function&& callback, - TensorrtllmEngine* self, - SamplingConfig 
sampling_config, - int input_len, + TensorrtllmEngine* self, SamplingConfig sampling_config, int input_len, int outputLen, ModelType model_type) { // Input preparation LOG_INFO << "Inference thread started"; - GenerationInput generation_input = self->CreateGenerationInput(input_ids_host); + GenerationInput generation_input = + self->CreateGenerationInput(input_ids_host); GenerationOutput generation_output = self->CreateGenerationOutput(); // Define the callback to stream each generated token - generation_output.onTokenGenerated = [&infer_state, input_len, outputLen, self, &generation_output, model_type]( - GenerationOutput::TensorPtr const& output_ids, SizeType32 step, bool finished) { - // LOG_INFO << "Generating tokenizer in thread"; + generation_output + .onTokenGenerated = [&infer_state, input_len, outputLen, self, + &generation_output, model_type]( + GenerationOutput::TensorPtr const& output_ids, + SizeType32 step, bool finished) { + // LOG_INFO << "Generating tokenizer in thread"; // Assuming the shape of output_ids tensor is (1, 1, 160), where 160 is the number of tokens - int output_length = output_ids->getShape().d[2]; // Get the length of output IDs based on the tensor shape + int output_length = + output_ids->getShape() + .d[2]; // Get the length of output IDs based on the tensor shape // Copy output IDs from GPU to host for printing std::vector output_idsHost(output_length); - self->gpt_session->getBufferManager().copy(*output_ids, output_idsHost.data(), MemoryType::kCPU); + self->gpt_session->getBufferManager().copy( + *output_ids, output_idsHost.data(), MemoryType::kCPU); // Find the last non-zero value in the output IDs starting from the end of the input sequence - std::vector output_idsHostDecode(output_idsHost.begin() + input_len, output_idsHost.end()); + std::vector output_idsHostDecode(output_idsHost.begin() + input_len, + output_idsHost.end()); RemoveId(output_idsHostDecode, 0); - if(model_type == ModelType::kOpenHermes) { - for(auto const& [_, v]: kOpenhermesTemplate) { + if (model_type == ModelType::kOpenHermes) { + for (auto const& [_, v] : kOpenhermesTemplate) { RemoveId(output_idsHostDecode, v); } } else if (model_type == ModelType::kMistral) { - RemoveId(output_idsHostDecode, static_cast(MistralTemplate::kBeginInst)); - RemoveId(output_idsHostDecode, static_cast(MistralTemplate::kEndInst)); - } - else if(model_type == ModelType::kLlama3){ - RemoveId(output_idsHostDecode, static_cast(Llama3Template::kEndOfText)); - RemoveId(output_idsHostDecode, static_cast(Llama3Template::kEndOfTurn)); + RemoveId(output_idsHostDecode, + static_cast(MistralTemplate::kBeginInst)); + RemoveId(output_idsHostDecode, + static_cast(MistralTemplate::kEndInst)); + } else if (model_type == ModelType::kLlama3) { + RemoveId(output_idsHostDecode, + static_cast(Llama3Template::kEndOfText)); + RemoveId(output_idsHostDecode, + static_cast(Llama3Template::kEndOfTurn)); } std::string text = self->cortex_tokenizer->Decode(output_idsHostDecode); if (infer_state->prev_pos >= 0 && infer_state->prev_pos < text.size()) { // Valid prev_pos, proceed with slicing the string from prev_pos to the end std::string string_tok(text.begin() + infer_state->prev_pos, text.end()); - std::lock_guard guard(infer_state->queue_mutex); // Protect access with a lock + std::lock_guard guard( + infer_state->queue_mutex); // Protect access with a lock infer_state->texts_to_stream.push(string_tok); ++infer_state->token_gen_count; - } - else if (infer_state->prev_pos >= text.size()) { + } else if (infer_state->prev_pos >= 
text.size()) { infer_state->prev_pos = text.size(); } infer_state->prev_pos = text.size(); if (finished) { - std::lock_guard guard(infer_state->queue_mutex); // Protect access with a lock + std::lock_guard guard( + infer_state->queue_mutex); // Protect access with a lock infer_state->texts_to_stream.push("[DONE]"); - LOG_INFO << "Cortex.tensorrtllm generated " << infer_state->token_gen_count << " tokens"; + LOG_INFO << "Cortex.tensorrtllm generated " + << infer_state->token_gen_count << " tokens"; return; } return; }; // The rest of the logic inside the `chat_completion` remains unchanged... // After finishing the setup, call the inference logic - self->gpt_session->generate(generation_output, generation_input, sampling_config); + self->gpt_session->generate(generation_output, generation_input, + sampling_config); } inline std::string GetModelId(const Json::Value& json_body) { @@ -236,7 +277,8 @@ inline std::string GetModelId(const Json::Value& json_body) { return {}; } -bool TensorrtllmEngine::CheckModelLoaded(std::function& callback) { +bool TensorrtllmEngine::CheckModelLoaded( + std::function& callback) { if (!model_loaded_) { LOG_WARN << "Model is not loaded yet"; Json::Value json_resp; @@ -253,28 +295,34 @@ bool TensorrtllmEngine::CheckModelLoaded(std::function TensorrtllmEngine::EncodeHeaderLlama3(const std::string& role){ +std::vector TensorrtllmEngine::EncodeHeaderLlama3( + const std::string& role) { std::vector tokens = {}; - tokens.push_back(static_cast(Llama3Template::kStartHeaderId)); // <|start_header_id|> + tokens.push_back(static_cast( + Llama3Template::kStartHeaderId)); // <|start_header_id|> auto new_tokens = cortex_tokenizer->Encode(role); tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); - tokens.push_back(static_cast(Llama3Template::kEndHeaderId)); // <|end_header_id|> - tokens.push_back(static_cast(Llama3Template::kParagraph)); // \n\n + tokens.push_back( + static_cast(Llama3Template::kEndHeaderId)); // <|end_header_id|> + tokens.push_back(static_cast(Llama3Template::kParagraph)); // \n\n return tokens; } -std::vector TensorrtllmEngine::EncodeMessageLlama3( const std::string& role, const std::string& content) { - std::vector tokens = EncodeHeaderLlama3( role); +std::vector TensorrtllmEngine::EncodeMessageLlama3( + const std::string& role, const std::string& content) { + std::vector tokens = EncodeHeaderLlama3(role); auto new_tokens = cortex_tokenizer->Encode(content); tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); - tokens.push_back(static_cast(Llama3Template::kEndOfTurn)); // <|eot_id|> + tokens.push_back( + static_cast(Llama3Template::kEndOfTurn)); // <|eot_id|> return tokens; } //######################### //### ENGINE END POINTS ### //######################### - -void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_body, std::function&& callback) { +void TensorrtllmEngine::HandleChatCompletion( + std::shared_ptr json_body, + std::function&& callback) { inferences::ChatCompletionRequest request = inferences::fromJson(json_body); std::string formatted_input = pre_prompt_; nlohmann::json data; @@ -286,10 +334,10 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_b // tokens for Mistral v0.3 // TODO(sang): too much hard code here, need to refactor it soon std::vector tokens; - if (model_type_ == ModelType::kLlama3){ - tokens.push_back(static_cast(Llama3Template::kBeginOfText)); // <|begin_of_text|> - } - else if (model_type_ == ModelType::kMistral){ + if (model_type_ == ModelType::kLlama3) { + 
tokens.push_back(static_cast( + Llama3Template::kBeginOfText)); // <|begin_of_text|> + } else if (model_type_ == ModelType::kMistral) { tokens = {static_cast(MistralTemplate::kBos)}; } // Format the input from user @@ -297,61 +345,57 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_b for (auto const& message : messages) { std::string input_role = message["role"].asString(); std::string role; - if (model_type_ == ModelType::kLlama3){ + if (model_type_ == ModelType::kLlama3) { std::string content = message["content"].asString(); auto new_tokens = EncodeMessageLlama3(input_role, content); tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); - } - else{ + } else { if (input_role == "user") { - role = user_prompt_; - std::string content = message["content"].asString(); - formatted_input += role + content; - if(model_type_ == ModelType::kMistral) { - auto new_tokens = cortex_tokenizer->Encode(content); - new_tokens.insert(new_tokens.begin(), static_cast(MistralTemplate::kBeginInst)); - new_tokens.push_back(static_cast(MistralTemplate::kEndInst)); - tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); - } - } - else if (input_role == "assistant") { - role = ai_prompt_; - std::string content = message["content"].asString(); - formatted_input += role + content; - if(model_type_ == ModelType::kMistral) { - auto new_tokens = cortex_tokenizer->Encode(content); - if(msg_count == messages.size() - 1) { - new_tokens.push_back(static_cast(MistralTemplate::kEos)); - } - tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); + role = user_prompt_; + std::string content = message["content"].asString(); + formatted_input += role + content; + if (model_type_ == ModelType::kMistral) { + auto new_tokens = cortex_tokenizer->Encode(content); + new_tokens.insert(new_tokens.begin(), + static_cast(MistralTemplate::kBeginInst)); + new_tokens.push_back(static_cast(MistralTemplate::kEndInst)); + tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); + } + } else if (input_role == "assistant") { + role = ai_prompt_; + std::string content = message["content"].asString(); + formatted_input += role + content; + if (model_type_ == ModelType::kMistral) { + auto new_tokens = cortex_tokenizer->Encode(content); + if (msg_count == messages.size() - 1) { + new_tokens.push_back(static_cast(MistralTemplate::kEos)); } - } - else if (input_role == "system") { - role = system_prompt_; - std::string content = message["content"].asString(); - formatted_input = role + content + formatted_input; - } - else { - role = input_role; - std::string content = message["content"].asString(); - formatted_input += role + content; + tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end()); + } + } else if (input_role == "system") { + role = system_prompt_; + std::string content = message["content"].asString(); + formatted_input = role + content + formatted_input; + } else { + role = input_role; + std::string content = message["content"].asString(); + formatted_input += role + content; } } msg_count++; } formatted_input += ai_prompt_; - std::shared_ptr infer_state = std::make_shared(); + std::shared_ptr infer_state = + std::make_shared(); std::vector input_ids_host; - - if(model_type_ == ModelType::kOpenHermes ) { + if (model_type_ == ModelType::kOpenHermes) { input_ids_host = cortex_tokenizer->Encode(formatted_input); - } else if( model_type_ == ModelType::kMistral) { + } else if (model_type_ == ModelType::kMistral) { input_ids_host = tokens; - } - else if (model_type_ == 
ModelType::kLlama3){ + } else if (model_type_ == ModelType::kLlama3) { auto footer_tokens = EncodeHeaderLlama3("assistant"); tokens.insert(tokens.end(), footer_tokens.begin(), footer_tokens.end()); input_ids_host = tokens; @@ -370,24 +414,32 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_b sampling_config.repetitionPenalty = std::vector{request.frequency_penalty}; // Input preparation - std::thread inference_thread(InferenceThread, infer_state, input_ids_host, callback, this, sampling_config, input_len, outputLen, model_type_); - inference_thread.detach(); // Detach the thread to allow it to run independently + std::thread inference_thread(InferenceThread, infer_state, input_ids_host, + callback, this, sampling_config, input_len, + outputLen, model_type_); + inference_thread + .detach(); // Detach the thread to allow it to run independently q_->runTaskInQueue([this, cb = std::move(callback), infer_state]() { LOG_INFO << "Preparing to run inference task queue..."; - while (true) { // Continuously check if the queue is not empty - std::unique_lock lock(infer_state->queue_mutex); // Lock the queue for exclusive access + while (true) { // Continuously check if the queue is not empty + std::unique_lock lock( + infer_state->queue_mutex); // Lock the queue for exclusive access if (!infer_state->texts_to_stream.empty()) { std::string rew_text = infer_state->texts_to_stream.front(); infer_state->texts_to_stream.pop(); - if (HandleMatch(rew_text, infer_state, cb, model_type_ ) && rew_text != "[DONE]") { - continue; + if (HandleMatch(rew_text, infer_state, cb, model_type_) && + rew_text != "[DONE]") { + continue; }; if (rew_text == "[DONE]") { - const std::string str - = "data: " + tensorrtllm_utils::CreateReturnJson(tensorrtllm_utils::GenerateRandomString(20), model_id_, "", "stop") - + "\n\n" + "data: [DONE]" + "\n\n"; + const std::string str = + "data: " + + tensorrtllm_utils::CreateReturnJson( + tensorrtllm_utils::GenerateRandomString(20), model_id_, "", + "stop") + + "\n\n" + "data: [DONE]" + "\n\n"; infer_state->is_finished = true; @@ -401,12 +453,16 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_b cb(std::move(status), std::move(resp_data)); break; } - const std::string text_to_stream - = "data: " + tensorrtllm_utils::CreateReturnJson(tensorrtllm_utils::GenerateRandomString(20), model_id_, rew_text) + "\n\n"; - - lock.unlock(); // Unlock as soon as possible + const std::string text_to_stream = + "data: " + + tensorrtllm_utils::CreateReturnJson( + tensorrtllm_utils::GenerateRandomString(20), model_id_, + rew_text) + + "\n\n"; + + lock.unlock(); // Unlock as soon as possible // std::cout << rew_text; - + Json::Value resp_data; resp_data["data"] = text_to_stream; Json::Value status; @@ -427,96 +483,109 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_b return; }; -void TensorrtllmEngine::LoadModel(std::shared_ptr json_body, std::function&& callback) { - model::LoadModelRequest request = model::fromJson(json_body); - std::filesystem::path model_dir = request.model_path; - model_type_ = GetModelType(request.model_path); - - int ctx_len = request.ctx_len; - // We only support 2 models for now, it is ugly but it works :( - if(model_type_ == ModelType::kOpenHermes) { - user_prompt_ = request.user_prompt.empty() ? kOhUserPrompt : request.user_prompt; - ai_prompt_ = request.ai_prompt.empty() ? kOhAiPrompt : request.ai_prompt; - system_prompt_ = request.system_prompt.empty() ? 
kOhSystemPrompt : request.system_prompt; - } - model_id_ = GetModelId(*json_body); +void TensorrtllmEngine::LoadModel( + std::shared_ptr json_body, + std::function&& callback) { + model::LoadModelRequest request = model::fromJson(json_body); + std::filesystem::path model_dir = request.model_path; + model_type_ = GetModelType(request.model_path); + + int ctx_len = request.ctx_len; + // We only support 2 models for now, it is ugly but it works :( + if (model_type_ == ModelType::kOpenHermes) { + user_prompt_ = + request.user_prompt.empty() ? kOhUserPrompt : request.user_prompt; + ai_prompt_ = request.ai_prompt.empty() ? kOhAiPrompt : request.ai_prompt; + system_prompt_ = + request.system_prompt.empty() ? kOhSystemPrompt : request.system_prompt; + } + model_id_ = GetModelId(*json_body); - logger_ = std::make_shared(); - logger_->setLevel(nvinfer1::ILogger::Severity::kINFO); - initTrtLlmPlugins(logger_.get()); + logger_ = std::make_shared(); + logger_->setLevel(nvinfer1::ILogger::Severity::kINFO); + initTrtLlmPlugins(logger_.get()); - std::filesystem::path tokenizer_model_name = model_dir / "tokenizer.model"; - if(model_type_ == ModelType::kLlama3){ - cortex_tokenizer = std::make_unique(tokenizer_model_name.string()); - } - else{ - cortex_tokenizer = std::make_unique(tokenizer_model_name.string()); - } - - LOG_INFO << "Loaded tokenizer from " << tokenizer_model_name.string(); - - std::filesystem::path json_file_name = model_dir / "config.json"; - auto json = GptJsonConfig::parse(json_file_name); - auto config = json.getModelConfig(); - model_config_ = std::make_unique(config); - auto world_config = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); - LOG_INFO << "Loaded config from " << json_file_name.string(); - // auto dtype = model_config->getDataType(); - - // Currently doing fixed session config - session_config_.maxBatchSize = batch_size_; - session_config_.maxBeamWidth = 1; // Fixed for simplicity - session_config_.maxSequenceLength = ctx_len; - session_config_.cudaGraphMode = true; // Fixed for simplicity - - // Init gpt_session - auto model_path = model_dir / json.engineFilename(world_config, model_id_); + std::filesystem::path tokenizer_model_name = model_dir / "tokenizer.model"; + if (model_type_ == ModelType::kLlama3) { + cortex_tokenizer = + std::make_unique(tokenizer_model_name.string()); + } else { + cortex_tokenizer = + std::make_unique(tokenizer_model_name.string()); + } + + LOG_INFO << "Loaded tokenizer from " << tokenizer_model_name.string(); + + std::filesystem::path json_file_name = model_dir / "config.json"; + auto json = GptJsonConfig::parse(json_file_name); + auto config = json.getModelConfig(); + model_config_ = std::make_unique(config); + auto world_config = WorldConfig::mpi(1, json.getTensorParallelism(), + json.getPipelineParallelism()); + LOG_INFO << "Loaded config from " << json_file_name.string(); + // auto dtype = model_config->getDataType(); + + // Currently doing fixed session config + session_config_.maxBatchSize = batch_size_; + session_config_.maxBeamWidth = 1; // Fixed for simplicity + session_config_.maxSequenceLength = ctx_len; + session_config_.cudaGraphMode = true; // Fixed for simplicity + + // Init gpt_session + auto model_path = model_dir / json.engineFilename(world_config, model_id_); + try { + gpt_session = std::make_unique(session_config_, *model_config_, + world_config, + model_path.string(), logger_); + } catch (const std::exception& e) { + LOG_ERROR << "Failed to load model: " << e.what(); + LOG_INFO << "Retry once 
with smaller maxSequenceLength"; + gpt_session.reset(); + // Retry again with smaller maxSequenceLength once + session_config_.maxSequenceLength /= 2; try { - gpt_session = std::make_unique(session_config_, *model_config_, world_config, model_path.string(), logger_); - } catch(const std::exception& e) { + gpt_session = std::make_unique(session_config_, + *model_config_, world_config, + model_path.string(), logger_); + } catch (const std::exception& e) { LOG_ERROR << "Failed to load model: " << e.what(); - LOG_INFO << "Retry once with smaller maxSequenceLength"; gpt_session.reset(); - // Retry again with smaller maxSequenceLength once - session_config_.maxSequenceLength /= 2; - try { - gpt_session = std::make_unique(session_config_, *model_config_, world_config, model_path.string(), logger_); - } catch(const std::exception& e) { - LOG_ERROR << "Failed to load model: " << e.what(); - gpt_session.reset(); - cortex_tokenizer.reset(); - q_.reset(); - model_config_.reset(); - logger_.reset(); - Json::Value json_resp; - json_resp["message"] = "Failed to load model"; - Json::Value status; - status["is_done"] = false; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - callback(std::move(status), std::move(json_resp)); - return; - } + cortex_tokenizer.reset(); + q_.reset(); + model_config_.reset(); + logger_.reset(); + Json::Value json_resp; + json_resp["message"] = "Failed to load model"; + Json::Value status; + status["is_done"] = false; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = k500InternalServerError; + callback(std::move(status), std::move(json_resp)); + return; } + } - model_loaded_ = true; - if (q_ == nullptr) { - q_ = std::make_unique(1, model_id_); - } + model_loaded_ = true; + if (q_ == nullptr) { + q_ = std::make_unique(1, model_id_); + } - // Model loaded successfully - LOG_INFO << "Model " << model_id_ << " loaded successfully from path " << model_path.string(); - Json::Value json_resp; - json_resp["message"] = "Model loaded successfully"; - Json::Value status_resp; - status_resp["status_code"] = k200OK; - callback(std::move(status_resp), std::move(json_resp)); - start_time_ = std::chrono::system_clock::now().time_since_epoch() / - std::chrono::milliseconds(1); + // Model loaded successfully + LOG_INFO << "Model " << model_id_ << " loaded successfully from path " + << model_path.string(); + Json::Value json_resp; + json_resp["message"] = "Model loaded successfully"; + Json::Value status_resp; + status_resp["status_code"] = k200OK; + callback(std::move(status_resp), std::move(json_resp)); + start_time_ = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); }; -void TensorrtllmEngine::UnloadModel(std::shared_ptr json_body, std::function&& callback) { +void TensorrtllmEngine::UnloadModel( + std::shared_ptr json_body, + std::function&& callback) { if (!CheckModelLoaded(callback)) { LOG_WARN << "Model was not loaded"; Json::Value json_resp; @@ -526,7 +595,7 @@ void TensorrtllmEngine::UnloadModel(std::shared_ptr json_body, std: callback(std::move(status), std::move(json_resp)); return; } - + gpt_session.reset(); cortex_tokenizer.reset(); q_.reset(); @@ -545,7 +614,9 @@ void TensorrtllmEngine::UnloadModel(std::shared_ptr json_body, std: LOG_INFO << "Model unloaded sucessfully"; } -void TensorrtllmEngine::HandleEmbedding( std::shared_ptr json_body, std::function&& callback) { +void TensorrtllmEngine::HandleEmbedding( + std::shared_ptr json_body, + 
std::function&& callback) { LOG_WARN << "Engine does not support embedding yet"; Json::Value json_resp; json_resp["message"] = "Engine does not support embedding yet"; @@ -554,7 +625,9 @@ void TensorrtllmEngine::HandleEmbedding( std::shared_ptr json_body, callback(std::move(status), std::move(json_resp)); } -void TensorrtllmEngine::GetModelStatus(std::shared_ptr json_body, std::function&& callback) { +void TensorrtllmEngine::GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) { LOG_WARN << "Engine does not support get model status method yet"; Json::Value json_resp; json_resp["message"] = "Engine does not support get model status method yet"; diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h index 171eda9bb..93645ff14 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h @@ -7,62 +7,56 @@ #include #include +#include #include "NvInfer.h" #include "base/cortex-common/enginei.h" +#include "cpp-tiktoken/emdedded_resource_reader.h" //include to use tiktoken +#include "cpp-tiktoken/encoding.h" //include to use tiktoken #include "models/chat_completion_request.h" #include "models/load_model_request.h" #include "sentencepiece_processor.h" -#include "cpp-tiktoken/encoding.h" //include to use tiktoken -#include "cpp-tiktoken/emdedded_resource_reader.h" //include to use tiktoken #include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/generationInput.h" #include "tensorrt_llm/runtime/generationOutput.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" -#include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/gptSession.h" +#include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/samplingConfig.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include "trantor/utils/ConcurrentTaskQueue.h" #include "trantor/utils/Logger.h" -#include - using namespace tensorrt_llm::runtime; // This class is file source reader from https://github.com/gh-markt/cpp-tiktoken/blob/master/ut/tests.cpp class TFilePathResourceReader : public IResourceReader { -public: - TFilePathResourceReader(const std::string& path) - : path_(path) - { + public: + TFilePathResourceReader(const std::string& path) : path_(path) {} + + std::vector readLines() override { + std::ifstream file(path_); + if (!file.is_open()) { + throw std::runtime_error("Embedded resource '" + path_ + "' not found."); } - std::vector readLines() override { - std::ifstream file(path_); - if (!file.is_open()) { - throw std::runtime_error("Embedded resource '" + path_ + "' not found."); - } + std::string line; + std::vector lines; + while (std::getline(file, line)) { + lines.push_back(line); + } - std::string line; - std::vector lines; - while (std::getline(file, line)) { - lines.push_back(line); - } + return lines; + } - return lines; - } -private: - std::string path_; + private: + std::string path_; }; class Tokenizer { public: - Tokenizer() { - } + Tokenizer() {} - virtual std::string DecodeWithSpace(const int id) { - return ""; - } + virtual std::string DecodeWithSpace(const int id) { return ""; } virtual std::string Decode(const std::vector ids) = 0; @@ -73,11 +67,12 @@ class SentencePieceTokenizer : public Tokenizer { private: sentencepiece::SentencePieceProcessor processor; - void ReplaceSubstring(std::string& base, const std::string& from, const std::string& to) { + void ReplaceSubstring(std::string& 
base, const std::string& from, + const std::string& to) { size_t start_pos = 0; while ((start_pos = base.find(from, start_pos)) != std::string::npos) { - base.replace(start_pos, from.length(), to); - start_pos += to.length(); + base.replace(start_pos, from.length(), to); + start_pos += to.length(); } } @@ -115,7 +110,8 @@ class TiktokenTokenizer : public Tokenizer { public: TiktokenTokenizer(const std::string& model_path) : Tokenizer() { TFilePathResourceReader reader(model_path); - encoder = GptEncoding::get_encoding_llama3(LanguageModel::CL100K_BASE, &reader); + encoder = + GptEncoding::get_encoding_llama3(LanguageModel::CL100K_BASE, &reader); LOG_INFO << "Successully loaded the tokenizer"; } @@ -129,26 +125,24 @@ class TiktokenTokenizer : public Tokenizer { return ids; } }; - enum class ModelType { - kOpenHermes, kLlama3, kMistral -}; +enum class ModelType { kOpenHermes, kLlama3, kMistral }; struct InferenceState { int prev_pos{0}; bool is_finished; std::queue texts_to_stream; - std::mutex queue_mutex; // Mutex to protect access to textsToStream + std::mutex queue_mutex; // Mutex to protect access to textsToStream size_t stop_word_match_len = 0; - std::vector sequence_openhermes = {"<", "|", "im", "_", "end", "|", ">"}; + std::vector sequence_openhermes = {"<", "|", "im", "_", + "end", "|", ">"}; std::vector sequence_mistral = {"[", "INST", "]"}; int token_gen_count = 0; - void Reset() { - stop_word_match_len = 0; - } + void Reset() { stop_word_match_len = 0; } bool IsComplete(ModelType model_type) const { - if(model_type == ModelType::kOpenHermes || model_type == ModelType::kLlama3) { + if (model_type == ModelType::kOpenHermes || + model_type == ModelType::kLlama3) { return stop_word_match_len >= sequence_openhermes.size(); } else { return stop_word_match_len >= sequence_mistral.size(); @@ -156,18 +150,17 @@ struct InferenceState { } const std::string& GetSequence(ModelType model_type, size_t index) { - if(model_type == ModelType::kOpenHermes || model_type == ModelType::kLlama3) { + if (model_type == ModelType::kOpenHermes || + model_type == ModelType::kLlama3) { return sequence_openhermes[index]; } else { return sequence_mistral[index]; } - } }; namespace tensorrtllm { - class TensorrtllmEngine : public EngineI { public: ~TensorrtllmEngine() final; @@ -188,7 +181,8 @@ class TensorrtllmEngine : public EngineI { std::shared_ptr json_body, std::function&& callback) final; virtual std::vector EncodeHeaderLlama3(const std::string& role); - virtual std::vector EncodeMessageLlama3( const std::string& role, const std::string& content); + virtual std::vector EncodeMessageLlama3(const std::string& role, + const std::string& content); // API to get running models. void GetModels( std::shared_ptr json_body, @@ -221,4 +215,4 @@ class TensorrtllmEngine : public EngineI { ModelType model_type_ = ModelType::kOpenHermes; }; -} // namespace inferences +} // namespace tensorrtllm
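
For readers puzzling over the flat integer vectors such as kOpenhermesStopWords in the patch: TensorRT-LLM's stopWordsList expects, per batch entry, a {1, 2, N} tensor whose first row is the concatenation of all stop-sequence token ids and whose second row holds the cumulative end offset of each sequence, with both rows padded by -1, which is why GetTensorChatMLStopWordList copies the buffer with ITensor::makeShape({1, 2, size / 2}). The standalone sketch below (helper name hypothetical, not part of the patch) builds that layout and reproduces kOpenhermesStopWords exactly.

// Sketch of the flattened stop-words layout used by the stopWordsList tensors above.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int32_t> BuildStopWordsBuffer(
    const std::vector<std::vector<int32_t>>& stop_sequences) {
  std::vector<int32_t> tokens;   // row 0: concatenated token ids
  std::vector<int32_t> offsets;  // row 1: cumulative end offset of each sequence
  for (const auto& seq : stop_sequences) {
    tokens.insert(tokens.end(), seq.begin(), seq.end());
    offsets.push_back(static_cast<int32_t>(tokens.size()));
  }
  // Pad the offset row with -1 so both rows have the same length N.
  while (offsets.size() < tokens.size()) {
    offsets.push_back(-1);
  }
  std::vector<int32_t> buffer = tokens;
  buffer.insert(buffer.end(), offsets.begin(), offsets.end());
  return buffer;  // Interpreted as a tensor of shape {1, 2, buffer.size() / 2}.
}

int main() {
  // Stop sequences 'im' '_' 'end', '</s>', '<|im_end|>' as token ids.
  auto buffer = BuildStopWordsBuffer({{321, 28730, 416}, {2}, {32000}});
  for (int32_t v : buffer) {
    std::cout << v << ' ';  // 321 28730 416 2 32000 3 4 5 -1 -1
  }
  std::cout << '\n';
}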
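
The Llama 3 prompt framing performed by EncodeHeaderLlama3 and EncodeMessageLlama3 can likewise be summarized as a small standalone sketch. FrameLlama3Message and the stub encoder below are hypothetical illustrations; only the special-token ids (taken from the Llama3Template enum in the diff) come from the patch. A complete request additionally starts with 128000 (<|begin_of_text|>) and ends with a bare "assistant" header, as done in the kLlama3 branch of HandleChatCompletion.

// Sketch of the per-message token framing implied by EncodeMessageLlama3.
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

using Encoder = std::function<std::vector<int32_t>(const std::string&)>;

std::vector<int32_t> FrameLlama3Message(const Encoder& encode,
                                        const std::string& role,
                                        const std::string& content) {
  std::vector<int32_t> ids = {128006};  // <|start_header_id|>
  auto role_ids = encode(role);
  ids.insert(ids.end(), role_ids.begin(), role_ids.end());
  ids.push_back(128007);  // <|end_header_id|>
  ids.push_back(271);     // "\n\n"
  auto content_ids = encode(content);
  ids.insert(ids.end(), content_ids.begin(), content_ids.end());
  ids.push_back(128009);  // <|eot_id|>
  return ids;
}

int main() {
  // Hypothetical stand-in encoder (one dummy id per character); the engine
  // itself encodes with the tiktoken-based cortex_tokenizer.
  Encoder stub = [](const std::string& s) {
    return std::vector<int32_t>(s.size(), 1);
  };
  for (int32_t id : FrameLlama3Message(stub, "user", "hi")) {
    std::cout << id << ' ';
  }
  std::cout << '\n';
}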