Merge branch 'main' into trtllm-executor-thread

huggingface · Oct 25, 2024 · 01b82b5 · 01b82b5
2 parents 18b473b + ed87b46
commit 01b82b5
Show file tree

Hide file tree

Showing 63 changed files with 2,095 additions and 1,967 deletions.
diff --git a/README.md b/README.md
@@ -98,7 +98,7 @@ curl 127.0.0.1:8080/generate_stream \
 You can also use [TGI's Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) to obtain OpenAI Chat Completion API-compatible responses.
 
 ```bash
-curl localhost:3000/v1/chat/completions \
+curl localhost:8080/v1/chat/completions \
     -X POST \
     -d '{
   "model": "tgi",

diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs
@@ -54,14 +54,14 @@ struct Args {
     otlp_service_name: String,
     #[clap(long, env)]
     cors_allow_origin: Option<Vec<String>>,
-    #[clap(long, env, default_value_t = false)]
-    messages_api_enabled: bool,
     #[clap(default_value = "4", long, env)]
     max_client_batch_size: usize,
     #[clap(long, env)]
     auth_token: Option<String>,
     #[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")]
     executor_worker: PathBuf,
+    #[clap(default_value = "on", long, env)]
+    usage_stats: usage_stats::UsageStatsLevel,
 }
 
 async fn get_tokenizer(
@@ -213,10 +213,10 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
         otlp_endpoint,
         otlp_service_name,
         cors_allow_origin,
-        messages_api_enabled,
         max_client_batch_size,
         auth_token,
         executor_worker,
+        usage_stats,
     } = args;
 
     // Launch Tokio runtime
@@ -293,7 +293,6 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
         false,
         None,
         None,
-        messages_api_enabled,
         true,
         max_client_batch_size,
         UsageStatsLevel::Off,

diff --git a/backends/v2/src/main.rs b/backends/v2/src/main.rs
@@ -44,6 +44,8 @@ struct Args {
     tokenizer_config_path: Option<String>,
     #[clap(long, env)]
     revision: Option<String>,
+    #[clap(long, env, value_enum)]
+    trust_remote_code: bool,
     #[clap(default_value = "2", long, env)]
     validation_workers: usize,
     #[clap(long, env)]
@@ -63,8 +65,6 @@ struct Args {
     #[clap(long, env)]
     ngrok_edge: Option<String>,
     #[clap(long, env, default_value_t = false)]
-    messages_api_enabled: bool,
-    #[clap(long, env, default_value_t = false)]
     disable_grammar_support: bool,
     #[clap(default_value = "4", long, env)]
     max_client_batch_size: usize,
@@ -101,6 +101,7 @@ async fn main() -> Result<(), RouterError> {
         tokenizer_name,
         tokenizer_config_path,
         revision,
+        trust_remote_code,
         validation_workers,
         api_key,
         json_output,
@@ -110,7 +111,6 @@ async fn main() -> Result<(), RouterError> {
         ngrok,
         ngrok_authtoken,
         ngrok_edge,
-        messages_api_enabled,
         disable_grammar_support,
         max_client_batch_size,
         usage_stats,
@@ -184,13 +184,13 @@ async fn main() -> Result<(), RouterError> {
         tokenizer_name,
         tokenizer_config_path,
         revision,
+        trust_remote_code,
         hostname,
         port,
         cors_allow_origin,
         ngrok,
         ngrok_authtoken,
         ngrok_edge,
-        messages_api_enabled,
         disable_grammar_support,
         max_client_batch_size,
         usage_stats,

diff --git a/backends/v3/src/main.rs b/backends/v3/src/main.rs
@@ -44,6 +44,8 @@ struct Args {
     tokenizer_config_path: Option<String>,
     #[clap(long, env)]
     revision: Option<String>,
+    #[clap(long, env, value_enum)]
+    trust_remote_code: bool,
     #[clap(default_value = "2", long, env)]
     validation_workers: usize,
     #[clap(long, env)]
@@ -63,8 +65,6 @@ struct Args {
     #[clap(long, env)]
     ngrok_edge: Option<String>,
     #[clap(long, env, default_value_t = false)]
-    messages_api_enabled: bool,
-    #[clap(long, env, default_value_t = false)]
     disable_grammar_support: bool,
     #[clap(default_value = "4", long, env)]
     max_client_batch_size: usize,
@@ -101,6 +101,7 @@ async fn main() -> Result<(), RouterError> {
         tokenizer_name,
         tokenizer_config_path,
         revision,
+        trust_remote_code,
         validation_workers,
         api_key,
         json_output,
@@ -110,7 +111,6 @@ async fn main() -> Result<(), RouterError> {
         ngrok,
         ngrok_authtoken,
         ngrok_edge,
-        messages_api_enabled,
         disable_grammar_support,
         max_client_batch_size,
         usage_stats,
@@ -184,13 +184,13 @@ async fn main() -> Result<(), RouterError> {
         tokenizer_name,
         tokenizer_config_path,
         revision,
+        trust_remote_code,
         hostname,
         port,
         cors_allow_origin,
         ngrok,
         ngrok_authtoken,
         ngrok_edge,
-        messages_api_enabled,
         disable_grammar_support,
         max_client_batch_size,
         usage_stats,

diff --git a/docs/openapi.json b/docs/openapi.json
@@ -316,6 +316,98 @@
         }
       }
     },
+    "/invocations": {
+      "post": {
+        "tags": [
+          "Text Generation Inference"
+        ],
+        "summary": "Generate tokens from Sagemaker request",
+        "operationId": "sagemaker_compatibility",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/SagemakerRequest"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "Generated Chat Completion",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/SagemakerResponse"
+                }
+              },
+              "text/event-stream": {
+                "schema": {
+                  "$ref": "#/components/schemas/SagemakerStreamResponse"
+                }
+              }
+            }
+          },
+          "422": {
+            "description": "Input validation error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Input validation error",
+                  "error_type": "validation"
+                }
+              }
+            }
+          },
+          "424": {
+            "description": "Generation Error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Request failed during generation",
+                  "error_type": "generation"
+                }
+              }
+            }
+          },
+          "429": {
+            "description": "Model is overloaded",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Model is overloaded",
+                  "error_type": "overloaded"
+                }
+              }
+            }
+          },
+          "500": {
+            "description": "Incomplete generation",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Incomplete generation",
+                  "error_type": "incomplete_generation"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
     "/metrics": {
       "get": {
         "tags": [
@@ -1865,6 +1957,45 @@
           "type": "string"
         }
       },
+      "SagemakerRequest": {
+        "oneOf": [
+          {
+            "$ref": "#/components/schemas/CompatGenerateRequest"
+          },
+          {
+            "$ref": "#/components/schemas/ChatRequest"
+          },
+          {
+            "$ref": "#/components/schemas/CompletionRequest"
+          }
+        ]
+      },
+      "SagemakerResponse": {
+        "oneOf": [
+          {
+            "$ref": "#/components/schemas/GenerateResponse"
+          },
+          {
+            "$ref": "#/components/schemas/ChatCompletion"
+          },
+          {
+            "$ref": "#/components/schemas/CompletionFinal"
+          }
+        ]
+      },
+      "SagemakerStreamResponse": {
+        "oneOf": [
+          {
+            "$ref": "#/components/schemas/StreamResponse"
+          },
+          {
+            "$ref": "#/components/schemas/ChatCompletionChunk"
+          },
+          {
+            "$ref": "#/components/schemas/Chunk"
+          }
+        ]
+      },
       "SimpleToken": {
         "type": "object",
         "required": [

diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md
@@ -141,9 +141,7 @@ TGI can be deployed on various cloud providers for scalable and robust text gene
 
 ## Amazon SageMaker
 
-To enable the Messages API in Amazon SageMaker you need to set the environment variable `MESSAGES_API_ENABLED=true`.
-
-This will modify the `/invocations` route to accept Messages dictonaries consisting out of role and content. See the example below on how to deploy Llama with the new Messages API.
+Amazon SageMaker natively supports the Messages API:
 
 ```python
 import json
@@ -161,12 +159,11 @@ except ValueError:
 hub = {
  'HF_MODEL_ID':'HuggingFaceH4/zephyr-7b-beta',
  'SM_NUM_GPUS': json.dumps(1),
- 'MESSAGES_API_ENABLED': True
 }
 
 # create Hugging Face Model Class
 huggingface_model = HuggingFaceModel(
- image_uri=get_huggingface_llm_image_uri("huggingface",version="1.4.0"),
+ image_uri=get_huggingface_llm_image_uri("huggingface",version="2.3.2"),
  env=hub,
  role=role,
 )

diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md
@@ -8,6 +8,7 @@ Text Generation Inference enables serving optimized models. The following sectio
 - [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
 - [Llama](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
 - [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
+- [Granite](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct)
 - [Gemma](https://huggingface.co/google/gemma-7b)
 - [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224)
 - [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)

diff --git a/docs/source/usage_statistics.md b/docs/source/usage_statistics.md
@@ -26,7 +26,6 @@ As of release 2.1.2 this is an example of the data collected:
   "max_top_n_tokens": 5,
   "max_total_tokens": 2048,
   "max_waiting_tokens": 20,
-  "messages_api_enabled": false,
   "model_config": {
     "model_type": "Bloom"
   },

diff --git a/flake.lock b/flake.lock
diff --git a/flake.nix b/flake.nix
@@ -5,7 +5,7 @@
       inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
     };
     nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:huggingface/text-generation-inference-nix";
+    tgi-nix.url = "github:huggingface/text-generation-inference-nix/marlin-kernels-0.3.0";
     nixpkgs.follows = "tgi-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
     rust-overlay = {
@@ -137,6 +137,11 @@
 
           impure = callPackage ./nix/impure-shell.nix { inherit server; };
 
+          impureWithCuda = callPackage ./nix/impure-shell.nix {
+            inherit server;
+            withCuda = true;
+          };
+
           impure-flash-attn-v1 = callPackage ./nix/impure-shell.nix {
             server = server.override { flash-attn = python3.pkgs.flash-attn-v1; };
           };