diff --git a/website/docs/references/models-http-api/llamafile.md b/website/docs/references/models-http-api/llamafile.md
index cf3bd75e8894..8826e6d87ea2 100644
--- a/website/docs/references/models-http-api/llamafile.md
+++ b/website/docs/references/models-http-api/llamafile.md
@@ -17,23 +17,23 @@ Below is an example configuration:
 ```toml title="~/.tabby/config.toml"
 # Chat model
 [model.chat.http]
-kind = "openai/chat"
+kind = "openai/chat" # llamafile uses the openai/chat kind
 model_name = "your_model"
-api_endpoint = "http://localhost:8081/v1"
+api_endpoint = "http://localhost:8081/v1" # Please make sure to append the `v1` suffix
 api_key = ""
 
 # Completion model
 [model.completion.http]
-kind = "llama.cpp/completion"
+kind = "llama.cpp/completion" # llamafile uses the llama.cpp/completion kind
 model_name = "your_model"
-api_endpoint = "http://localhost:8081"
+api_endpoint = "http://localhost:8081" # DO NOT append the `v1` suffix
 api_key = "secret-api-key"
 prompt_template = "<|fim_prefix|>{prefix}<|fim_suffix|>{suffix}<|fim_middle|>" # Example prompt template for the Qwen2.5 Coder model series.
 
 # Embedding model
 [model.embedding.http]
-kind = "llama.cpp/embedding"
+kind = "llama.cpp/embedding" # llamafile uses the llama.cpp/embedding kind
 model_name = "your_model"
-api_endpoint = "http://localhost:8082"
+api_endpoint = "http://localhost:8082" # DO NOT append the `v1` suffix
 api_key = ""
 ```
diff --git a/website/docs/references/models-http-api/openai.md b/website/docs/references/models-http-api/openai.md
index d7db87e553df..940845798dd9 100644
--- a/website/docs/references/models-http-api/openai.md
+++ b/website/docs/references/models-http-api/openai.md
@@ -15,21 +15,21 @@ Tabby continues to support the OpenAI Completion API specifications due to its w
 # Chat model
 [model.chat.http]
 kind = "openai/chat"
-model_name = "gpt-3.5-turbo"
-api_endpoint = "https://api.openai.com/v1"
-api_key = "secret-api-key"
-
-# Embedding model
-[model.embedding.http]
-kind = "openai/embedding"
-model_name = "text-embedding-3-small"
-api_endpoint = "https://api.openai.com/v1"
+model_name = "gpt-3.5-turbo" # Please make sure to use a chat model, such as gpt-4o
+api_endpoint = "https://api.openai.com/v1" # DO NOT append the `/chat/completions` suffix
 api_key = "secret-api-key"
 
 # Completion model
 [model.completion.http]
 kind = "openai/completion"
-model_name = "your_model"
-api_endpoint = "https://url_to_your_backend_or_service"
+model_name = "gpt-3.5-turbo-instruct" # Please make sure to use a completion model, such as gpt-3.5-turbo-instruct
+api_endpoint = "https://api.openai.com/v1" # DO NOT append the `/completions` suffix
+api_key = "secret-api-key"
+
+# Embedding model
+[model.embedding.http]
+kind = "openai/embedding"
+model_name = "text-embedding-3-small" # Please make sure to use an embedding model, such as text-embedding-3-small
+api_endpoint = "https://api.openai.com/v1" # DO NOT append the `/embeddings` suffix
 api_key = "secret-api-key"
 ```
diff --git a/website/docs/references/models-http-api/vllm.md b/website/docs/references/models-http-api/vllm.md
index bef977d0158a..96222b5e8c5b 100644
--- a/website/docs/references/models-http-api/vllm.md
+++ b/website/docs/references/models-http-api/vllm.md
@@ -3,30 +3,42 @@
 [vLLM](https://docs.vllm.ai/en/stable/) is a fast and user-friendly library for LLM inference and serving.
 
 vLLM offers an `OpenAI Compatible Server`, enabling us to use the OpenAI kinds for chat and embedding.
-However, for completion, there are certain differences in the implementation. Therefore, we should use the `vllm/completion` kind and provide a `prompt_template` depending on the specific models.
+However, for completion, there are certain differences in the implementation.
+Therefore, we should use the `vllm/completion` kind and provide a `prompt_template` depending on the specific model.
 
-Below is an example
+Please note that models differ in their capabilities for completion or chat.
+You should confirm a model's capability before using it for chat or completion tasks.
+
+Additionally, some models can serve as both chat and completion models.
+For detailed information, please refer to the [Model Registry](../../models/index.mdx).
+
+Below is an example of vLLM running at `http://localhost:8000`:
+
+Please note the following requirements for each model type:
+1. `model_name` must exactly match the one used to run vLLM.
+2. `api_endpoint` should follow the format `http://host:port/v1`.
+3. `api_key` should be identical to the one used to run vLLM.
 
 ```toml title="~/.tabby/config.toml"
 # Chat model
 [model.chat.http]
 kind = "openai/chat"
-model_name = "your_model"
-api_endpoint = "https://url_to_your_backend_or_service"
-api_key = "secret-api-key"
-
-# Embedding model
-[model.embedding.http]
-kind = "openai/embedding"
-model_name = "your_model"
-api_endpoint = "https://url_to_your_backend_or_service"
+model_name = "your_model" # Please make sure to use a chat model.
+api_endpoint = "http://localhost:8000/v1"
 api_key = "secret-api-key"
 
 # Completion model
 [model.completion.http]
 kind = "vllm/completion"
-model_name = "your_model"
-api_endpoint = "https://url_to_your_backend_or_service"
+model_name = "your_model" # Please make sure to use a completion model.
+api_endpoint = "http://localhost:8000/v1"
 api_key = "secret-api-key"
 prompt_template = "<PRE> {prefix} <SUF>{suffix} <MID>"  # Example prompt template for the CodeLlama model series.
+
+# Embedding model
+[model.embedding.http]
+kind = "openai/embedding"
+model_name = "your_model"
+api_endpoint = "http://localhost:8000/v1"
+api_key = "secret-api-key"
 ```
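
As a quick sanity check for the OpenAI-compatible examples above, the server's `GET /v1/models` route lists the models it is serving. Below is a minimal sketch, assuming the vLLM example above (endpoint `http://localhost:8000/v1`, API key `secret-api-key`); the listed `id` should match the `model_name` in `~/.tabby/config.toml`:

```python
# Minimal sketch: verify that the OpenAI-compatible server is reachable and
# that the served model name matches `model_name` in config.toml.
# The endpoint and key below are the placeholders from the vLLM example above.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8000/v1/models",  # the configured `api_endpoint` plus `/models`
    headers={"Authorization": "Bearer secret-api-key"},  # the configured `api_key`
)
with urllib.request.urlopen(req) as resp:
    models = json.load(resp)["data"]

# Each entry's "id" is a model name the server accepts.
print([m["id"] for m in models])
```

If the request fails or the printed names differ from your configuration, correct `api_endpoint` or `model_name` before pointing Tabby at the server.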