diff --git a/06_gpu_and_ml/llm-frontend/index.html b/06_gpu_and_ml/llm-frontend/index.html
index fffef7a91..a2d368f45 100644
--- a/06_gpu_and_ml/llm-frontend/index.html
+++ b/06_gpu_and_ml/llm-frontend/index.html
@@ -37,7 +37,9 @@
-        LLaMA 2 70B
+        Modal LLM Engine
+
+
diff --git a/06_gpu_and_ml/text_generation_inference.py b/06_gpu_and_ml/text_generation_inference.py
index 66312d4de..8861c8202 100644
--- a/06_gpu_and_ml/text_generation_inference.py
+++ b/06_gpu_and_ml/text_generation_inference.py
@@ -22,9 +22,9 @@
 #
 # Any model supported by TGI can be chosen here.
 
-GPU_CONFIG = gpu.A100(memory=80, count=2)
-MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
-REVISION = "36d9a7388cc80e5f4b3e9701ca2f250d21a96c30"
+GPU_CONFIG = gpu.A100(memory=80, count=1)
+MODEL_ID = "Phind/Phind-CodeLlama-34B-v2"
+REVISION = "949f61e203f91b412efe8f679c798f09f0ff4b0c"
 # Add `["--quantize", "gptq"]` for TheBloke GPTQ models.
 LAUNCH_FLAGS = [
     "--model-id",
@@ -201,7 +201,7 @@ def main():
     allow_concurrent_inputs=10,
     timeout=60 * 10,
 )
-@asgi_app(label="tgi-app")
+@asgi_app(label="codellama")
 def app():
     import json
 
@@ -217,6 +217,7 @@ async def stats():
         return {
             "backlog": stats.backlog,
             "num_total_runners": stats.num_total_runners,
+            "model": MODEL_ID,
         }
 
     @web_app.get("/completion/{question}")
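
For context on the new `"model"` key: after this change the `/stats` endpoint of the deployed ASGI app reports which model the engine is serving, which the relabeled frontend can surface. A minimal client sketch follows, assuming the app is reachable at a label-based Modal URL for the `codellama` label; the exact hostname depends on your workspace and is a placeholder here, not taken from this diff.

```python
# Sketch: poll the /stats endpoint touched in this diff and print the backlog
# plus the newly exposed "model" field. Uses only the standard library.
import json
import urllib.request

# Hypothetical URL -- substitute your own Modal workspace name before running.
STATS_URL = "https://your-workspace--codellama.modal.run/stats"

with urllib.request.urlopen(STATS_URL, timeout=10) as resp:
    stats = json.load(resp)

print("model:  ", stats["model"])              # e.g. Phind/Phind-CodeLlama-34B-v2
print("backlog:", stats["backlog"])
print("runners:", stats["num_total_runners"])
```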