diff --git a/06_gpu_and_ml/llm-frontend/index.html b/06_gpu_and_ml/llm-frontend/index.html
index fffef7a91..a2d368f45 100644
--- a/06_gpu_and_ml/llm-frontend/index.html
+++ b/06_gpu_and_ml/llm-frontend/index.html
@@ -37,7 +37,9 @@
- LLaMA 2 70B
+ Modal LLM Engine
+
+
diff --git a/06_gpu_and_ml/text_generation_inference.py b/06_gpu_and_ml/text_generation_inference.py
index 66312d4de..8861c8202 100644
--- a/06_gpu_and_ml/text_generation_inference.py
+++ b/06_gpu_and_ml/text_generation_inference.py
@@ -22,9 +22,9 @@
#
# Any model supported by TGI can be chosen here.
-GPU_CONFIG = gpu.A100(memory=80, count=2)
-MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
-REVISION = "36d9a7388cc80e5f4b3e9701ca2f250d21a96c30"
+GPU_CONFIG = gpu.A100(memory=80, count=1)
+MODEL_ID = "Phind/Phind-CodeLlama-34B-v2"
+REVISION = "949f61e203f91b412efe8f679c798f09f0ff4b0c"
# Add `["--quantize", "gptq"]` for TheBloke GPTQ models.
LAUNCH_FLAGS = [
"--model-id",
@@ -201,7 +201,7 @@ def main():
    allow_concurrent_inputs=10,
    timeout=60 * 10,
)
-@asgi_app(label="tgi-app")
+@asgi_app(label="codellama")
def app():
    import json
@@ -217,6 +217,7 @@ async def stats():
        return {
            "backlog": stats.backlog,
            "num_total_runners": stats.num_total_runners,
+           "model": MODEL_ID,
        }

    @web_app.get("/completion/{question}")
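Taken together, the Python changes swap the served model to Phind-CodeLlama-34B-v2 on a single A100, publish the app under the new `codellama` label, and expose the model name via the `/stats` route. A minimal client sketch follows; it is not part of the diff and makes a few assumptions: the deployment URL is a placeholder derived from the new label (the real hostname depends on your Modal workspace), and the `/completion/{question}` route is assumed to stream plain text chunks.

import requests
from urllib.parse import quote

# Hypothetical URL -- substitute the hostname of your own deployment.
BASE_URL = "https://your-workspace--codellama.modal.run"

# /stats now includes the serving model alongside backlog and runner counts.
stats = requests.get(f"{BASE_URL}/stats", timeout=30).json()
print(f"model={stats['model']} backlog={stats['backlog']} runners={stats['num_total_runners']}")

# Ask the model a coding question via /completion/{question}; the streaming
# behaviour of the response is an assumption, so chunks are printed as they arrive.
question = "Write a Python function that reverses a linked list"
with requests.get(f"{BASE_URL}/completion/{quote(question)}", stream=True, timeout=600) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None):
        print(chunk.decode("utf-8", errors="replace"), end="", flush=True)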