Feature the backend model as a subtitle

modal-labs · Nov 13, 2023 · fe9c43f · fe9c43f
1 parent 8f0afc0
commit fe9c43f
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 5 deletions.
diff --git a/06_gpu_and_ml/llm-frontend/index.html b/06_gpu_and_ml/llm-frontend/index.html
@@ -37,7 +37,9 @@
         </a>
       </div>
       <div class="text-4xl mt-4 mb-4 font-semibold tracking-tighter text-center">
-        LLaMA 2 70B
+        Modal LLM Engine
+      </div>
+      <div x-show="info.loaded && info.model" x-text="info.model" class="text-2xl mt-4 mb-4 font-medium tracking-tighter text-center">
       </div>
 
       <div class="flex flex-wrap justify-center items-center mt-8 mb-6">

diff --git a/06_gpu_and_ml/text_generation_inference.py b/06_gpu_and_ml/text_generation_inference.py
@@ -22,9 +22,9 @@
 #
 # Any model supported by TGI can be chosen here.
 
-GPU_CONFIG = gpu.A100(memory=80, count=2)
-MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
-REVISION = "36d9a7388cc80e5f4b3e9701ca2f250d21a96c30"
+GPU_CONFIG = gpu.A100(memory=80, count=1)
+MODEL_ID = "Phind/Phind-CodeLlama-34B-v2"
+REVISION = "949f61e203f91b412efe8f679c798f09f0ff4b0c"
 # Add `["--quantize", "gptq"]` for TheBloke GPTQ models.
 LAUNCH_FLAGS = [
     "--model-id",
@@ -201,7 +201,7 @@ def main():
     allow_concurrent_inputs=10,
     timeout=60 * 10,
 )
-@asgi_app(label="tgi-app")
+@asgi_app(label="codellama")
 def app():
     import json
 
@@ -217,6 +217,7 @@ async def stats():
         return {
             "backlog": stats.backlog,
             "num_total_runners": stats.num_total_runners,
+            "model": MODEL_ID,
         }
 
     @web_app.get("/completion/{question}")