switch to hf-transfer for the model download #978

Merged 2 commits on Nov 18, 2024
06_gpu_and_ml/llm-serving/llama_cpp.py: 40 changes (32 additions, 8 deletions)

@@ -7,7 +7,7 @@
# It's lightweight, fast, and includes support for exotic quantizations like 5-bit integers.
# This example shows how you can run `llama.cpp` on Modal.

-# We start by defining a container image with `llama.cpp` installed.
+# We start by defining a [container image](https://modal.com/docs/guide/custom-container) with `llama.cpp` installed.

import modal

@@ -28,19 +28,42 @@
# We use a model with 5-bit quantization.
# The model format, `.gguf`, is a custom format used by `llama.cpp`.

-MODEL_NAME = "Meta-Llama-3.1-8B-Instruct"
+ORG_NAME = "bartowski"
+MODEL_NAME = "Meta-Llama-3.1-8B-Instruct-GGUF"
+REPO_ID = f"{ORG_NAME}/{MODEL_NAME}"
MODEL_FILE = "Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf"
REVISION = "9a8dec50f04fa8fad1dc1e7bc20a84a512e2bb01"

-image = image.run_commands(
-    f"curl --fail-with-body -L -O https://huggingface.co/bartowski/{MODEL_NAME}-GGUF/resolve/{REVISION}/{MODEL_FILE}?download=true"
-)

+def download_model(repo_id, filename, revision):
+    from huggingface_hub import hf_hub_download
+
+    hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        revision=revision,
+        local_dir="/",
+    )


+# We can execute this Python function as part of building our image,
+# just as we can install dependencies and set environment variables,
+# with the `run_function` method:
+
+image = (
+    image.pip_install("huggingface_hub[hf_transfer]==0.26.2")
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
+    .run_function(download_model, args=(REPO_ID, MODEL_FILE, REVISION))
+)
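(Aside, not part of the diff: the same download can be sanity-checked on a local machine before building the image. The snippet below is a sketch assuming `huggingface_hub[hf_transfer]` is installed locally; the `local_dir` of `"."` and the final `print` are illustrative and not taken from the example file.)

import os

# hf-transfer is only used if this is set before huggingface_hub is imported
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
    filename="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
    revision="9a8dec50f04fa8fad1dc1e7bc20a84a512e2bb01",
    local_dir=".",  # the image build writes to "/" instead
)
print(path)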

-# Now, we're ready to define a serverless function that runs `llama.cpp`.
+# Now, we're ready to define a serverless function that runs `llama.cpp`!

# We wrap that function with a decorator from a Modal App,
-# `@app.function` specifying the image it should run on
+# `@app.function`, specifying the image it should run on
# and setting the maximum number of concurrent replicas
-# (here, `100`, which is the default).
+# (here, `100`, which is the default for CPU Functions).


app = modal.App("llama-cpp-modal", image=image)
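(Aside: the `@app.function` decorator those comments describe sits in an unchanged part of the file, so it does not appear in this diff. Below is a rough sketch of what such a declaration looks like, reusing the file's existing `app` and `image`; the argument name `max_containers` for the replica cap is an assumption here, and older Modal releases call it `concurrency_limit`.)

@app.function(image=image, max_containers=100)  # replica cap; argument name assumed
def llama_cpp_inference(prompt: str, num_output_tokens: int = 128):
    ...  # invokes the llama.cpp CLI via subprocess.run, as in the hunk below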

@@ -66,7 +89,8 @@ def llama_cpp_inference(
            str(num_output_tokens),
            "-p",
            prompt,
-        ]
+        ],
+        check=True,
    )
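(Aside: the added `check=True` makes the inference call fail loudly. With it, `subprocess.run` raises `subprocess.CalledProcessError` whenever the command exits with a non-zero status, instead of returning silently. A minimal, self-contained illustration of that behavior; the `false` command is just a stand-in for a failing `llama.cpp` invocation.)

import subprocess

try:
    # `false` always exits with status 1, so check=True raises here
    subprocess.run(["false"], check=True)
except subprocess.CalledProcessError as err:
    print(f"command failed with exit code {err.returncode}")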

