Skip to content

Commit

Permalink
switch to hf-transfer for the model download
Browse files Browse the repository at this point in the history
  • Loading branch information
charlesfrye committed Nov 18, 2024
1 parent 69ab06b commit bbff2ad
Showing 1 changed file with 32 additions and 8 deletions.
40 changes: 32 additions & 8 deletions 06_gpu_and_ml/llm-serving/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# It's lightweight, fast, and includes support for exotic quantizations like 5-bit integers.
# This example shows how you can run `llama.cpp` on Modal.

# We start by defining a container image with `llama.cpp` installed.
# We start by defining a [container image](https://modal.com/docs/guide/custom-container) with `llama.cpp` installed.

import modal

Expand All @@ -28,19 +28,42 @@
# We use a model with 5-bit quantization.
# The model format, `.gguf`, is a custom format used by `llama.cpp`.

MODEL_NAME = "Meta-Llama-3.1-8B-Instruct"
# Coordinates of the model file on the Hugging Face Hub.
# REPO_ID, MODEL_FILE, and REVISION are passed to `download_model` during the image build.
ORG_NAME = "bartowski"
MODEL_NAME = "Meta-Llama-3.1-8B-Instruct-GGUF"
REPO_ID = f"{ORG_NAME}/{MODEL_NAME}"  # "<org>/<model>" repository identifier
MODEL_FILE = "Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf"  # the single file fetched from the repo
REVISION = "9a8dec50f04fa8fad1dc1e7bc20a84a512e2bb01"  # presumably a pinned commit hash -- confirm

image = image.run_commands(
f"curl --fail-with-body -L -O https://huggingface.co/bartowski/{MODEL_NAME}-GGUF/resolve/{REVISION}/{MODEL_FILE}?download=true"

def download_model(repo_id, filename, revision):
    """Download a single file from a Hugging Face Hub repository.

    Forwards `repo_id`, `filename`, and `revision` to `hf_hub_download`,
    with `local_dir` set to "/". Returns nothing.
    """
    # Imported inside the function rather than at module top level --
    # presumably because the package is only installed in the container image.
    import huggingface_hub

    huggingface_hub.hf_hub_download(
        repo_id=repo_id, filename=filename, revision=revision, local_dir="/"
    )


# We can execute this Python function as part of building our image
# by passing it to the `run_function` method,
# just as we can install dependencies and set environment variables:

# Extend the image one step at a time; each call returns the next image.
# 1. Install `huggingface_hub` with its `hf_transfer` extra, pinned to 0.26.2.
image = image.pip_install("huggingface_hub[hf_transfer]==0.26.2")
# 2. Set HF_HUB_ENABLE_HF_TRANSFER so downloads go through hf-transfer.
image = image.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
# 3. Run `download_model` during the build with the model coordinates.
image = image.run_function(download_model, args=(REPO_ID, MODEL_FILE, REVISION))

# Now, we're ready to define a serverless function that runs `llama.cpp`.

# Now, we're ready to define a serverless function that runs `llama.cpp`!

# We wrap that function with a decorator from a Modal App,
# `@app.function` specifying the image it should run on
# `@app.function`, specifying the image it should run on
# and setting the maximum number of concurrent replicas
# (here, `100`, which is the default).
# (here, `100`, which is the default for CPU Functions).


# Create the Modal App named "llama-cpp-modal", passing it the image built above.
app = modal.App("llama-cpp-modal", image=image)

Expand All @@ -66,7 +89,8 @@ def llama_cpp_inference(
str(num_output_tokens),
"-p",
prompt,
]
],
check=True,
)


Expand Down

0 comments on commit bbff2ad

Please sign in to comment.