Update SGLang example to use Qwen2-VL (#1030)
* Update SGLang example to use Qwen2-VL

* ruff

* Update 06_gpu_and_ml/llm-serving/sgl_vlm.py

Co-authored-by: Charles Frye <[email protected]>

* Update 06_gpu_and_ml/llm-serving/sgl_vlm.py

Co-authored-by: Charles Frye <[email protected]>

* PR changes

* minor text fixes, l40s, cleaner file handling

---------

Co-authored-by: Charles Frye <[email protected]>
advay-modal and charlesfrye authored Jan 10, 2025
1 parent 38449e5 commit a8e9d14
Showing 1 changed file with 22 additions and 19 deletions.
41 changes: 22 additions & 19 deletions 06_gpu_and_ml/llm-serving/sgl_vlm.py
@@ -1,4 +1,4 @@
-# # Run LLaVA-Next on SGLang for Visual QA
+# # Run Qwen2-VL on SGLang for Visual QA
 
 # Vision-Language Models (VLMs) are like LLMs with eyes:
 # they can generate text based not just on other text,
@@ -7,7 +7,7 @@
 # This example shows how to run a VLM on Modal using the
 # [SGLang](https://github.com/sgl-project/sglang) library.
 
-# Here's a sample inference, with the image rendered directly in the terminal:
+# Here's a sample inference, with the image rendered directly (and at low resolution) in the terminal:
 
 # ![Sample output answering a question about a photo of the Statue of Liberty](https://modal-public-assets.s3.amazonaws.com/sgl_vlm_qa_sol.png)
 
@@ -32,7 +32,7 @@
 # If you want to see the model really rip, try an `"a100-80gb"` or an `"h100"`
 # on a large batch.
 
-GPU_TYPE = os.environ.get("GPU_TYPE", "a10g")
+GPU_TYPE = os.environ.get("GPU_TYPE", "l40s")
 GPU_COUNT = os.environ.get("GPU_COUNT", 1)
 
 GPU_CONFIG = f"{GPU_TYPE}:{GPU_COUNT}"
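(Aside, not part of the diff: a hypothetical override showing how these settings compose. Values read from the environment arrive as strings, which the f-string handles either way.)

import os

os.environ["GPU_TYPE"] = "h100"  # hypothetical: try a bigger GPU
os.environ["GPU_COUNT"] = "2"  # env values are strings, not ints

GPU_TYPE = os.environ.get("GPU_TYPE", "l40s")
GPU_COUNT = os.environ.get("GPU_COUNT", 1)
GPU_CONFIG = f"{GPU_TYPE}:{GPU_COUNT}"
assert GPU_CONFIG == "h100:2"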
@@ -41,13 +41,13 @@
 
 MINUTES = 60  # seconds
 
-# We use a [LLaVA-NeXT](https://huggingface.co/docs/transformers/en/model_doc/llava_next)
-# model built on top of Meta's LLaMA 3 8B.
+# We use the [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)
+# model by Alibaba.
 
-MODEL_PATH = "lmms-lab/llama3-llava-next-8b"
-MODEL_REVISION = "e7e6a9fd5fd75d44b32987cba51c123338edbede"
-TOKENIZER_PATH = "lmms-lab/llama3-llava-next-8b-tokenizer"
-MODEL_CHAT_TEMPLATE = "llama-3-instruct"
+MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
+MODEL_REVISION = "a7a06a1cc11b4514ce9edcde0e3ca1d16e5ff2fc"
+TOKENIZER_PATH = "Qwen/Qwen2-VL-7B-Instruct"
+MODEL_CHAT_TEMPLATE = "qwen2-vl"
 
 # We download it from the Hugging Face Hub using the Python function below.
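(The body of download_model_to_image is collapsed in this view; a minimal sketch of such a function, assuming huggingface_hub is available in the image:)

def download_model_to_image():
    # Sketch only: cache the pinned revision of the model weights in the image.
    from huggingface_hub import snapshot_download

    snapshot_download(MODEL_PATH, revision=MODEL_REVISION)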

@@ -73,12 +73,15 @@ def download_model_to_image():
 vlm_image = (
     modal.Image.debian_slim(python_version="3.11")
     .pip_install(  # add sglang and some Python dependencies
-        "sglang[all]==0.1.17",
-        "transformers==4.40.2",
+        "transformers==4.47.1",
         "numpy<2",
         "fastapi[standard]==0.115.4",
         "pydantic==2.9.2",
         "starlette==0.41.2",
+        "torch==2.4.0",
+        "sglang[all]==0.4.1",
+        # as per sglang website: https://sgl-project.github.io/start/install.html
+        extra_options="--find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/",
     )
     .run_function(  # download the model by running a Python function
         download_model_to_image
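(A hypothetical sanity check, not in the commit: run a throwaway function on the image to confirm the pins resolve together; `app` is defined further down.)

@app.function(image=vlm_image)
def check_versions():
    import sglang
    import torch
    import transformers

    # expect 0.4.1 / 2.4.0 / 4.47.1, per the pins above
    print(sglang.__version__, torch.__version__, transformers.__version__)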
@@ -94,11 +97,11 @@
 
 # The code below adds a modal `Cls` to an `App` that runs the VLM.
 
-# We define a method `generate` that takes a URL for an image URL and a question
+# We define a method `generate` that takes a URL for an image and a question
 # about the image as inputs and returns the VLM's answer.
 
 # By decorating it with `@modal.web_endpoint`, we expose it as an HTTP endpoint,
-# so it can be accessed over the public internet from any client.
+# so it can be accessed over the public Internet from any client.
 
 app = modal.App("example-sgl-vlm")
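(For illustration, beyond the diff: once deployed, any HTTP client can POST to the endpoint. The URL below is hypothetical (Modal prints the real one at deploy time), and the JSON keys match what `generate` reads: `question` is visible below; `image_url` is implied by the default-URL fallback.)

import requests

url = "https://your-workspace--example-sgl-vlm-model-generate.modal.run"  # hypothetical

response = requests.post(
    url,
    json={
        "image_url": "https://modal-public-assets.s3.amazonaws.com/golden-gate-bridge.jpg",
        "question": "What bridge is this?",
    },
)
response.raise_for_status()
print(response.text)  # the VLM's answer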

@@ -129,6 +132,8 @@ def start_runtime(self):
 
     @modal.web_endpoint(method="POST", docs=True)
     def generate(self, request: dict):
+        from pathlib import Path
+
         import sglang as sgl
         from term_image.image import from_file
 
@@ -140,18 +145,16 @@ def generate(self, request: dict):
         if image_url is None:
             image_url = "https://modal-public-assets.s3.amazonaws.com/golden-gate-bridge.jpg"
 
-        image_filename = image_url.split("/")[-1]
-        image_path = f"/tmp/{uuid4()}-{image_filename}"
         response = requests.get(image_url)
 
         response.raise_for_status()
 
-        with open(image_path, "wb") as file:
-            file.write(response.content)
+        image_filename = image_url.split("/")[-1]
+        image_path = Path(f"/tmp/{uuid4()}-{image_filename}")
+        image_path.write_bytes(response.content)
 
         @sgl.function
         def image_qa(s, image_path, question):
-            s += sgl.user(sgl.image(image_path) + question)
+            s += sgl.user(sgl.image(str(image_path)) + question)
             s += sgl.assistant(sgl.gen("answer"))
 
         question = request.get("question")
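(The diff is truncated here. In SGLang's frontend language, a @sgl.function program is typically invoked via its .run method, with generated text read off the returned state; a hedged sketch of what presumably follows:)

state = image_qa.run(
    image_path=image_path,
    question=question,
    max_new_tokens=128,  # hypothetical cap
)
answer = state["answer"]  # text filled in for sgl.gen("answer")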
