vLLM as inference server
gongy committed Dec 13, 2023
1 parent abbef63 commit 802dc28
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions 06_gpu_and_ml/vllm_mixtral.py
@@ -16,8 +16,8 @@
#
# First we import the components we need from `modal`.

-import time
import os
+import time

from modal import Image, Stub, gpu, method

@@ -88,7 +88,7 @@ def download_model_to_folder():
    gpu=GPU_CONFIG,
    timeout=60 * 10,
    container_idle_timeout=60 * 10,
-    allow_concurrent_inputs=10
+    allow_concurrent_inputs=10,
)
class Model:
    def __enter__(self):
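For context, the keyword arguments in this hunk belong to the `@stub.cls(...)` decorator wrapping the `Model` class, and `allow_concurrent_inputs=10` is what lets one GPU container interleave several requests when serving vLLM. A minimal sketch of that surrounding structure, with an assumed app name and a placeholder GPU config and method body (only the decorator arguments and class skeleton come from this diff):

from modal import Stub, gpu, method

stub = Stub("vllm-mixtral")  # assumed app name, inferred from the deployment URL below
GPU_CONFIG = gpu.A100(count=2)  # placeholder; the example defines its own GPU_CONFIG


@stub.cls(
    gpu=GPU_CONFIG,
    timeout=60 * 10,
    container_idle_timeout=60 * 10,
    allow_concurrent_inputs=10,  # one container may overlap up to 10 inputs
)
class Model:
    def __enter__(self):
        # The real example initializes the vLLM engine here.
        self.ready = True

    @method()
    def completion_stream(self, question: str):
        # The real example streams tokens from vLLM; this stub only echoes.
        yield f"echo: {question}"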
@@ -168,6 +168,7 @@ def main():
    for text in model.completion_stream.remote_gen(question):
        print(text, end="", flush=True)

+
# ## Deploy and invoke the model
# Once we deploy this model with `modal deploy text_generation_inference.py`,
# we can invoke inference from other apps, sharing the same pool
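The cross-app invocation these comments describe is folded out of the hunk; one way it might look from another Python process, assuming the deployed app is named "vllm-mixtral" and that `modal.Cls.lookup` is available in the installed Modal version:

import modal

# Look up the deployed class by app and class name, then stream a completion.
Model = modal.Cls.lookup("vllm-mixtral", "Model")
for text in Model().completion_stream.remote_gen("What is the meaning of life?"):
    print(text, end="", flush=True)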
@@ -184,12 +185,13 @@ def main():
# ## Coupling a frontend web application
#
# We can stream inference from a FastAPI backend, also deployed on Modal.
-#
+#
# You can try our deployment [here](https://modal-labs--vllm-mixtral.modal.run).

-from modal import Mount, asgi_app
from pathlib import Path

+from modal import Mount, asgi_app
+
frontend_path = Path(__file__).parent / "llm-frontend"


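The reordered `Mount` and `asgi_app` imports feed the FastAPI frontend these comments mention; a hedged sketch of such a wrapper, reusing `stub`, `Model`, and `frontend_path` from the example file (the route path, mount target, and media type are assumptions, not taken from this diff):

from fastapi import FastAPI
from fastapi.responses import StreamingResponse


@stub.function(mounts=[Mount.from_local_dir(frontend_path, remote_path="/assets")])
@asgi_app()
def app():
    web_app = FastAPI()

    @web_app.get("/completion/{question}")
    def completion(question: str):
        # Proxy the Modal generator method as a streaming HTTP response.
        return StreamingResponse(
            Model().completion_stream.remote_gen(question),
            media_type="text/event-stream",
        )

    return web_app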