Skip to content

Commit

Permalink
Merge branch 'vllm-project:main' into vllmfp8mistral
Browse files Browse the repository at this point in the history
  • Loading branch information
akllm authored Nov 11, 2024
2 parents 6937cc7 + 9d5b4e4 commit 3518639
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 16 deletions.
1 change: 1 addition & 0 deletions docs/source/serving/integrations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ Integrations
deploying_with_dstack
serving_with_langchain
serving_with_llamaindex
serving_with_llamastack
42 changes: 42 additions & 0 deletions docs/source/serving/serving_with_llamastack.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
.. _run_on_llamastack:

Serving with Llama Stack
============================

vLLM is also available via `Llama Stack <https://github.com/meta-llama/llama-stack>`_.

To install Llama Stack, run

.. code-block:: console

    $ pip install llama-stack -q

Inference using OpenAI Compatible API
-------------------------------------

Then start the Llama Stack server, pointing it at your vLLM server with the following configuration:

.. code-block:: yaml

    inference:
      - provider_id: vllm0
        provider_type: remote::vllm
        config:
          url: http://127.0.0.1:8000

Please refer to `this guide <https://github.com/meta-llama/llama-stack/blob/main/docs/source/getting_started/distributions/self_hosted_distro/remote_vllm.md>`_ for more details on this remote vLLM provider.

Inference via Embedded vLLM
---------------------------

An `inline vLLM provider
<https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm>`_
is also available. Here is a sample configuration using that method:

.. code-block:: yaml

    inference:
      - provider_type: vllm
        config:
          model: Llama3.1-8B-Instruct
          tensor_parallel_size: 4
39 changes: 23 additions & 16 deletions tests/distributed/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import socket

import pytest
import ray
import torch

import vllm.envs as envs
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils import (cuda_device_count_stateless,
from vllm.utils import (cuda_device_count_stateless, get_open_port,
update_environment_variables)

from ..utils import multi_gpu_test
Expand Down Expand Up @@ -40,14 +42,13 @@ def test_cuda_device_count_stateless():
assert ray.get(actor.get_count.remote()) == 0


def cpu_worker(rank, WORLD_SIZE):
pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29500",
def cpu_worker(rank, WORLD_SIZE, port1, port2):
pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}",
rank=rank,
world_size=WORLD_SIZE)
if rank <= 2:
pg2 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29501",
rank=rank,
world_size=3)
pg2 = StatelessProcessGroup.create(
init_method=f"tcp://127.0.0.1:{port2}", rank=rank, world_size=3)
data = torch.tensor([rank])
data = pg1.broadcast_obj(data, src=2)
assert data.item() == 2
Expand All @@ -59,17 +60,16 @@ def cpu_worker(rank, WORLD_SIZE):
pg1.barrier()


def gpu_worker(rank, WORLD_SIZE):
def gpu_worker(rank, WORLD_SIZE, port1, port2):
torch.cuda.set_device(rank)
pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29502",
pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}",
rank=rank,
world_size=WORLD_SIZE)
pynccl1 = PyNcclCommunicator(pg1, device=rank)
pynccl1.disabled = False
if rank <= 2:
pg2 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29503",
rank=rank,
world_size=3)
pg2 = StatelessProcessGroup.create(
init_method=f"tcp://127.0.0.1:{port2}", rank=rank, world_size=3)
pynccl2 = PyNcclCommunicator(pg2, device=rank)
pynccl2.disabled = False
data = torch.tensor([rank]).cuda()
Expand All @@ -88,8 +88,8 @@ def gpu_worker(rank, WORLD_SIZE):
assert item == 18


def broadcast_worker(rank, WORLD_SIZE):
pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29504",
def broadcast_worker(rank, WORLD_SIZE, port1, port2):
pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}",
rank=rank,
world_size=WORLD_SIZE)
if rank == 2:
Expand All @@ -100,26 +100,33 @@ def broadcast_worker(rank, WORLD_SIZE):
pg1.barrier()


def allgather_worker(rank, WORLD_SIZE):
pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29505",
def allgather_worker(rank, WORLD_SIZE, port1, port2):
pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}",
rank=rank,
world_size=WORLD_SIZE)
data = pg1.all_gather_obj(rank)
assert data == list(range(WORLD_SIZE))
pg1.barrier()


# TODO: investigate why this test is flaky. It hangs during initialization.
@pytest.mark.skip("Skip the test because it is flaky.")
@multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize(
"worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker])
def test_stateless_process_group(worker):
port1 = get_open_port()
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("", port1))
port2 = get_open_port()
WORLD_SIZE = 4
from multiprocessing import get_context
ctx = get_context("fork")
processes = []
for i in range(WORLD_SIZE):
rank = i
processes.append(ctx.Process(target=worker, args=(rank, WORLD_SIZE)))
processes.append(
ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2)))
for p in processes:
p.start()
for p in processes:
Expand Down
2 changes: 2 additions & 0 deletions vllm/v1/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import time
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, List, Optional, Set
Expand Down Expand Up @@ -405,6 +406,7 @@ def load_model(self) -> None:
if self.use_cuda_graph:
# FIXME(woosuk): Currently, we do not use inductor to reduce the
# compilation time and any potential issues with the inductor.
os.environ["VLLM_CUSTOM_OPS"] = "all"
set_compilation_config(
CompilationConfig(
use_cudagraph=True,
Expand Down

0 comments on commit 3518639

Please sign in to comment.