From 199adbb7cfb7e8185c9418de56db006f8cf1c294 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 13 Aug 2024 21:52:58 -0700 Subject: [PATCH] [doc] update test script to include cudagraph (#7501) --- docs/source/getting_started/debugging.rst | 39 +++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index d7066f2325b3a..117a9dd666481 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -30,24 +30,59 @@ Here are some common issues that can cause hangs: .. code-block:: python + # Test PyTorch NCCL import torch import torch.distributed as dist dist.init_process_group(backend="nccl") local_rank = dist.get_rank() % torch.cuda.device_count() - data = torch.FloatTensor([1,] * 128).to(f"cuda:{local_rank}") + torch.cuda.set_device(local_rank) + data = torch.FloatTensor([1,] * 128).to("cuda") dist.all_reduce(data, op=dist.ReduceOp.SUM) torch.cuda.synchronize() value = data.mean().item() world_size = dist.get_world_size() assert value == world_size, f"Expected {world_size}, got {value}" + print("PyTorch NCCL is successful!") + + # Test PyTorch GLOO gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") cpu_data = torch.FloatTensor([1,] * 128) dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) value = cpu_data.mean().item() assert value == world_size, f"Expected {world_size}, got {value}" - print("sanity check is successful!") + print("PyTorch GLOO is successful!") + + # Test vLLM NCCL, with cuda graph + from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + + pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) + pynccl.disabled = False + + s = torch.cuda.Stream() + with torch.cuda.stream(s): + data.fill_(1) + pynccl.all_reduce(data, stream=s) + value = data.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" + + print("vLLM NCCL is successful!") + + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(cuda_graph=g, stream=s): + pynccl.all_reduce(data, stream=torch.cuda.current_stream()) + + data.fill_(1) + g.replay() + torch.cuda.current_stream().synchronize() + value = data.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" + + print("vLLM NCCL with cuda graph is successful!") + + dist.destroy_process_group(gloo_group) + dist.destroy_process_group() .. tip::