From 199adbb7cfb7e8185c9418de56db006f8cf1c294 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 13 Aug 2024 21:52:58 -0700
Subject: [PATCH] [doc] update test script to include cudagraph (#7501)

---
 docs/source/getting_started/debugging.rst | 39 +++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst
index d7066f2325b3a..117a9dd666481 100644
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@@ -30,24 +30,59 @@ Here are some common issues that can cause hangs:
 
 .. code-block:: python
 
+    # Test PyTorch NCCL
     import torch
     import torch.distributed as dist
     dist.init_process_group(backend="nccl")
     local_rank = dist.get_rank() % torch.cuda.device_count()
-    data = torch.FloatTensor([1,] * 128).to(f"cuda:{local_rank}")
+    torch.cuda.set_device(local_rank)
+    data = torch.FloatTensor([1,] * 128).to("cuda")
     dist.all_reduce(data, op=dist.ReduceOp.SUM)
     torch.cuda.synchronize()
     value = data.mean().item()
     world_size = dist.get_world_size()
     assert value == world_size, f"Expected {world_size}, got {value}"
 
+    print("PyTorch NCCL is successful!")
+
+    # Test PyTorch GLOO
     gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
     cpu_data = torch.FloatTensor([1,] * 128)
     dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
     value = cpu_data.mean().item()
     assert value == world_size, f"Expected {world_size}, got {value}"
 
-    print("sanity check is successful!")
+    print("PyTorch GLOO is successful!")
+
+    # Test vLLM NCCL, with cuda graph
+    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+
+    pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
+    pynccl.disabled = False
+
+    s = torch.cuda.Stream()
+    with torch.cuda.stream(s):
+        data.fill_(1)
+        pynccl.all_reduce(data, stream=s)
+        value = data.mean().item()
+        assert value == world_size, f"Expected {world_size}, got {value}"
+
+    print("vLLM NCCL is successful!")
+
+    g = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(cuda_graph=g, stream=s):
+        pynccl.all_reduce(data, stream=torch.cuda.current_stream())
+
+    data.fill_(1)
+    g.replay()
+    torch.cuda.current_stream().synchronize()
+    value = data.mean().item()
+    assert value == world_size, f"Expected {world_size}, got {value}"
+
+    print("vLLM NCCL with cuda graph is successful!")
+
+    dist.destroy_process_group(gloo_group)
+    dist.destroy_process_group()
 
 .. tip::