Gradient bucketing using a pre-defined bucket size cap #6417

Closed · wants to merge 10 commits
test/run_tests.sh (7 additions, 6 deletions)

@@ -296,12 +296,13 @@ function run_mp_op_tests {
run_test "$CDIR/test_fsdp_auto_wrap.py"
run_torchrun "$CDIR/test_mp_early_exit.py"
run_pt_xla_debug "$CDIR/debug_tool/test_mp_pt_xla_debug.py"
run_test "$CDIR/torch_distributed/test_torch_distributed_all_gather_xla_backend.py"
run_test "$CDIR/torch_distributed/test_torch_distributed_all_reduce_xla_backend.py"
run_test "$CDIR/torch_distributed/test_torch_distributed_multi_all_reduce_xla_backend.py"
run_test "$CDIR/torch_distributed/test_torch_distributed_reduce_scatter_xla_backend.py"
run_test "$CDIR/torch_distributed/test_ddp.py"
run_test "$CDIR/torch_distributed/test_torch_distributed_fsdp_meta.py"
run_xla_backend_mp "$CDIR/test_torch_distributed_all_gather_xla_backend.py"
run_xla_backend_mp "$CDIR/test_torch_distributed_all_reduce_xla_backend.py"
run_xla_backend_mp "$CDIR/test_torch_distributed_bucketed_all_reduce_xla_backend.py"
run_xla_backend_mp "$CDIR/test_torch_distributed_multi_all_reduce_xla_backend.py"
run_xla_backend_mp "$CDIR/test_torch_distributed_reduce_scatter_xla_backend.py"
run_xla_backend_mp "$CDIR/test_ddp.py"
run_xla_backend_mp "$CDIR/test_torch_distributed_fsdp_meta.py"
}

function run_tests {
test/test_torch_distributed_bucketed_all_reduce_xla_backend.py (new file, 35 additions)

@@ -0,0 +1,35 @@
import os
import sys
import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.xla_backend
import torch.distributed as dist


def _mp_fn(index):
  device = xm.xla_device()
  if xm.xla_device_hw(device) in ('TPU', 'CUDA'):
    world_size = xm.xrt_world_size()
    rank = xm.get_ordinal()

    dist.init_process_group('xla', world_size=world_size, rank=rank)

    # Create tensors of increasing size and fill tensor j with the value j.
    tensor_list = [
        torch.empty((i, i), device=device) for i in range(1, 1000, 101)
    ]
    for j, t in enumerate(tensor_list):
      t.fill_(float(j))
    dist.bucketed_allreduce(tensor_list)
    # After the SUM all-reduce, every element of tensor j equals j * world_size.
    for j, t in enumerate(tensor_list):
      assert torch.all(torch.eq(t.cpu(),
                                float(j) * world_size)) == torch.tensor(True)
  else:
    print(
        'Default device {} is not a TPU or GPU device'.format(device),
        file=sys.stderr)


if __name__ == '__main__':
  xmp.spawn(_mp_fn, args=())
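For context on the assertion above: every rank fills tensor j with the value j, so a SUM all-reduce across world_size processes leaves each element equal to j * world_size. A tiny standalone illustration of that arithmetic (plain PyTorch on CPU, no XLA; the world size of 4 is an assumed value for the example, the test itself uses xm.xrt_world_size()):

import torch

world_size = 4  # assumed for the illustration
j = 3.0
# What a SUM all-reduce produces when each of the world_size ranks contributes
# a tensor filled with the value j.
reduced = sum(torch.full((2, 2), j) for _ in range(world_size))
assert torch.all(torch.eq(reduced, j * world_size))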
torch_xla/core/xla_model.py (45 additions, 6 deletions)

@@ -18,6 +18,7 @@
 import torch_xla.debug.metrics_saver as ms
 import torch_xla.utils.utils as xu
 import torch_xla.utils.closures as xc
+import os

 _DEVICES = xu.LazyProperty(lambda: torch_xla._XLAC._xla_get_devices())

@@ -1123,6 +1124,36 @@ def wait_device_ops(devices=[]):
   torch_xla._XLAC._xla_wait_device_ops(devices=devices)


+def bucketed_allreduce(gradients, bucket_cap, count, groups=None, pin_layout=True):

Collaborator: Maybe name it similar to the original function all_reduce? How about all_reduce_bucketized? Also, do you need to pass "groups" and "pin_layout" also?

+  total = 0
+  tensor_bucket = []
+
+  for grad in gradients:
+    grad_bytes = grad.numel() * grad.element_size()
+
+    # Bucketize till the total spills over
+    total += grad_bytes
+    if total > bucket_cap and len(tensor_bucket) > 0:
+      all_reduce(
+          REDUCE_SUM,
+          tensor_bucket,
+          scale=1.0 / count,
+          groups=groups,
+          pin_layout=pin_layout)
+      total = grad_bytes
+      tensor_bucket = []
+    tensor_bucket.append(grad)
+
+  # Flush the last remaining bucket
+  if len(tensor_bucket):
+    all_reduce(
+        REDUCE_SUM,
+        tensor_bucket,
+        scale=1.0 / count,
+        groups=groups,
+        pin_layout=pin_layout)
+
+
 def reduce_gradients(optimizer, groups=None, pin_layout=True):
   """Reduces all the gradients handled by an optimizer.

@@ -1140,12 +1171,20 @@ def reduce_gradients(optimizer, groups=None, pin_layout=True):
   count = xrt_world_size()
   if count > 1:
     gradients = _fetch_gradients(optimizer)
Collaborator: Can we keep the original behavior? And maybe use a flag to turn this feature on?

Collaborator Author: OK, let me work on that.

Collaborator: Maybe we should introduce an argument "bucket_cap_mb" that turns this on, instead of an environment variable? bucket_cap_mb=0 turns off bucketing and is the default?

Collaborator Author: Done.

-    all_reduce(
-        REDUCE_SUM,
-        gradients,
-        scale=1.0 / count,
-        groups=groups,
-        pin_layout=pin_layout)
+    bucket_cap = int(os.getenv('ALLREDUCE_BUCKET_SIZE_MB', 0)) * 1024 * 1024
+    # Reverse the gradients list so that we start allreduce from the last layer
+    # onwards. This allows allreduce to trigger as soon as the bucket fills up
+    # and overlap with the backward pass.
+    if bucket_cap > 0:
+      gradients = reversed(gradients)
+      bucketed_allreduce(gradients, bucket_cap, count, groups=groups, pin_layout=pin_layout)
+    else:
+      all_reduce(
+          REDUCE_SUM,
+          gradients,
+          scale=1.0 / count,
+          groups=groups,
+          pin_layout=pin_layout)


 def optimizer_step(optimizer,
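The review thread above suggests renaming the helper to all_reduce_bucketized and gating the feature with a bucket_cap_mb argument rather than an environment variable. A rough sketch of what that API could look like (hypothetical signature, not the code in this diff; it only reuses xm.all_reduce and xm.REDUCE_SUM, which exist in torch_xla.core.xla_model):

import torch_xla.core.xla_model as xm


def all_reduce_bucketized(tensors,
                          bucket_cap_mb=0,
                          scale=1.0,
                          groups=None,
                          pin_layout=True):
  """Hypothetical sketch: sum-all-reduce `tensors` in size-capped buckets.

  bucket_cap_mb=0 (the default) falls back to a single xm.all_reduce call,
  matching the reviewer's suggestion that bucketing be opt-in.
  """
  if bucket_cap_mb <= 0:
    xm.all_reduce(
        xm.REDUCE_SUM, tensors, scale=scale, groups=groups,
        pin_layout=pin_layout)
    return

  bucket_cap = bucket_cap_mb * 1024 * 1024
  bucket, total = [], 0
  for t in tensors:
    nbytes = t.numel() * t.element_size()
    if bucket and total + nbytes > bucket_cap:
      # Flush the current bucket before it spills over the cap.
      xm.all_reduce(
          xm.REDUCE_SUM, bucket, scale=scale, groups=groups,
          pin_layout=pin_layout)
      bucket, total = [], 0
    bucket.append(t)
    total += nbytes
  if bucket:
    xm.all_reduce(
        xm.REDUCE_SUM, bucket, scale=scale, groups=groups,
        pin_layout=pin_layout)

Compared with the diff above, the only behavioral difference is that the cap arrives as an argument instead of ALLREDUCE_BUCKET_SIZE_MB, which keeps the default path identical to the existing single all_reduce.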
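For reference, a minimal training-step sketch showing how the change in this diff would be exercised: ALLREDUCE_BUCKET_SIZE_MB is read inside reduce_gradients(), so setting it before the step enables bucketing, while leaving it unset (the default of 0) keeps the original single all_reduce. The 50 MB cap and the toy loss are illustrative values, not part of the PR:

import os

import torch
import torch.nn as nn
import torch_xla.core.xla_model as xm

# Cap from this PR's env var; 0 / unset keeps the original un-bucketed path.
os.environ['ALLREDUCE_BUCKET_SIZE_MB'] = '50'


def train_step(model, optimizer, data, target):
  optimizer.zero_grad()
  loss = nn.functional.mse_loss(model(data), target)
  loss.backward()
  # optimizer_step() calls reduce_gradients(), which with this change buckets
  # the gradient all-reduce whenever the cap above is > 0.
  xm.optimizer_step(optimizer)
  xm.mark_step()

The model, data, and target are assumed to already live on the XLA device (xm.xla_device()).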