From 235c563c0f1648e9a6d048a15146b9b34bd106b2 Mon Sep 17 00:00:00 2001
From: Sebastian Hoffmann
Date: Thu, 28 Mar 2024 17:19:16 +0100
Subject: [PATCH] fix: use gpu/nccl even when running without slurm

---
Review note: 'backend' is now popped from kwargs instead of read with
kwargs.get(). With get(), a caller-supplied backend stayed in kwargs and was
passed to dist.init_process_group() a second time via **kwargs, raising
"TypeError: got multiple values for keyword argument 'backend'".

 dmlcloud/util/distributed.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/dmlcloud/util/distributed.py b/dmlcloud/util/distributed.py
index b76d211..4db2e65 100644
--- a/dmlcloud/util/distributed.py
+++ b/dmlcloud/util/distributed.py
@@ -79,14 +79,18 @@ def print_worker(msg, barrier=True, flush=True):
         dist.barrier()
 
 
-def init_process_group_dummy():
+def init_process_group_dummy(**kwargs):
     """
     Initializes the process group with a single process.
     Uses HashStore under the hood.
     Useful for applications that only run on a single gpu.
     """
+    # pop (not get): backend is passed explicitly below, so it must not also remain in **kwargs
+    backend = kwargs.pop('backend', None)
+    if backend is None:
+        backend = 'cpu:gloo,cuda:nccl' if dist.is_nccl_available() else 'gloo'
     store = dist.HashStore()
-    dist.init_process_group(store=store, rank=0, world_size=1, backend='gloo')
+    dist.init_process_group(store=store, rank=0, world_size=1, backend=backend, **kwargs)
 
 
 def init_process_group_MPI(ip_idx=0, port=None, **kwargs):