From b58d68c14ecb9e9d61f5c363da15c547d3915526 Mon Sep 17 00:00:00 2001 From: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com> Date: Fri, 12 Apr 2024 09:15:28 -0700 Subject: [PATCH 1/2] Revert "Update config_moe_args.py (#1104)" (#1111) This reverts commit 17f8aeb0d3dd85e3f4987d249f6d36efa70e3a63. --- llmfoundry/models/utils/config_moe_args.py | 29 +++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/llmfoundry/models/utils/config_moe_args.py b/llmfoundry/models/utils/config_moe_args.py index b69cd18348..3386204e26 100644 --- a/llmfoundry/models/utils/config_moe_args.py +++ b/llmfoundry/models/utils/config_moe_args.py @@ -12,6 +12,33 @@ from llmfoundry.models.layers.ffn import resolve_ffn_hidden_size +def create_process_group_ranks(ranks: tuple[int]): + """Creates a new distributed group. + + Used in create_set_process_group and create_mod_process_group methods below. + + This function is an alternative to `distributed.new_group(ranks)`. + When working with FSDP in torch1.13.1, using `distributed.new_group(ranks)` + resulted in an error but this method worked. + + TODO(GRT-2416): When composer no longer has support for torch1.13.1, we should + consider using `distributed.new_group(ranks)` here and in composer's FSDP + custom process group init. + + Args: + ranks (tuple[int]): Tuple of ranks of group members. + + Returns: + A handle of distributed group that can be given to collective calls. + """ + ranks_gather_list = [None for _ in range(distributed.get_world_size())] + distributed.all_gather_object(ranks_gather_list, ranks) + ranks_per_subgroup = list(set(ranks_gather_list)) + group, _ = distributed.distributed_c10d.new_subgroups_by_enumeration( + ranks_per_subgroup) + return group + + def create_set_process_group(k: int): """Creates a new distributed group using sets of k GPUs. @@ -33,7 +60,7 @@ def create_set_process_group(k: int): raise RuntimeError(f'{world_size=} must be divisible by {k=}.') start = distributed.get_rank() // k * k ranks = tuple(range(start, start + k)) - return distributed.new_group(ranks) + return create_process_group_ranks(ranks) def config_megablocks_moe_args( From 6257e5b92b76c85772760f2051dde70bf1dee6c6 Mon Sep 17 00:00:00 2001 From: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com> Date: Fri, 12 Apr 2024 09:45:13 -0700 Subject: [PATCH 2/2] Update config_moe_args.py (#1112) https://github.com/mosaicml/llm-foundry/pull/1111 needed to revert https://github.com/mosaicml/llm-foundry/pull/1104 because the #1104 PR caused issues. Removing TODO and marking Jira with wont-do --- llmfoundry/models/utils/config_moe_args.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llmfoundry/models/utils/config_moe_args.py b/llmfoundry/models/utils/config_moe_args.py index 3386204e26..1f7132c281 100644 --- a/llmfoundry/models/utils/config_moe_args.py +++ b/llmfoundry/models/utils/config_moe_args.py @@ -18,12 +18,6 @@ def create_process_group_ranks(ranks: tuple[int]): Used in create_set_process_group and create_mod_process_group methods below. This function is an alternative to `distributed.new_group(ranks)`. - When working with FSDP in torch1.13.1, using `distributed.new_group(ranks)` - resulted in an error but this method worked. - - TODO(GRT-2416): When composer no longer has support for torch1.13.1, we should - consider using `distributed.new_group(ranks)` here and in composer's FSDP - custom process group init. Args: ranks (tuple[int]): Tuple of ranks of group members.