huggingface · pacman100 · Oct 12, 2023 · Oct 6, 2023 · Oct 6, 2023 · Oct 11, 2023
diff --git a/docs/source/usage_guides/fsdp.md b/docs/source/usage_guides/fsdp.md
@@ -96,6 +96,8 @@ all-gather while executing in the forward pass. only use with Static graphs.
 Useful in cases such as parameter-efficient fine-tuning. 
 Please refer this [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019)
 
+`CPU RAM Efficient Model loading`: If True, only the first process loads the pretrained model checkpoint while all other processes have empty weights. Only applicable for 🤗 Transformers models. This should be set to False if you experience errors when loading the pretrained 🤗 Transformers model via the `from_pretrained` method. When using this, `Sync Module States` needs to be True; otherwise all the processes except the main process would have random empty weights, leading to unexpected behaviour during training.
+
 `Sync Module States`: If True, each individually wrapped FSDP unit will broadcast module parameters from rank 0
 ```
 

diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py
@@ -386,12 +386,21 @@ def get_cluster_input():
                 default=False,
                 error_message="Please enter yes or no.",
             )
-            fsdp_config["fsdp_sync_module_states"] = _ask_field(
-                "Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start? [YES/no]: ",
+            fsdp_config["fsdp_cpu_ram_efficient_loading"] = _ask_field(
+                "Do you want to enable CPU RAM efficient model loading? Only applicable for 🤗 Transformers models. [YES/no]: ",
                 _convert_yes_no_to_bool,
                 default=True,
                 error_message="Please enter yes or no.",
             )
+            if fsdp_config["fsdp_cpu_ram_efficient_loading"]:
+                fsdp_config["fsdp_sync_module_states"] = True
+            else:
+                fsdp_config["fsdp_sync_module_states"] = _ask_field(
+                    "Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start? [YES/no]: ",
+                    _convert_yes_no_to_bool,
+                    default=True,
+                    error_message="Please enter yes or no.",
+                )
 
     megatron_lm_config = {}
     if distributed_type in [DistributedType.MULTI_GPU]:

diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py
@@ -524,6 +524,14 @@ def launch_command_parser(subparsers=None):
         help="If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres."
         " (useful only when `use_fsdp` flag is passed).",
     )
+    fsdp_args.add_argument(
+        "--fsdp_cpu_ram_efficient_loading",
+        default="true",
+        type=str,
+        help="If True, only the first process loads the pretrained model checkoint while all other processes have empty weights. "
+        "Only applicable for 🤗 Transformers. When using this, `--fsdp_sync_module_states` needs to True. "
+        "(useful only when `use_fsdp` flag is passed).",
+    )
     fsdp_args.add_argument(
         "--fsdp_sync_module_states",
         default="true",

diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py
@@ -174,6 +174,9 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]:
 
     if args.use_fsdp:
         current_env["ACCELERATE_USE_FSDP"] = "true"
+        if args.fsdp_cpu_ram_efficient_loading and not args.fsdp_sync_module_states:
+            raise ValueError("When using `--fsdp_cpu_ram_efficient_loading` set `--fsdp_sync_module_states` to `True`")
+
         current_env["FSDP_SHARDING_STRATEGY"] = str(args.fsdp_sharding_strategy)
         current_env["FSDP_OFFLOAD_PARAMS"] = str(args.fsdp_offload_params).lower()
         current_env["FSDP_MIN_NUM_PARAMS"] = str(args.fsdp_min_num_params)
@@ -187,6 +190,7 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]:
             current_env["FSDP_STATE_DICT_TYPE"] = str(args.fsdp_state_dict_type)
         current_env["FSDP_FORWARD_PREFETCH"] = str(args.fsdp_forward_prefetch).lower()
         current_env["FSDP_USE_ORIG_PARAMS"] = str(args.fsdp_use_orig_params).lower()
+        current_env["FSDP_CPU_RAM_EFFICIENT_LOADING"] = str(args.fsdp_cpu_ram_efficient_loading).lower()
         current_env["FSDP_SYNC_MODULE_STATES"] = str(args.fsdp_sync_module_states).lower()
 
     if args.use_megatron_lm: