diff --git a/examples/slurm/fsdp_config.yaml b/examples/slurm/fsdp_config.yaml new file mode 100644 index 00000000000..1708fe20958 --- /dev/null +++ b/examples/slurm/fsdp_config.yaml @@ -0,0 +1,12 @@ +distributed_type: FSDP +fsdp_config: + fsdp_activation_checkpointing: false + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: true diff --git a/examples/slurm/submit_multinode_fsdp.sh b/examples/slurm/submit_multinode_fsdp.sh new file mode 100644 index 00000000000..1f8ab5c914a --- /dev/null +++ b/examples/slurm/submit_multinode_fsdp.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +#SBATCH --job-name=multinode +#SBATCH -D . +#SBATCH --output=O-%x.%j +#SBATCH --error=E-%x.%j +#SBATCH --nodes=4 # number of nodes +#SBATCH --ntasks-per-node=1 # number of MP tasks +#SBATCH --gres=gpu:4 # number of GPUs per node +#SBATCH --cpus-per-task=160 # number of cores per task +#SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS) + +###################### +### Set environment ## +###################### +source activateEnvironment.sh +export GPUS_PER_NODE=4 +###################### + +###################### +#### Set network ##### +###################### +head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +###################### +export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}" + +export LAUNCHER="accelerate launch \ + --config_file ${ACCELERATE_DIR}/examples/slurm/fsdp_config.yaml \ + --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \ + --num_machines $SLURM_NNODES \ + --rdzv_backend c10d \ + --main_process_ip $head_node_ip \ + --main_process_port 29500 \ + " +export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py" +export SCRIPT_ARGS=" \ + --mixed_precision fp16 \ + --output_dir 
${ACCELERATE_DIR}/examples/output \ + " + +# This step is necessary because accelerate launch does not handle multiline arguments properly +export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS" +srun $CMD \ No newline at end of file diff --git a/src/accelerate/commands/config/config_args.py b/src/accelerate/commands/config/config_args.py index 3039e45054b..a3991b2808d 100644 --- a/src/accelerate/commands/config/config_args.py +++ b/src/accelerate/commands/config/config_args.py @@ -177,7 +177,7 @@ def __post_init__(self): @dataclass class ClusterConfig(BaseConfig): - num_processes: int + num_processes: int = -1 # For instance if we use SLURM and the user manually passes it in machine_rank: int = 0 num_machines: int = 1 gpu_ids: Optional[str] = None diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 9c937996a93..b4f606f493a 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -1074,6 +1074,8 @@ def _validate_launch_command(args): # Silently set the default here if args.dynamo_backend is None: args.dynamo_backend = "no" + if args.num_processes == -1: + raise ValueError("You need to manually pass in `--num_processes` using this config yaml.") else: if args.num_processes is None: if args.use_xpu and is_xpu_available():