diff --git a/launcher.py b/launcher.py
index 03a29754..e0e9c4cb 100644
--- a/launcher.py
+++ b/launcher.py
@@ -135,7 +135,7 @@ def launch_slurm_job(launch_file_contents, *args):
         )
     ).replace(".", "p")

-    print(f"🏋️ Model has {num_params} parameters")
+    print(f"🏋️ Model has {num_params} parameters")

     # Do we have a SLURM task ID?
     # You can SLURM_ARRAY_TASK_ID to run multiple runs with predefined HP
@@ -206,6 +206,14 @@ def launch_slurm_job(launch_file_contents, *args):
         train_steps=100,
         val_check_interval=-1,
     )
+    BS = tokens.micro_batch_size*tokens.batch_accumulation_per_replica*tokens.sequence_length
+    GBS = BS * parallelism.dp
+
+    total_tokens = tokens.train_steps * GBS
+    total_tokens_billions = total_tokens / 1e9
+    print(f"📙 Number of tokens: {total_tokens_billions:.2f} billion")
+
+
     model = ModelArgs(
         model_config=model_config,
@@ -232,7 +240,23 @@ def launch_slurm_job(launch_file_contents, *args):
         lr_decay_starting_step= 80,
         min_decay_lr=0,
     )
-
+    # Calculate and print learning rate and global batch size information
+    lr_initial = learning_rate_scheduler.learning_rate
+    lr_min = learning_rate_scheduler.min_decay_lr
+    lr_warmup_steps = learning_rate_scheduler.lr_warmup_steps
+    lr_decay_steps = learning_rate_scheduler.lr_decay_steps
+    lr_decay_start = learning_rate_scheduler.lr_decay_starting_step
+    lr_decay_style = learning_rate_scheduler.lr_decay_style
+
+    print(f"📊 Learning Rate Schedule:")
+    print(f"  Initial LR: {lr_initial:.2e}")
+    print(f"  Warmup: {learning_rate_scheduler.lr_warmup_style} increase over {lr_warmup_steps} steps")
+    if lr_decay_start != lr_warmup_steps:
+        print(f"  Constant LR until step {lr_decay_start}")
+    print(f"  {lr_decay_style.capitalize()} decay from step {lr_decay_start} to {lr_decay_start + lr_decay_steps}")
+    print(f"  Final LR: {lr_min:.2e}")
+
+    print(f"🚚 Global Batch Size: {GBS:,} tokens")
     optimizer = OptimizerArgs(
         zero_stage=0,
         weight_decay=0.01,
@@ -262,13 +286,11 @@ def launch_slurm_job(launch_file_contents, *args):
     data_stages=[
         DatasetStageArgs(
             data=DataArgs(
-                dataset=NanosetDatasetsArgs(
-                    dataset_folder={
-                        "/fsx/elie_bakouch/nanotron/datasets/cosmopedia-v2":0.7,
-                        "/fsx/elie_bakouch/nanotron/datasets/fineweb-edu-dedup":0.3,
-                    },
-                ),
-                seed=general.seed,
+                dataset=NanosetDatasetsArgs(
+                    dataset_folder="/fsx/elie_bakouch/nanotron/datasets/cosmopedia-v2",
+                ),
+                num_loading_workers=0,
+                seed=general.seed,
             ),
             name="training stage",
             start_training_step=1,
@@ -299,7 +321,6 @@ def launch_slurm_job(launch_file_contents, *args):

     os.makedirs(f"{config.slurm.slurm_logs_path}/", exist_ok=True)

-    sbatch_script = f"""#!/bin/bash
 #SBATCH --job-name={slurm.job_name}
 #SBATCH --nodes={slurm.nodes}
 #SBATCH --ntasks-per-node={slurm.n_tasks_per_node}  # crucial - only 1 task per dist per node!
@@ -313,26 +334,39 @@ def launch_slurm_job(launch_file_contents, *args):
 #SBATCH --mail-type=ALL
 #SBATCH --mail-user={slurm.mail}
 #SBATCH --requeue
-
+    sbatch_script = f"""#!/bin/bash
+#SBATCH --job-name=test
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=1  # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:8
+#SBATCH --partition=hopper-prod
+#SBATCH --output=/fsx/elie_bakouch/nanotron/debug/main/train-{timestamp}-%x-%j.out
+#SBATCH --qos=high
+#SBATCH --begin=now+0minutes
+#SBATCH --mail-type=ALL
+set -x -e

 TRAINER_PYTHON_FILE=/fsx/elie_bakouch/nanotron/run_train.py
 nvidia-smi
-set -x -e

 source ~/.bashrc
 source /fsx/elie_bakouch/miniconda3/etc/profile.d/conda.sh
-conda activate {config.slurm.conda_env_path} #Modify this line if you use something different than conda
+conda activate /fsx/elie_bakouch/miniconda3/envs/smollm #Modify this line if you use something different than conda

-module load cuda/12.1
-
-echo "START TIME: $(date)"
 #Show some environment variables
 echo python3 version = `python3 --version`
 echo "NCCL version: $(python -c "import torch;print(torch.cuda.nccl.version())")"
 echo "CUDA version: $(python -c "import torch;print(torch.version.cuda)")"
+
+echo "START TIME: $(date)"
+secs_to_human(){{
+    echo "$(( ${{1}} / 3600 )):$(( (${{1}} / 60) % 60 )):$(( ${{1}} % 60 ))"
+}}
 start=$(date +%s)
 echo "$(date -d @${{start}} "+%Y-%m-%d %H:%M:%S"): ${{SLURM_JOB_NAME}} start id=${{SLURM_JOB_ID}}\n"
+
 # SLURM stuff
 export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
 export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
@@ -342,6 +376,8 @@ def launch_slurm_job(launch_file_contents, *args):
 export TMPDIR=/scratch
 export CUDA_DEVICE_MAX_CONNECTIONS="1"

+module load cuda/12.1
+
 echo go $COUNT_NODE
 echo $HOSTNAMES
@@ -353,8 +389,11 @@ def launch_slurm_job(launch_file_contents, *args):
 "
 export LAUNCHER="python -u -m torch.distributed.run \
-    --nproc_per_node {config.slurm.gpu_per_node} \
+    --nproc_per_node 8 \
     --nnodes $COUNT_NODE \
+    --rdzv-backend etcd-v2 \
+    --rdzv-endpoint etcd.hpc-cluster-hopper.hpc.internal.huggingface.tech:2379 \
+    --rdzv-id $SLURM_JOB_ID \
     --node_rank $SLURM_PROCID \
     --role $SLURMD_NODENAME: \
     --max_restarts 0 \