Commit

add torchrun launch; add simPO
xiyang-aads-lilly committed Jun 1, 2024
1 parent 05a064a commit 3528a23
Showing 12 changed files with 721 additions and 25 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -169,4 +169,5 @@ wandb/
experiments/*
!experiments/.gitkeep
!experiments/demo*
!experiments/README.md
!experiments/util.sh
17 changes: 16 additions & 1 deletion .pre-commit-config.yaml
@@ -9,10 +9,25 @@ repos:
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
# we do not set python version so it will use default

- id: black-jupyter
# # It is recommended to specify the latest version of Python
# # supported by your project here, or alternatively use
# # pre-commit's default_language_version, see
# # https://pre-commit.com/#top_level-default_language_version
# language_version: python3.11

# - repo: https://github.com/gitleaks/gitleaks
# rev: v8.18.2 # Specify the desired version of Gitleaks
# hooks:
# - id: gitleaks

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: check-added-large-files
- id: check-merge-conflict
- id: detect-private-key # if this works well we can avoid using gitleaks
- id: end-of-file-fixer
- id: requirements-txt-fixer
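
With these hooks configured, they can be exercised locally before committing; a quick sketch using the standard pre-commit CLI:

```sh
# one-time: install the hook into this clone's .git/hooks
pre-commit install

# run every configured hook against the entire tree
pre-commit run --all-files
```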
36 changes: 28 additions & 8 deletions experiments/demo_dgx2.sh
@@ -1,4 +1,4 @@
#!/usr/bin/bash

ROOT=$(realpath ~)

@@ -7,6 +7,7 @@ echo activate virtual ENV
PYTHON_ENV=${ROOT}/project/scripts/v2306.sh
source $PYTHON_ENV

# pip freeze

# CUDA
export CUDA_VISIBLE_DEVICES=0,1
@@ -21,10 +22,12 @@ export HF_DATASETS_CACHE="${ROOT}/project/.cache/dataset"
export HF_HOME="${ROOT}/project/.cache/"

# Wandb
export WANDB_API_KEY=""
# export WANDB_API_KEY="<key>"
export WANDB_API_KEY="05411100e08ac02e3fcbdc821b4116cf1c066e99"
export WANDB_USERNAME="xi-yang5"
export WANDB_PROJECT="demo_dgx2"
# export WANDB_API_KEY=""
# export WANDB_USERNAME=""
# export WANDB_PROJECT=""
export WANDB_LOG_MODEL="false"
export WANDB_WATCH="false"

@@ -33,21 +36,38 @@ export TORCH_DISTRIBUTED_DEBUG=INFO
export NCCL_DEBUG=INFO
# export NCCL_SOCKET_NTHREADS=16

export ACCELERATE_LOG_LEVEL=debug
export ACCELERATE_DEBUG_MODE="1"
export DEEPSPEED_TIMEOUT=120

# get this script location
SCRIPT=$(readlink -f "$0")
SCRIPTPATH=$(dirname "$SCRIPT")

# accelerate launch
# accelerate launch \
# --config_file ${ROOT}/project/alignment_handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
# --num_processes $WORLD_SIZE \
# --tee 3 \
# ${ROOT}/project/alignment_handbook/scripts/run_sft.py \
# ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml
# ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_full.yaml
# ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml


# deepspeed launch
# torch launch
# source ${SCRIPTPATH}/util.sh
# --master_addr=$PRIMARY --master_port=$PRIMARY_PORT
# python -m torch.distributed.run

# need to add the virtual env package path to PYTHONPATH
export PYTHONPATH=${ROOT}/project/pyenv/2306/lib/python3.10/site-packages
torchrun --nproc_per_node=$WORLD_SIZE --nnode=1 --node_rank=0 \
${ROOT}/project/alignment_handbook/scripts/run_sft.py \
${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml \
--deepspeed=${ROOT}/project/alignment_handbook/recipes/accelerate_configs/deepspeed_zs2.json \
--tee=2 >> ${SCRIPTPATH}/log.txt

# torch launch
# python -m torch.distributed.run --nproc_per_node=$WORLD_SIZE --nnode=1 --node_rank=0 \
# ${ROOT}/project/alignment_handbook/scripts/run_sft.py \
# ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml \
# --deepspeed=${ROOT}/project/alignment_handbook/recipes/accelerate_configs/deepspeed_zs2.json \
# --tee=2
34 changes: 34 additions & 0 deletions experiments/util.sh
@@ -0,0 +1,34 @@
# for SLURM use
get_unused_port() {
# Well-known ports end at 1023. On Linux, dynamic ports start at 32768
# (see /proc/sys/net/ipv4/ip_local_port_range).
local MIN_PORT=10001
local MAX_PORT=32767

local USED_PORTS=$(netstat -a -n -t | tail -n +3 | tr -s ' ' | \
cut -d ' ' -f 4 | sed 's/.*:\([0-9]\+\)$/\1/' | sort -n | uniq)

# Generate random port numbers within the search range (inclusive) until we
# find one that isn't in use.
local RAN_PORT
while
RAN_PORT=$(shuf -i ${MIN_PORT}-${MAX_PORT} -n 1)
# note: unanchored regex does a substring match against USED_PORTS; in the
# 10001-32767 range every candidate is 5 digits, so a false hit just redraws
[[ "$USED_PORTS" =~ $RAN_PORT ]]
do
continue
done

echo $RAN_PORT
}

init_node_info() {
export PRIMARY=$(hostname -s)
SECONDARIES=$(scontrol show hostnames $SLURM_JOB_NODELIST | \
grep -v $PRIMARY)

ALL_NODES="$PRIMARY $SECONDARIES"
export PRIMARY_PORT=$(get_unused_port)
echo $PRIMARY $SECONDARIES $PRIMARY_PORT
}

init_node_info
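
The commented hints in demo_dgx2.sh (`source ${SCRIPTPATH}/util.sh` and `--master_addr=$PRIMARY --master_port=$PRIMARY_PORT`) suggest how this helper is meant to be used. A minimal sketch of the rank-0 launch under a SLURM allocation; the training script and config names are placeholders, WORLD_SIZE is assumed to hold the per-node GPU count, and note that init_node_info derives PRIMARY from whichever node it runs on:

```sh
#!/usr/bin/bash
# run from the first node of the allocation
source "$(dirname "$(readlink -f "$0")")/util.sh"  # exports PRIMARY and PRIMARY_PORT

torchrun --nproc_per_node=$WORLD_SIZE \
    --nnodes=$SLURM_JOB_NUM_NODES --node_rank=0 \
    --master_addr=$PRIMARY --master_port=$PRIMARY_PORT \
    your_training_script.py your_config.yaml
```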
14 changes: 10 additions & 4 deletions recipes/accelerate_configs/deepspeed_zs2.json
@@ -9,9 +9,11 @@
"consecutive_hysteresis": false,
"min_loss_scale": 1
},

"bf16": {
"enabled": true
},

"optimizer": {
"type": "AdamW",
"params": {
@@ -23,6 +25,7 @@
"adam_w_mode": true
}
},

"scheduler": {
"type": "WarmupDecayLR",
"params": {
@@ -32,19 +35,22 @@
"total_num_steps": "auto"
}
},

"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": "auto",
"contiguous_gradients": true
"contiguous_gradients": true,
"round_robin_gradients": true
},

"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"steps_per_print": 20000000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
95 changes: 94 additions & 1 deletion recipes/accelerate_configs/readme.md
@@ -1,10 +1,26 @@
## Accelerate launch only supports a subset of DeepSpeed parameters
- to avoid this limitation, launch with `deepspeed` (or `torchrun`) directly instead of `accelerate`; see the launch examples at the end of this file

## More info on the HF-DeepSpeed integration
- https://huggingface.co/docs/transformers/deepspeed?zero-config=ZeRO-2

## DeepSpeed optimizers
- DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, OneBitLamb, FusedLamb, FusedAdam
- see https://deepspeed.readthedocs.io/en/latest/optimizers.html for details on how to configure them
```json
// You can set the parameters to "auto" or manually input your own desired values.
{
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
}
}

{
"optimizer": {
"type": "OneBitLamb",
@@ -56,4 +72,81 @@
"enabled": true
}
}
```

- offload
```json
{
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"round_robin_gradients": true
}
}

{
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "nvme",
"nvme_path": "/local_nvme",
"pin_memory": true,
"buffer_count": 4,
"fast_init": false
},
"offload_param": {
"device": "nvme",
"nvme_path": "/local_nvme",
"pin_memory": true,
"buffer_count": 5,
"buffer_size": 1e8,
"max_in_cpu": 1e9
},
"aio": {
"block_size": 262144,
"queue_depth": 32,
"thread_count": 1,
"single_submit": false,
"overlap_events": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
```
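
NVMe offload depends on DeepSpeed's async I/O extension being available on the machine. A quick check, assuming DeepSpeed is already installed:

```sh
# prints DeepSpeed's op-compatibility report; look for the async_io row
ds_report
```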

- communication data type
> Choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it is downcasted to whichever half-precision dtype you’re training in.
> Default is fp16 if you use AMP.
```json
{ "communication_data_type": "fp32"}
```

- launch

```sh
deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \
--deepspeed tests/deepspeed/ds_config_zero3.json \
...


torchrun --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 --master_port=9901 \
your_program.py <normal cl args> \
--deepspeed ds_config.json
```
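
For multi-node jobs, the deepspeed launcher can also discover nodes from a hostfile instead of torchrun's explicit rank and address flags; a minimal sketch with placeholder hostnames and GPU counts:

```sh
# hostfile: one line per node, slots = number of GPUs on that node
cat > hostfile <<'EOF'
hostname1 slots=8
hostname2 slots=8
EOF

deepspeed --hostfile=hostfile your_program.py --deepspeed ds_config.json
```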
22 changes: 14 additions & 8 deletions recipes/llama3-8b/sft/config_qlora.yaml
@@ -39,26 +39,32 @@ gradient_checkpointing_kwargs:
use_reentrant: false
learning_rate: 1.0e-04
log_level: info
logging_steps: 5
logging_strategy: steps
-optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit adamw_apex_fused
# optim_target_modules: all-linear
weight_decay: 0.01
lr_scheduler_type: cosine
-max_seq_length: 2048
+max_seq_length: 4096
max_steps: -1
num_train_epochs: 1
-output_dir: /home/l069561/project/models/fine-tuned/demo-llama-3-8b-lora-ultrachat
+output_dir: /home/l069561/project/alignment_handbook/experiments/models/demo-llama-3-8b-lora-ultrachat
overwrite_output_dir: true
-per_device_eval_batch_size: 2
-gradient_accumulation_steps: 32
+per_device_train_batch_size: 4
+gradient_accumulation_steps: 4
+per_device_eval_batch_size: 4
push_to_hub: false
report_to:
- tensorboard
- wandb
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1

torch_compile: false
# https://pytorch.org/docs/stable/generated/torch.compile.html ('cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm'])
# https://huggingface.co/docs/transformers/perf_train_gpu_one#using-torchcompile
torch_compile_backend: "inductor"
torch_compile_mode: "default" # reduce-overhead max-autotune
1 change: 0 additions & 1 deletion scripts/run_cpt.py
@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
1 change: 0 additions & 1 deletion scripts/run_orpo.py
@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
