Commit

add torchrun launch; add simPO
xiyang-aads-lilly committed Jun 1, 2024
1 parent 05a064a commit 3528a23
Showing 12 changed files with 721 additions and 25 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -169,4 +169,5 @@ wandb/
experiments/*
!experiments/.gitkeep
!experiments/demo*
!experiments/README.md
!experiments/util.sh
17 changes: 16 additions & 1 deletion .pre-commit-config.yaml
@@ -9,10 +9,25 @@ repos:
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
# we do not set python version so it will use default

- id: black-jupyter
# # It is recommended to specify the latest version of Python
# # supported by your project here, or alternatively use
# # pre-commit's default_language_version, see
# # https://pre-commit.com/#top_level-default_language_version
# language_version: python3.11

# - repo: https://github.com/gitleaks/gitleaks
# rev: v8.18.2 # Specify the desired version of Gitleaks
# hooks:
# - id: gitleaks

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: check-added-large-files
- id: check-merge-conflict
- id: detect-private-key # if this works well we can avoid using gitleaks
- id: end-of-file-fixer
- id: requirements-txt-fixer
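
With these hooks configured, they can be exercised locally before committing; a quick sketch using the standard pre-commit CLI:

```sh
# one-time: install the hook into this clone's .git/hooks
pre-commit install

# run every configured hook against the entire tree
pre-commit run --all-files
```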
36 changes: 28 additions & 8 deletions experiments/demo_dgx2.sh
@@ -1,4 +1,4 @@
#!/usr/bin/bash

ROOT=$(realpath ~)

@@ -7,6 +7,7 @@ echo activate virtual ENV
PYTHON_ENV=${ROOT}/project/scripts/v2306.sh
source $PYTHON_ENV

# pip freeze

# CUDA
export CUDA_VISIBLE_DEVICES=0,1
@@ -21,10 +22,12 @@ export HF_DATASETS_CACHE="${ROOT}/project/.cache/dataset"
export HF_HOME="${ROOT}/project/.cache/"

# Wandb
export WANDB_API_KEY=""
# export WANDB_API_KEY="<key>"
export WANDB_API_KEY="05411100e08ac02e3fcbdc821b4116cf1c066e99"
export WANDB_USERNAME="xi-yang5"
export WANDB_PROJECT="demo_dgx2"
# export WANDB_API_KEY=""
# export WANDB_USERNAME=""
# export WANDB_PROJECT=""
export WANDB_LOG_MODEL="false"
export WANDB_WATCH="false"

@@ -33,21 +36,38 @@ export TORCH_DISTRIBUTED_DEBUG=INFO
export NCCL_DEBUG=INFO
# export NCCL_SOCKET_NTHREADS=16

export ACCELERATE_LOG_LEVEL=debug
export ACCELERATE_DEBUG_MODE="1"
export DEEPSPEED_TIMEOUT=120

# get this script location
SCRIPT=$(readlink -f "$0")
SCRIPTPATH=$(dirname "$SCRIPT")

# accelerate launch
# accelerate launch \
# --config_file ${ROOT}/project/alignment_handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
# --num_processes $WORLD_SIZE \
# --tee 3 \
# ${ROOT}/project/alignment_handbook/scripts/run_sft.py \
# ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml
# ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_full.yaml
# ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml


# deepspeed launch
# torch launch
# source ${SCRIPTPATH}/util.sh
# --master_addr=$PRIMARY --master_port=$PRIMARY_PORT
# python -m torch.distributed.run

# need to add the virtual env package path to PYTHONPATH
export PYTHONPATH=${ROOT}/project/pyenv/2306/lib/python3.10/site-packages
torchrun --nproc_per_node=$WORLD_SIZE --nnode=1 --node_rank=0 \
${ROOT}/project/alignment_handbook/scripts/run_sft.py \
${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml \
--deepspeed=${ROOT}/project/alignment_handbook/recipes/accelerate_configs/deepspeed_zs2.json \
--tee=2 >> ${SCRIPTPATH}/log.txt

# torch launch
# python -m torch.distributed.run --nproc_per_node=$WORLD_SIZE --nnode=1 --node_rank=0 \
# ${ROOT}/project/alignment_handbook/scripts/run_sft.py \
# ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml \
# --deepspeed=${ROOT}/project/alignment_handbook/recipes/accelerate_configs/deepspeed_zs2.json \
# --tee=2
34 changes: 34 additions & 0 deletions experiments/util.sh
@@ -0,0 +1,34 @@
# for SLURM use
get_unused_port() {
# Well-known ports end at 1023. On Linux, dynamic ports start at 32768
# (see /proc/sys/net/ipv4/ip_local_port_range).
local MIN_PORT=10001
local MAX_PORT=32767

local USED_PORTS=$(netstat -a -n -t | tail -n +3 | tr -s ' ' | \
cut -d ' ' -f 4 | sed 's/.*:\([0-9]\+\)$/\1/' | sort -n | uniq)

# Generate random port numbers within the search range (inclusive) until we
# find one that isn't in use.
local RAN_PORT
while
RAN_PORT=$(shuf -i ${MIN_PORT}-${MAX_PORT} -n 1)
# note: unanchored regex does a substring match against USED_PORTS; in the
# 10001-32767 range every candidate is 5 digits, so a false hit just redraws
[[ "$USED_PORTS" =~ $RAN_PORT ]]
do
continue
done

echo $RAN_PORT
}

init_node_info() {
export PRIMARY=$(hostname -s)
SECONDARIES=$(scontrol show hostnames $SLURM_JOB_NODELIST | \
grep -v $PRIMARY)

ALL_NODES="$PRIMARY $SECONDARIES"
export PRIMARY_PORT=$(get_unused_port)
echo $PRIMARY $SECONDARIES $PRIMARY_PORT
}

init_node_info
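
The commented hints in demo_dgx2.sh (`source ${SCRIPTPATH}/util.sh` and `--master_addr=$PRIMARY --master_port=$PRIMARY_PORT`) suggest how this helper is meant to be used. A minimal sketch of the rank-0 launch under a SLURM allocation; the training script and config names are placeholders, WORLD_SIZE is assumed to hold the per-node GPU count, and note that init_node_info derives PRIMARY from whichever node it runs on:

```sh
#!/usr/bin/bash
# run from the first node of the allocation
source "$(dirname "$(readlink -f "$0")")/util.sh"  # exports PRIMARY and PRIMARY_PORT

torchrun --nproc_per_node=$WORLD_SIZE \
    --nnodes=$SLURM_JOB_NUM_NODES --node_rank=0 \
    --master_addr=$PRIMARY --master_port=$PRIMARY_PORT \
    your_training_script.py your_config.yaml
```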
14 changes: 10 additions & 4 deletions recipes/accelerate_configs/deepspeed_zs2.json
@@ -9,9 +9,11 @@
"consecutive_hysteresis": false,
"min_loss_scale": 1
},

"bf16": {
"enabled": true
},

"optimizer": {
"type": "AdamW",
"params": {
@@ -23,6 +25,7 @@
"adam_w_mode": true
}
},

"scheduler": {
"type": "WarmupDecayLR",
"params": {
@@ -32,19 +35,22 @@
"total_num_steps": "auto"
}
},

"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": "auto",
"contiguous_gradients": true
"contiguous_gradients": true,
"round_robin_gradients": true
},

"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"steps_per_print": 20000000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
95 changes: 94 additions & 1 deletion recipes/accelerate_configs/readme.md
@@ -1,10 +1,26 @@
## Accelerate launch only supports a subset of DeepSpeed parameters
- to avoid this limitation, launch with `deepspeed` (or `torchrun`) directly instead of `accelerate`; see the launch examples at the end of this file

## More info on the HF-DeepSpeed integration
- https://huggingface.co/docs/transformers/deepspeed?zero-config=ZeRO-2

## DeepSpeed optimizers
- DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, OneBitLamb, FusedLamb, FusedAdam
- see https://deepspeed.readthedocs.io/en/latest/optimizers.html for details on how to configure them
```json
// You can set the parameters to "auto" or manually input your own desired values.
{
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
}
}

{
"optimizer": {
"type": "OneBitLamb",
@@ -56,4 +72,81 @@
"enabled": true
}
}
```

- offload
```json
{
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"round_robin_gradients": true
}
}

{
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "nvme",
"nvme_path": "/local_nvme",
"pin_memory": true,
"buffer_count": 4,
"fast_init": false
},
"offload_param": {
"device": "nvme",
"nvme_path": "/local_nvme",
"pin_memory": true,
"buffer_count": 5,
"buffer_size": 1e8,
"max_in_cpu": 1e9
},
"aio": {
"block_size": 262144,
"queue_depth": 32,
"thread_count": 1,
"single_submit": false,
"overlap_events": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
```
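
NVMe offload depends on DeepSpeed's async I/O extension being available on the machine. A quick check, assuming DeepSpeed is already installed:

```sh
# prints DeepSpeed's op-compatibility report; look for the async_io row
ds_report
```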

- communication data type
> Choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it is downcasted to whichever half-precision dtype you’re training in.
> Default is fp16 if you use AMP.
```json
{ "communication_data_type": "fp32"}
```

- launch

```sh
deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \
--deepspeed tests/deepspeed/ds_config_zero3.json \
...


torchrun --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 --master_port=9901 \
your_program.py <normal cl args> \
--deepspeed ds_config.json
```
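
For multi-node jobs, the deepspeed launcher can also discover nodes from a hostfile instead of torchrun's explicit rank and address flags; a minimal sketch with placeholder hostnames and GPU counts:

```sh
# hostfile: one line per node, slots = number of GPUs on that node
cat > hostfile <<'EOF'
hostname1 slots=8
hostname2 slots=8
EOF

deepspeed --hostfile=hostfile your_program.py --deepspeed ds_config.json
```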
22 changes: 14 additions & 8 deletions recipes/llama3-8b/sft/config_qlora.yaml
@@ -39,26 +39,32 @@ gradient_checkpointing_kwargs:
use_reentrant: false
learning_rate: 1.0e-04
log_level: info
logging_steps: 5
logging_strategy: steps
-optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit adamw_apex_fused
# optim_target_modules: all-linear
weight_decay: 0.01
lr_scheduler_type: cosine
-max_seq_length: 2048
+max_seq_length: 4096
max_steps: -1
num_train_epochs: 1
-output_dir: /home/l069561/project/models/fine-tuned/demo-llama-3-8b-lora-ultrachat
+output_dir: /home/l069561/project/alignment_handbook/experiments/models/demo-llama-3-8b-lora-ultrachat
overwrite_output_dir: true
-per_device_eval_batch_size: 2
-gradient_accumulation_steps: 32
+per_device_train_batch_size: 4
+gradient_accumulation_steps: 4
+per_device_eval_batch_size: 4
push_to_hub: false
report_to:
- tensorboard
- wandb
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1

torch_compile: false
# https://pytorch.org/docs/stable/generated/torch.compile.html ('cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm'])
# https://huggingface.co/docs/transformers/perf_train_gpu_one#using-torchcompile
torch_compile_backend: "inductor"
torch_compile_mode: "default" # reduce-overhead max-autotune
1 change: 0 additions & 1 deletion scripts/run_cpt.py
@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
1 change: 0 additions & 1 deletion scripts/run_orpo.py
@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
