Skip to content

Commit

Permalink
remote path
Browse files Browse the repository at this point in the history
  • Loading branch information
xiyang-aads-lilly committed May 13, 2024
1 parent c5141a2 commit 190935b
Show file tree
Hide file tree
Showing 8 changed files with 44 additions and 44 deletions.
14 changes: 7 additions & 7 deletions job_scripts/launch_dpo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ init_node_info() {
init_node_info

# HF cache
export TMPDIR="/blue/yonghui.wu/alexgre/ctmp"
export HF_DATASETS_CACHE="/blue/yonghui.wu/alexgre/ctmp"
export HF_HOME="/blue/yonghui.wu/alexgre/ctmp"
# Point temp files and the Hugging Face caches at a per-user scratch directory.
# "$HOME" is used instead of "~" because the shell does not perform tilde
# expansion inside double quotes; "~/ctmp" would be exported as a literal
# string starting with the character "~".
export TMPDIR="${HOME}/ctmp"
export HF_DATASETS_CACHE="${HOME}/ctmp"
export HF_HOME="${HOME}/ctmp"

# Global environment constants recommended by Nvidia.
EXCLUDE_IB_LIST=mlx5_4,mlx5_5,mlx5_10,mlx5_11
Expand All @@ -65,9 +65,9 @@ export NCCL_ASYNC_ERROR_HANDLING=1
WORLD_SIZE=$(($SLURM_JOB_NUM_NODES*$SLURM_GPUS_PER_TASK))

# HF cache
export TMPDIR="/blue/yonghui.wu/alexgre/ctmp"
export HF_DATASETS_CACHE="/blue/yonghui.wu/alexgre/ctmp"
export HF_HOME="/blue/yonghui.wu/alexgre/ctmp"
# Per-user scratch directory for temp files and the Hugging Face caches.
# A quoted "~" is not tilde-expanded by the shell, so the home directory is
# spelled out through "$HOME" to get a real absolute path.
export TMPDIR="${HOME}/ctmp"
export HF_DATASETS_CACHE="${HOME}/ctmp"
export HF_HOME="${HOME}/ctmp"
export TORCH_DISTRIBUTED_DEBUG=DETAIL
export NCCL_DEBUG=DEBUG
export ACCELERATE_LOG_LEVEL=DEBUG
Expand All @@ -79,4 +79,4 @@ echo $SLURM_JOB_NUM_NODES:$SLURM_GPUS_PER_TASK:$WORLD_SIZE
echo $SLURM_NODEID

# sft replicate HF model
srun --jobid $SLURM_JOB_ID singularity exec --nv $CONTAINER bash /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/run_dpo.sh 2>&1
srun --jobid $SLURM_JOB_ID singularity exec --nv $CONTAINER bash ~/alignment-handbook/recipes/run_dpo.sh 2>&1
6 changes: 3 additions & 3 deletions job_scripts/launch_sft.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

# default setup
module load apptainer
CONTAINER=/red/gatortron-phi/workspace/containers/alignment.sif
CONTAINER=~/containers/alignment.sif

source /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/util.sh
source ~/alignment-handbook/recipes/util.sh

# sft replicate HF model
srun --jobid $SLURM_JOB_ID singularity exec --nv $CONTAINER bash /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/run_sft.sh
srun --jobid $SLURM_JOB_ID singularity exec --nv $CONTAINER bash ~/alignment-handbook/recipes/run_sft.sh
6 changes: 3 additions & 3 deletions job_scripts/run_dpo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ WORLD_SIZE=$(($SLURM_JOB_NUM_NODES*$SLURM_GPUS_PER_TASK))
echo $SLURM_JOB_NUM_NODES $WORLD_SIZE

ACCELERATE_LOG_LEVEL=info accelerate launch \
--config_file /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
--config_file ~/alignment-handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
--main_process_ip $PRIMARY \
--main_process_port $PRIMARY_PORT \
--machine_rank $SLURM_PROCID \
--num_machines $SLURM_JOB_NUM_NODES \
--num_processes $WORLD_SIZE \
--rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$PRIMARY:$PRIMARY_PORT" \
--tee 3 \
/red/gatortron-phi/workspace/zzz/alignment-handbook/scripts/run_dpo.py \
/red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/zephyr-7b-beta/dpo/config_full.yaml
~/alignment-handbook/scripts/run_dpo.py \
~/alignment-handbook/recipes/zephyr-7b-beta/dpo/config_full.yaml
26 changes: 13 additions & 13 deletions job_scripts/run_sft.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
WORLD_SIZE=$(($SLURM_JOB_NUM_NODES*$SLURM_GPUS_PER_TASK))

# HF cache
export TMPDIR="/blue/yonghui.wu/alexgre/ctmp"
export HF_DATASETS_CACHE="/blue/yonghui.wu/alexgre/ctmp"
export HF_HOME="/blue/yonghui.wu/alexgre/ctmp"
# Temp files and Hugging Face caches live under the user's home directory.
# "$HOME" replaces "~" because tilde expansion is suppressed inside double
# quotes, which would leave a literal "~/ctmp" in the environment.
export TMPDIR="${HOME}/ctmp"
export HF_DATASETS_CACHE="${HOME}/ctmp"
export HF_HOME="${HOME}/ctmp"
export TORCH_DISTRIBUTED_DEBUG=INFO
export NCCL_DEBUG=INFO
export NCCL_SOCKET_NTHREADS=16
Expand Down Expand Up @@ -36,32 +36,32 @@ export DEEPSPEED_TIMEOUT=120

pip install -U git+https://github.com/huggingface/trl

# Weights & Biases run settings.
export WANDB_PROJECT="alignment"
export WANDB_WATCH="parameters"
# Credentials must never be committed: the API key is taken from the
# environment this script is started with instead of being hardcoded here.
# Any key that was previously stored in this file should be treated as
# compromised and revoked.
if [ -z "${WANDB_API_KEY:-}" ]; then
  echo "warning: WANDB_API_KEY is not set; export it before launching this script" >&2
fi
# Re-export so a key supplied as a plain shell variable reaches child processes.
export WANDB_API_KEY
export WANDB_USERNAME="copy-o0o-paste"
# export WANDB_PROJECT="alignment"
# export WANDB_WATCH="parameters"
# export WANDB_API_KEY="KEY"
# export WANDB_USERNAME="copy-o0o-paste"

accelerate launch \
--config_file /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
--config_file ~/alignment-handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
--main_process_ip $PRIMARY \
--main_process_port $PRIMARY_PORT \
--machine_rank $SLURM_PROCID \
--num_machines $SLURM_JOB_NUM_NODES \
--num_processes $WORLD_SIZE \
--tee 3 \
/red/gatortron-phi/workspace/zzz/alignment-handbook/scripts/run_sft.py \
/red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/zephyr-7b-beta/sft/config_full.yaml
~/alignment-handbook/scripts/run_sft.py \
~/alignment-handbook/recipes/zephyr-7b-beta/sft/config_full.yaml

# ACCELERATE_LOG_LEVEL=info accelerate launch \
# --config_file /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/accelerate_configs/multi_gpu.yaml \
# --config_file ~/alignment-handbook/recipes/accelerate_configs/multi_gpu.yaml \
# --main_process_ip $PRIMARY \
# --main_process_port $PRIMARY_PORT \
# --machine_rank $SLURM_PROCID \
# --num_machines $SLURM_JOB_NUM_NODES \
# --num_processes $WORLD_SIZE \
# --tee 3 \
# /red/gatortron-phi/workspace/zzz/alignment-handbook/scripts/run_sft.py \
# /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/zephyr-7b-beta/sft/config_lora.yaml
# ~/alignment-handbook/scripts/run_sft.py \
# ~/alignment-handbook/recipes/zephyr-7b-beta/sft/config_lora.yaml



Expand Down
2 changes: 1 addition & 1 deletion job_scripts/singularity_container.def
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ From: nvcr.io/nvidia/pytorch:{{ VERSION }}
VERSION=23.12-py3

%files
/red/gatortron-phi/workspace/zzz/alignment-handbook/requirements.txt requirements.txt
requirements.txt requirements.txt

%post
python -m pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion recipes/gatortrongpt_5b/config_gatortrongpt_full_sft.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Model arguments
model_name_or_path: /red/gatortron-phi/gpt/models/gpt_uf_deid_thepile_mimic_5b_bs4_ep1_release_HF
# NOTE(review): YAML does not expand "~"; this value is the literal string
# "~/models/...". Presumably the consumer must expand it to the home
# directory itself - TODO confirm, or use an absolute path.
model_name_or_path: ~/models/gpt_uf_deid_thepile_mimic_5b_bs4_ep1_release_HF
model_revision: main
torch_dtype: bfloat16
use_flash_attention_2: false
Expand Down
16 changes: 8 additions & 8 deletions recipes/zephyr-7b-beta/dpo/config_qlora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ load_in_4bit: true
lora_r: 128
lora_alpha: 128
lora_dropout: 0.05
lora_target_modules:
- q_proj
- k_proj
- v_proj
- o_proj
- gate_proj
- up_proj
- down_proj
lora_target_modules: all
# - q_proj
# - k_proj
# - v_proj
# - o_proj
# - gate_proj
# - up_proj
# - down_proj

# Data training arguments

Expand Down
16 changes: 8 additions & 8 deletions recipes/zephyr-7b-beta/sft/config_qlora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ use_peft: true
lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
- q_proj
- k_proj
- v_proj
- o_proj
- gate_proj
- up_proj
- down_proj
lora_target_modules: all
# - q_proj
# - k_proj
# - v_proj
# - o_proj
# - gate_proj
# - up_proj
# - down_proj

# Data training arguments
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
Expand Down

0 comments on commit 190935b

Please sign in to comment.