remote path

xiyang-aads-lilly · May 13, 2024 · 190935b · 190935b
1 parent c5141a2
commit 190935b
Show file tree

Hide file tree

Showing 8 changed files with 44 additions and 44 deletions.
diff --git a/job_scripts/launch_dpo.sh b/job_scripts/launch_dpo.sh
@@ -50,9 +50,9 @@ init_node_info() {
 init_node_info
 
 # HF cache
-export TMPDIR="/blue/yonghui.wu/alexgre/ctmp"
-export HF_DATASETS_CACHE="/blue/yonghui.wu/alexgre/ctmp"
-export HF_HOME="/blue/yonghui.wu/alexgre/ctmp"
+export TMPDIR="$HOME/ctmp"             # $HOME, not "~": a tilde inside double quotes is never expanded by the shell
+export HF_DATASETS_CACHE="$HOME/ctmp"  # same directory doubles as the HF datasets cache
+export HF_HOME="$HOME/ctmp"            # ...and as the Hugging Face home directory
 
 # Global environment constants recommended by Nvidia.
 EXCLUDE_IB_LIST=mlx5_4,mlx5_5,mlx5_10,mlx5_11
@@ -65,9 +65,9 @@ export NCCL_ASYNC_ERROR_HANDLING=1
 WORLD_SIZE=$(($SLURM_JOB_NUM_NODES*$SLURM_GPUS_PER_TASK))
 
 # HF cache
-export TMPDIR="/blue/yonghui.wu/alexgre/ctmp"
-export HF_DATASETS_CACHE="/blue/yonghui.wu/alexgre/ctmp"
-export HF_HOME="/blue/yonghui.wu/alexgre/ctmp"
+export TMPDIR="$HOME/ctmp"             # $HOME, not "~": a tilde inside double quotes is never expanded by the shell
+export HF_DATASETS_CACHE="$HOME/ctmp"  # same directory doubles as the HF datasets cache
+export HF_HOME="$HOME/ctmp"            # ...and as the Hugging Face home directory
 export TORCH_DISTRIBUTED_DEBUG=DETAIL
 export NCCL_DEBUG=DEBUG
 export ACCELERATE_LOG_LEVEL=DEBUG 
@@ -79,4 +79,4 @@ echo $SLURM_JOB_NUM_NODES:$SLURM_GPUS_PER_TASK:$WORLD_SIZE
 echo $SLURM_NODEID 
 
 # dpo replicate HF model 
-srun --jobid $SLURM_JOB_ID singularity exec --nv $CONTAINER bash /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/run_dpo.sh 2>&1
+srun --jobid $SLURM_JOB_ID singularity exec --nv $CONTAINER bash ~/alignment-handbook/recipes/run_dpo.sh 2>&1
diff --git a/job_scripts/launch_sft.sh b/job_scripts/launch_sft.sh
@@ -16,9 +16,9 @@
 
 # default setup
 module load apptainer
-CONTAINER=/red/gatortron-phi/workspace/containers/alignment.sif
+CONTAINER=~/containers/alignment.sif
 
-source /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/util.sh
+source ~/alignment-handbook/recipes/util.sh
 
 # sft replicate HF model 
-srun --jobid $SLURM_JOB_ID singularity exec --nv $CONTAINER bash /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/run_sft.sh
+srun --jobid $SLURM_JOB_ID singularity exec --nv $CONTAINER bash ~/alignment-handbook/recipes/run_sft.sh
diff --git a/job_scripts/run_dpo.sh b/job_scripts/run_dpo.sh
@@ -10,13 +10,13 @@ WORLD_SIZE=$(($SLURM_JOB_NUM_NODES*$SLURM_GPUS_PER_TASK))
 echo $SLURM_JOB_NUM_NODES $WORLD_SIZE
 
 ACCELERATE_LOG_LEVEL=info accelerate launch \
-    --config_file /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
+    --config_file ~/alignment-handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
     --main_process_ip $PRIMARY \
     --main_process_port $PRIMARY_PORT \
     --machine_rank $SLURM_PROCID \
     --num_machines $SLURM_JOB_NUM_NODES \
     --num_processes $WORLD_SIZE \
     --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$PRIMARY:$PRIMARY_PORT" \
     --tee 3 \
-    /red/gatortron-phi/workspace/zzz/alignment-handbook/scripts/run_dpo.py \
-    /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/zephyr-7b-beta/dpo/config_full.yaml
+    ~/alignment-handbook/scripts/run_dpo.py \
+    ~/alignment-handbook/recipes/zephyr-7b-beta/dpo/config_full.yaml
diff --git a/job_scripts/run_sft.sh b/job_scripts/run_sft.sh
@@ -1,9 +1,9 @@
 WORLD_SIZE=$(($SLURM_JOB_NUM_NODES*$SLURM_GPUS_PER_TASK))
 
 # HF cache
-export TMPDIR="/blue/yonghui.wu/alexgre/ctmp"
-export HF_DATASETS_CACHE="/blue/yonghui.wu/alexgre/ctmp"
-export HF_HOME="/blue/yonghui.wu/alexgre/ctmp"
+export TMPDIR="$HOME/ctmp"             # $HOME, not "~": a tilde inside double quotes is never expanded by the shell
+export HF_DATASETS_CACHE="$HOME/ctmp"  # same directory doubles as the HF datasets cache
+export HF_HOME="$HOME/ctmp"            # ...and as the Hugging Face home directory
 export TORCH_DISTRIBUTED_DEBUG=INFO
 export NCCL_DEBUG=INFO
 export NCCL_SOCKET_NTHREADS=16
@@ -36,32 +36,32 @@ export DEEPSPEED_TIMEOUT=120
 
 pip install -U git+https://github.com/huggingface/trl
 
-export WANDB_PROJECT="alignment"
-export WANDB_WATCH="parameters"
-export WANDB_API_KEY="19ad7c08b73ec0d81580d46372bcd14d2f207232"
-export WANDB_USERNAME="copy-o0o-paste"
+# export WANDB_PROJECT="alignment"
+# export WANDB_WATCH="parameters"
+# export WANDB_API_KEY="KEY"
+# export WANDB_USERNAME="copy-o0o-paste"
 
 accelerate launch \
-    --config_file /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
+    --config_file ~/alignment-handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
     --main_process_ip $PRIMARY \
     --main_process_port $PRIMARY_PORT \
     --machine_rank $SLURM_PROCID \
     --num_machines $SLURM_JOB_NUM_NODES \
     --num_processes $WORLD_SIZE \
     --tee 3 \
-    /red/gatortron-phi/workspace/zzz/alignment-handbook/scripts/run_sft.py \
-    /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/zephyr-7b-beta/sft/config_full.yaml
+   ~/alignment-handbook/scripts/run_sft.py \
+   ~/alignment-handbook/recipes/zephyr-7b-beta/sft/config_full.yaml
 
 # ACCELERATE_LOG_LEVEL=info accelerate launch \
-#     --config_file /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/accelerate_configs/multi_gpu.yaml \
+#     --config_file ~/alignment-handbook/recipes/accelerate_configs/multi_gpu.yaml \
 #     --main_process_ip $PRIMARY \
 #     --main_process_port $PRIMARY_PORT \
 #     --machine_rank $SLURM_PROCID \
 #     --num_machines $SLURM_JOB_NUM_NODES \
 #     --num_processes $WORLD_SIZE \
 #     --tee 3 \
-#     /red/gatortron-phi/workspace/zzz/alignment-handbook/scripts/run_sft.py \
-#     /red/gatortron-phi/workspace/zzz/alignment-handbook/recipes/zephyr-7b-beta/sft/config_lora.yaml
+#     ~/alignment-handbook/scripts/run_sft.py \
+#     ~/alignment-handbook/recipes/zephyr-7b-beta/sft/config_lora.yaml
 
 
 

diff --git a/job_scripts/singularity_container.def b/job_scripts/singularity_container.def
@@ -5,7 +5,7 @@ From: nvcr.io/nvidia/pytorch:{{ VERSION }}
     VERSION=23.12-py3
 
 %files
-/red/gatortron-phi/workspace/zzz/alignment-handbook/requirements.txt requirements.txt
+requirements.txt requirements.txt
 
 %post
     python -m pip install --upgrade pip

diff --git a/recipes/gatortrongpt_5b/config_gatortrongpt_full_sft.yaml b/recipes/gatortrongpt_5b/config_gatortrongpt_full_sft.yaml
@@ -1,5 +1,5 @@
 # Model arguments
-model_name_or_path: /red/gatortron-phi/gpt/models/gpt_uf_deid_thepile_mimic_5b_bs4_ep1_release_HF
+model_name_or_path: ~/models/gpt_uf_deid_thepile_mimic_5b_bs4_ep1_release_HF  # NOTE(review): YAML keeps "~" as a literal character; confirm the loader applies expanduser, otherwise use an absolute path
 model_revision: main
 torch_dtype: bfloat16
 use_flash_attention_2: false

diff --git a/recipes/zephyr-7b-beta/dpo/config_qlora.yaml b/recipes/zephyr-7b-beta/dpo/config_qlora.yaml
@@ -9,14 +9,14 @@ load_in_4bit: true
 lora_r: 128
 lora_alpha: 128
 lora_dropout: 0.05
-lora_target_modules:
-- q_proj
-- k_proj
-- v_proj
-- o_proj
-- gate_proj
-- up_proj
-- down_proj
+lora_target_modules: all
+# - q_proj
+# - k_proj
+# - v_proj
+# - o_proj
+# - gate_proj
+# - up_proj
+# - down_proj
 
 # Data training arguments
 

diff --git a/recipes/zephyr-7b-beta/sft/config_qlora.yaml b/recipes/zephyr-7b-beta/sft/config_qlora.yaml
@@ -10,14 +10,14 @@ use_peft: true
 lora_r: 16
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
-- q_proj
-- k_proj
-- v_proj
-- o_proj
-- gate_proj
-- up_proj
-- down_proj
+lora_target_modules: all
+# - q_proj
+# - k_proj
+# - v_proj
+# - o_proj
+# - gate_proj
+# - up_proj
+# - down_proj
 
 # Data training arguments
 chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"