diff --git a/experiments/demo_magtrain_llm_sft.sh b/experiments/demo_magtrain_llm_sft.sh
index 191daa47..85208ab6 100644
--- a/experiments/demo_magtrain_llm_sft.sh
+++ b/experiments/demo_magtrain_llm_sft.sh
@@ -1,5 +1,4 @@
 #!/usr/bin/bash
-
 whoami
 pwd
 ds_report
@@ -33,8 +32,14 @@ echo $PRIMARY
 echo $PRIMARY_PORT
 
 # TRAIN_CONF=${ROOT}/recipes/llama3-8b/sft/config_full.yaml
-TRAIN_CONF=${ROOT}/recipes/phi3/sft/config_full.yaml
+# TRAIN_CONF=${ROOT}/recipes/phi3/sft/config_full.yaml
 # TRAIN_CONF=${ROOT}/recipes/qwen/sft/config_full.yaml
+# TRAIN_CONF=${ROOT}/recipes/falcon_mamba/sft/config_full.yaml        # needs further debugging: training gets stuck
+
+# manually set
+export WANDB_PROJECT="sang"
+# TRAIN_CONF=${ROOT}/recipes/sang_project/config_full_1.yaml
+TRAIN_CONF=${ROOT}/recipes/sang_project/config_full_2.yaml
 
 DEEPSPEED_CONF=${ROOT}/recipes/accelerate_configs/deepspeed_zs2.json
 
diff --git a/experiments/demo_magtrain_slurm.sh b/experiments/demo_magtrain_slurm.sh
index 6185e8e8..3fa035f8 100644
--- a/experiments/demo_magtrain_slurm.sh
+++ b/experiments/demo_magtrain_slurm.sh
@@ -3,11 +3,11 @@
 #SBATCH --job-name=llm_sft
 #SBATCH --mail-type=ALL
 #SBATCH --mail-user=xi.yang5@lilly.com
-#SBATCH --nodes=1
+#SBATCH --nodes=4
 #SBATCH --ntasks-per-node=1
 #SBATCH --gpus-per-node=4
 #SBATCH --gpus-per-task=4
-#SBATCH --cpus-per-task=64
+#SBATCH --cpus-per-task=32
 #SBATCH --mem=512gb
 #SBATCH --time=48:00:00
 #SBATCH --output=/home/l069561/project/log/alignment/sft_%j.out
@@ -17,13 +17,13 @@ HOME=/home/l069561
 SCRIPTPATH=${HOME}/project/alignment-handbook/experiments
 
 echo $SCRIPTPATH
-echo $SLURM_NTASKS_PER_NODE
 echo $SLURM_JOB_NUM_NODES
+echo $SLURM_NTASKS_PER_NODE
 echo $SLURM_GPUS_ON_NODE
 source ${SCRIPTPATH}/util.sh
 
-# CONTAINER=${HOME}/container/pt2402.sif
-CONTAINER=${HOME}/container/pt2402
+CONTAINER=${HOME}/container/pt2402.sif
+# CONTAINER=${HOME}/container/pt2402
 
 export TRITON_HOME=${HOME}/project/cache/triton
 export TRITON_CACHE_DIR=${HOME}/project/cache/triton/cache
@@ -31,18 +31,18 @@ export TRITON_DUMP_DIR=${HOME}/project/cache/triton/dump
 export HF_DATASETS_CACHE=${HOME}/project/cache/dataset
 export HF_HOME=${HOME}/project/cache/huggingface
 
-# srun --jobid $SLURM_JOB_ID apptainer exec -B $SLURM_TMPDIR:/cache  --nv $CONTAINER bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh
+srun --jobid "$SLURM_JOB_ID" apptainer exec -B "$SLURM_TMPDIR:/cache" --nv "$CONTAINER" bash "${SCRIPTPATH}/demo_magtrain_llm_sft.sh"  # expansions quoted to prevent word splitting/globbing
 
 # use nsys to profile training process
-srun --jobid $SLURM_JOB_ID \
-    apptainer exec -B $SLURM_TMPDIR:/cache --nv --fakeroot $CONTAINER \
-    nsys profile  -s none -t cuda,nvtx \
-    --gpu-metrics-device=all \
-    --gpu-metrics-frequency=100 \
-    --nic-metrics=true \
-    --capture-range=cudaProfilerApi \
-    --capture-range-end=stop \
-    -o $SLURM_TMPDIR/nsys_${SLURM_JOB_ID} \
-    bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh
-
-cp $SLURM_TMPDIR/nsys_${SLURM_JOB_ID}.nsys-rep ${HOME}/project/log/nsys/
+# srun --jobid $SLURM_JOB_ID \
+#     apptainer exec -B $SLURM_TMPDIR:/cache --nv --fakeroot $CONTAINER \
+#     nsys profile  -s none -t cuda,nvtx \
+#     --gpu-metrics-device=all \
+#     --gpu-metrics-frequency=100 \
+#     --nic-metrics=true \
+#     --capture-range=cudaProfilerApi \
+#     --capture-range-end=stop \
+#     -o $SLURM_TMPDIR/nsys_${SLURM_JOB_ID} \
+#     bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh
+
+# cp $SLURM_TMPDIR/nsys_${SLURM_JOB_ID}.nsys-rep ${HOME}/project/log/nsys/
diff --git a/recipes/falcon_mamba/sft/config_full.yaml b/recipes/falcon_mamba/sft/config_full.yaml
new file mode 100644
index 00000000..77cf8ccc
--- /dev/null
+++ b/recipes/falcon_mamba/sft/config_full.yaml
@@ -0,0 +1,56 @@
+# Model arguments
+model_name_or_path: /home/l069561/project/models/falcon-mamba-7b-instruct
+model_revision: main
+torch_dtype: bfloat16
+# attn_implementation: flash_attention_2
+
+# Data training arguments
+# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+dataset_mixer:
+  HuggingFaceH4/ultrachat_200k: 1.0
+  /home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
+  /home/l069561/project/data/sang_data_formatted: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 16
+
+# SFT trainer config
+bf16: true
+do_eval: true
+# evaluation_strategy: epoch
+eval_strategy: steps
+eval_steps: 1000
+max_grad_norm: 1.0
+# gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+learning_rate: 1.0e-05
+optim: adamw_torch #galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+optim_target_modules: all-linear
+weight_decay: 0.01
+lr_scheduler_type: cosine
+max_seq_length: 8192
+packing: false
+dataset_num_proc: 16
+max_steps: -1
+num_train_epochs: 3
+output_dir: /home/l069561/project/alignment-handbook/experiments/models/models-falcon-mamba-7b-inst-full-ultrachat
+overwrite_output_dir: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1 # per-device value; compute the global batch size manually as per_device * gradient_accumulation_steps * GPUs per node * nodes
+gradient_accumulation_steps: 4
+push_to_hub: false
+remove_unused_columns: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "epoch"
+save_steps: 100
+save_total_limit: 3
+seed: 42
+warmup_ratio: 0.1
diff --git a/recipes/falcon_mamba/sft/config_qlora.yaml b/recipes/falcon_mamba/sft/config_qlora.yaml
new file mode 100644
index 00000000..5b9794aa
--- /dev/null
+++ b/recipes/falcon_mamba/sft/config_qlora.yaml
@@ -0,0 +1,70 @@
+# Model arguments
+model_name_or_path: /home/l069561/project/models/falcon-mamba-7b-instruct # no chat template
+model_revision: main
+torch_dtype: bfloat16
+use_flash_attention_2: true
+
+# LoRA arguments
+use_unsloth: false # unsloth does not support deepspeed yet
+use_peft: true
+load_in_4bit: true
+lora_r: 32
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules: all
+# - q_proj
+# - k_proj
+# - v_proj
+# - o_proj
+# - gate_proj
+# - up_proj
+# - down_proj
+
+# Data training arguments
+# chat_template: "{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'].strip() + '<|im_end|>\\n' }}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}{% endfor %}"
+dataset_mixer:
+  HuggingFaceH4/ultrachat_200k: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 16
+auto_insert_empty_system_msg: true
+
+# SFT trainer config
+bf16: true
+do_eval: true
+evaluation_strategy: epoch
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+learning_rate: 1.0e-04
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit adamw_apex_fused
+# optim_target_modules: all-linear
+weight_decay: 0.01
+lr_scheduler_type: cosine
+max_seq_length: 4096
+max_steps: -1
+num_train_epochs: 1
+output_dir: /home/l069561/project/alignment-handbook/experiments/models/demo-falcon-mamba-7b-inst-qlora-ultrachat
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+gradient_accumulation_steps: 4
+per_device_eval_batch_size: 4
+push_to_hub: false
+report_to:
+- tensorboard
+- wandb
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
+
+torch_compile: false
+# https://pytorch.org/docs/stable/generated/torch.compile.html ('cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm'])
+# https://huggingface.co/docs/transformers/perf_train_gpu_one#using-torchcompile
+torch_compile_backend: "inductor"
+torch_compile_mode: "default" # reduce-overhead max-autotune
diff --git a/recipes/gemma/sft/config_full.yaml b/recipes/gemma/sft/config_full.yaml
new file mode 100644
index 00000000..77cf8ccc
--- /dev/null
+++ b/recipes/gemma/sft/config_full.yaml
@@ -0,0 +1,56 @@
+# Model arguments
+model_name_or_path: /home/l069561/project/models/falcon-mamba-7b-instruct # NOTE(review): this gemma recipe is a verbatim copy of the falcon_mamba one and still loads the falcon-mamba checkpoint — confirm the intended Gemma model path
+model_revision: main
+torch_dtype: bfloat16
+# attn_implementation: flash_attention_2
+
+# Data training arguments
+# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+dataset_mixer:
+  HuggingFaceH4/ultrachat_200k: 1.0
+  /home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
+  /home/l069561/project/data/sang_data_formatted: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 16
+
+# SFT trainer config
+bf16: true
+do_eval: true
+# evaluation_strategy: epoch
+eval_strategy: steps
+eval_steps: 1000
+max_grad_norm: 1.0
+# gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+learning_rate: 1.0e-05
+optim: adamw_torch #galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+optim_target_modules: all-linear
+weight_decay: 0.01
+lr_scheduler_type: cosine
+max_seq_length: 8192
+packing: false
+dataset_num_proc: 16
+max_steps: -1
+num_train_epochs: 3
+output_dir: /home/l069561/project/alignment-handbook/experiments/models/models-falcon-mamba-7b-inst-full-ultrachat # NOTE(review): same output_dir as recipes/falcon_mamba/sft/config_full.yaml, so both recipes would write into one directory — confirm and rename for gemma
+overwrite_output_dir: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1 # per-device value; compute the global batch size manually as per_device * gradient_accumulation_steps * GPUs per node * nodes
+gradient_accumulation_steps: 4
+push_to_hub: false
+remove_unused_columns: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "epoch"
+save_steps: 100
+save_total_limit: 3
+seed: 42
+warmup_ratio: 0.1
diff --git a/recipes/gemma/sft/config_qlora.yaml b/recipes/gemma/sft/config_qlora.yaml
new file mode 100644
index 00000000..5b9794aa
--- /dev/null
+++ b/recipes/gemma/sft/config_qlora.yaml
@@ -0,0 +1,70 @@
+# Model arguments
+model_name_or_path: /home/l069561/project/models/falcon-mamba-7b-instruct # no chat template. NOTE(review): this gemma recipe is a verbatim copy of the falcon_mamba one and still loads the falcon-mamba checkpoint — confirm the intended Gemma model path
+model_revision: main
+torch_dtype: bfloat16
+use_flash_attention_2: true
+
+# LoRA arguments
+use_unsloth: false # unsloth does not support deepspeed yet
+use_peft: true
+load_in_4bit: true
+lora_r: 32
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules: all
+# - q_proj
+# - k_proj
+# - v_proj
+# - o_proj
+# - gate_proj
+# - up_proj
+# - down_proj
+
+# Data training arguments
+# chat_template: "{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'].strip() + '<|im_end|>\\n' }}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}{% endfor %}"
+dataset_mixer:
+  HuggingFaceH4/ultrachat_200k: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 16
+auto_insert_empty_system_msg: true
+
+# SFT trainer config
+bf16: true
+do_eval: true
+evaluation_strategy: epoch
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+learning_rate: 1.0e-04
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit adamw_apex_fused
+# optim_target_modules: all-linear
+weight_decay: 0.01
+lr_scheduler_type: cosine
+max_seq_length: 4096
+max_steps: -1
+num_train_epochs: 1
+output_dir: /home/l069561/project/alignment-handbook/experiments/models/demo-falcon-mamba-7b-inst-qlora-ultrachat # NOTE(review): same output_dir as recipes/falcon_mamba/sft/config_qlora.yaml, so both recipes would write into one directory — confirm and rename for gemma
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+gradient_accumulation_steps: 4
+per_device_eval_batch_size: 4
+push_to_hub: false
+report_to:
+- tensorboard
+- wandb
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
+
+torch_compile: false
+# https://pytorch.org/docs/stable/generated/torch.compile.html ('cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm'])
+# https://huggingface.co/docs/transformers/perf_train_gpu_one#using-torchcompile
+torch_compile_backend: "inductor"
+torch_compile_mode: "default" # reduce-overhead max-autotune
diff --git a/recipes/llama3-8b/sft/config_full.yaml b/recipes/llama3-8b/sft/config_full.yaml
index 69610497..47aea00a 100644
--- a/recipes/llama3-8b/sft/config_full.yaml
+++ b/recipes/llama3-8b/sft/config_full.yaml
@@ -9,6 +9,7 @@ attn_implementation: flash_attention_2
 dataset_mixer:
   HuggingFaceH4/ultrachat_200k: 1.0
   /home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
+  # /home/l069561/project/data/sang_data_formatted: 1.0
 dataset_splits:
 - train_sft
 - test_sft
@@ -27,7 +28,7 @@ gradient_checkpointing_kwargs:
 log_level: info
 logging_steps: 5
 logging_strategy: steps
-learning_rate: 2.0e-05
+learning_rate: 1.0e-05
 optim: galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
 optim_target_modules: all-linear
 weight_decay: 0.01
@@ -36,7 +37,7 @@ max_seq_length: 8192
 packing: false
 dataset_num_proc: 16
 max_steps: -1
-num_train_epochs: 3
+num_train_epochs: 1
 output_dir: /home/l069561/project/alignment-handbook/experiments/models/llama-3.1-inst-full-ultrachat
 overwrite_output_dir: true
 per_device_eval_batch_size: 1
diff --git a/recipes/sang_project/config_full_1.yaml b/recipes/sang_project/config_full_1.yaml
new file mode 100644
index 00000000..717b5e84
--- /dev/null
+++ b/recipes/sang_project/config_full_1.yaml
@@ -0,0 +1,54 @@
+# Model arguments
+model_name_or_path: /home/l069561/project/models/gemma-2-2b
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{% if messages[0]['role'] == 'system' %}{% set system_message = '### System Instruction: ' + messages[0]['content'] | trim + '' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{{ bos_token + system_message }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Context: ' + message['content'] | trim + '' }}{% elif message['role'] == 'assistant' %}{{ '### Result: ' + message['content'] | trim + eos_token + '' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '### Result: ' }}{% endif %}"
+dataset_mixer:
+  /home/l069561/project/data/processed_data_open_sourced_xml_to_text/merged_open_sourced_xml_to_text_dataset: 1.0
+  # /home/l069561/project/data/sang_data_formatted: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 4
+
+# SFT trainer config
+bf16: true
+do_eval: true
+# evaluation_strategy: epoch
+eval_strategy: epoch
+max_grad_norm: 1.0
+# gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+learning_rate: 2.0e-05
+optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+optim_target_modules: all-linear
+weight_decay: 0.01
+lr_scheduler_type: cosine
+max_seq_length: 8192
+packing: false
+dataset_num_proc: 16
+max_steps: -1
+num_train_epochs: 2
+output_dir: /home/l069561/project/alignment-handbook/experiments/models/sang_exp1_stage1_gemma-2-2b_full
+overwrite_output_dir: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1 # per-device value; compute the global batch size manually as per_device * gradient_accumulation_steps * GPUs per node * nodes
+gradient_accumulation_steps: 4
+push_to_hub: false
+remove_unused_columns: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "steps"
+save_steps: 2000
+save_total_limit: 10
+seed: 42
+warmup_ratio: 0.1
diff --git a/recipes/sang_project/config_full_2.yaml b/recipes/sang_project/config_full_2.yaml
new file mode 100644
index 00000000..67b604bd
--- /dev/null
+++ b/recipes/sang_project/config_full_2.yaml
@@ -0,0 +1,55 @@
+# Model arguments
+model_name_or_path: /home/l069561/project/models/Qwen2-1.5B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+# qwen does not use bos token - https://github.com/QwenLM/Qwen2/issues/486
+chat_template: "{% if messages[0]['role'] == 'system' %}{% set system_message = '### Instruction: ' + messages[0]['content'] | trim + '\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{{ system_message }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### XML Data:\n' + message['content'] | trim + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Summary: ' + message['content'] | trim + eos_token + '' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '### Summary: ' }}{% endif %}"
+dataset_mixer:
+  /home/l069561/project/data/processed_data_open_sourced_xml_to_text/merged_open_sourced_xml_to_text_dataset: 1.0
+  # /home/l069561/project/data/sang_data_formatted: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 8
+dataset_num_proc: 8
+packing: false
+
+# SFT trainer config
+bf16: true
+do_eval: true
+# evaluation_strategy: epoch
+eval_strategy: epoch
+max_grad_norm: 1.0
+# gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+learning_rate: 1.0e-05
+optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+optim_target_modules: all-linear
+weight_decay: 0.01
+lr_scheduler_type: linear
+max_seq_length: 8192
+max_steps: -1
+num_train_epochs: 2
+output_dir: /home/l069561/project/alignment-handbook/experiments/models/sang_exp1_stage1_qwen-2b_full
+overwrite_output_dir: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1 # per-device value; compute the global batch size manually as per_device * gradient_accumulation_steps * GPUs per node * nodes
+gradient_accumulation_steps: 4
+push_to_hub: false
+remove_unused_columns: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "steps"
+save_steps: 1500
+save_total_limit: 10
+seed: 42
+warmup_ratio: 0.1
diff --git a/recipes/sang_project/config_qlora.yaml b/recipes/sang_project/config_qlora.yaml
new file mode 100644
index 00000000..4f343a39
--- /dev/null
+++ b/recipes/sang_project/config_qlora.yaml
@@ -0,0 +1,70 @@
+# Model arguments
+model_name_or_path: /home/l069561/project/models/Meta-Llama-3-8B # no chat template
+model_revision: main
+torch_dtype: bfloat16
+use_flash_attention_2: true
+
+# LoRA arguments
+use_unsloth: false # unsloth does not support deepspeed yet
+use_peft: true
+load_in_4bit: true
+lora_r: 32
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules: all
+# - q_proj
+# - k_proj
+# - v_proj
+# - o_proj
+# - gate_proj
+# - up_proj
+# - down_proj
+
+# Data training arguments
+chat_template: "{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'].strip() + '<|im_end|>\\n' }}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}{% endfor %}"
+dataset_mixer:
+  HuggingFaceH4/ultrachat_200k: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 16
+auto_insert_empty_system_msg: true
+
+# SFT trainer config
+bf16: true
+do_eval: true
+evaluation_strategy: epoch
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+learning_rate: 1.0e-04
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit adamw_apex_fused
+# optim_target_modules: all-linear
+weight_decay: 0.01
+lr_scheduler_type: cosine
+max_seq_length: 4096
+max_steps: -1
+num_train_epochs: 1
+output_dir: /home/l069561/project/alignment-handbook/experiments/models/demo-llama-3-8b-qlora-ultrachat
+overwrite_output_dir: true
+per_device_train_batch_size: 4
+gradient_accumulation_steps: 4
+per_device_eval_batch_size: 4
+push_to_hub: false
+report_to:
+- tensorboard
+- wandb
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
+
+torch_compile: false
+# https://pytorch.org/docs/stable/generated/torch.compile.html ('cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm'])
+# https://huggingface.co/docs/transformers/perf_train_gpu_one#using-torchcompile
+torch_compile_backend: "inductor"
+torch_compile_mode: "default" # reduce-overhead max-autotune
diff --git a/scripts/run_sft.py b/scripts/run_sft.py
index c9bfbbd4..8c74f20d 100644
--- a/scripts/run_sft.py
+++ b/scripts/run_sft.py
@@ -83,11 +83,6 @@ def main():
     logger.info(f"Data parameters {data_args}")
     logger.info(f"Training/evaluation parameters {training_args}")
 
-    # Check for last checkpoint
-    last_checkpoint = get_checkpoint(training_args)
-    if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
-        logger.info(f"Checkpoint detected, resuming training at {last_checkpoint}.")
-
     ###############
     # Load datasets
     ###############
@@ -170,6 +165,10 @@ def main():
     train_dataset = raw_datasets["train"]
     eval_dataset = raw_datasets["test"]
 
+    # dataset_text_field is hard-coded to "text" here instead of being read from the config
+    training_args.dataset_text_field = "text"
+
+    # # no need for logging samples
     # with training_args.main_process_first(
     #     desc="Log a few random samples from the processed training set"
     # ):
@@ -219,7 +218,6 @@ def main():
             args=training_args,
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
-            dataset_text_field="text",
             tokenizer=tokenizer,
             dataset_kwargs=training_args.dataset_kwargs,
             callbacks=[GpuUtilPrintCallBack()],
@@ -230,7 +228,6 @@ def main():
             args=training_args,
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
-            dataset_text_field="text",
             tokenizer=tokenizer,
             peft_config=get_peft_config(model_args),
             dataset_kwargs=training_args.dataset_kwargs,
@@ -242,11 +239,17 @@ def main():
     ###############
     logger.info("*** Train ***")
 
+    # Decide which checkpoint (if any) to resume from: an explicit
+    # resume_from_checkpoint takes precedence over the one returned by get_checkpoint().
+    last_checkpoint = get_checkpoint(training_args)
     checkpoint = None
     if training_args.resume_from_checkpoint is not None:
         checkpoint = training_args.resume_from_checkpoint
     elif last_checkpoint is not None:
         checkpoint = last_checkpoint
+    # Only announce a resume when there is actually a checkpoint to resume from.
+    if checkpoint is not None:
+        logger.info(f"Checkpoint detected, resuming training at {checkpoint}.")
 
     train_result = trainer.train(resume_from_checkpoint=checkpoint)
     metrics = train_result.metrics
diff --git a/src/alignment/model_utils.py b/src/alignment/model_utils.py
index edfbe565..0d7d1e16 100644
--- a/src/alignment/model_utils.py
+++ b/src/alignment/model_utils.py
@@ -94,13 +94,13 @@ def tokenizer_and_embedding_resize(
                 )
 
             tokenizer.add_special_tokens({k: v})
-            model.resize_token_embeddings(len(tokenizer))
-            model.get_input_embeddings().weight.data[-1] = tk_emb
+            model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
+            model.get_input_embeddings().weight.data[len(tokenizer) - 1] = tk_emb  # index by token id (assumes the new token got the last id); with padding, [-1] may be a padding row
 
     # add non special extra tokens
     if non_special_tokens_to_add:
         num_new_tokens = tokenizer.add_tokens(non_special_tokens_to_add)
-        model.resize_token_embeddings(len(tokenizer))
+        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
         if num_new_tokens > 0:
             input_embeddings_data = model.get_input_embeddings().weight.data
             output_embeddings_data = model.get_output_embeddings().weight.data
@@ -151,6 +151,8 @@ def get_tokenizer(
     elif auto_set_chat_template and tokenizer.get_chat_template() is None:
         tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE
 
+    tokenizer.pad_to_multiple_of = 8  # NOTE(review): confirm something reads this attribute; pad_to_multiple_of is normally passed per call or to the data collator — TODO verify
+
     return tokenizer
 
 
diff --git a/src/alignment/utils.py b/src/alignment/utils.py
index 487ee003..06a1eefa 100644
--- a/src/alignment/utils.py
+++ b/src/alignment/utils.py
@@ -1,15 +1,10 @@
+from datetime import datetime
+
 from transformers import TrainerCallback
 
 from pynvml import *
 
 
-class GpuUtilPrintCallBack(TrainerCallback):
-    def on_log(self, args, state, control, logs=None, **kwargs):
-        if state.is_local_process_zero:
-            print(logs)
-            print_gpu_utilization()
-
-
 def print_gpu_utilization():
     nvmlInit()
     handle = nvmlDeviceGetHandleByIndex(0)
@@ -23,6 +18,15 @@ def print_summary(result):
     print_gpu_utilization()
 
 
+class GpuUtilPrintCallBack(TrainerCallback):
+    def on_log(self, args, state, control, logs=None, **kwargs):
+        if state.is_local_process_zero:
+            print(datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S"))
+            print(logs)
+            print_gpu_utilization()
+            # print_summary(args)
+
+
 class ProfCallback(TrainerCallback):
     def __init__(self, prof):
         self.prof = prof