fix bugs
xiyang-aads-lilly committed May 30, 2024
1 parent 6dc510f commit ffd77dd
Showing 12 changed files with 115 additions and 30 deletions.
15 changes: 14 additions & 1 deletion experiments/demo_dgx2.sh
@@ -7,8 +7,13 @@ echo activate virtual ENV
PYTHON_ENV=${ROOT}/project/scripts/v2306.sh
source $PYTHON_ENV


# CUDA
export CUDA_VISIBLE_DEVICES=0,1
export CUDA_LAUNCH_BLOCKING="1"

# number of GPUs; here we use all GPUs for demo
WORLD_SIZE=3
WORLD_SIZE=2

# HF cache
export TMPDIR="${ROOT}/project/.cache/"
@@ -32,9 +37,17 @@ export ACCELERATE_LOG_LEVEL=debug
export ACCELERATE_DEBUG_MODE="1"
export DEEPSPEED_TIMEOUT=120

# accelerate launch
accelerate launch \
--config_file ${ROOT}/project/alignment_handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
--num_processes $WORLD_SIZE \
--tee 3 \
${ROOT}/project/alignment_handbook/scripts/run_sft.py \
${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml
# ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_full.yaml


# deepspeed launch


# torch launch
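The script above ends with empty `# deepspeed launch` and `# torch launch` placeholders. As a hedged sketch only (not part of this commit), equivalent multi-GPU invocations of the same training script might look like the block below; `--num_gpus` and `--nproc_per_node` are standard `deepspeed`/`torchrun` flags, while the paths, `$WORLD_SIZE`, and the note about wiring a DeepSpeed JSON into the recipe are assumptions carried over from the accelerate example.

```bash
# Hedged sketch -- not part of this commit. Paths and $WORLD_SIZE reuse the
# values defined earlier in demo_dgx2.sh.

# deepspeed launcher: spawns one process per visible GPU. Enabling ZeRO this way
# would additionally require pointing the recipe (or CLI) at a DeepSpeed JSON
# such as recipes/accelerate_configs/deepspeed_zs2.json (assumption).
deepspeed --num_gpus $WORLD_SIZE \
    ${ROOT}/project/alignment_handbook/scripts/run_sft.py \
    ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml

# torchrun: plain PyTorch distributed launch; RANK/LOCAL_RANK/WORLD_SIZE are set
# by torchrun, and the script falls back to DDP unless DeepSpeed is configured.
torchrun --nproc_per_node $WORLD_SIZE \
    ${ROOT}/project/alignment_handbook/scripts/run_sft.py \
    ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml
```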
3 changes: 0 additions & 3 deletions experiments/demo_dgx2_launch.sh
@@ -5,9 +5,6 @@ ROOT=$(realpath ~)
# singularity container
CONTAINER=${ROOT}/project/singularity_containers/py2402.sig

# CUDA
export CUDA_VISIBLE_DEVICES=0,1

# PATH
DEMO_PATH=${ROOT}/project/alignment_handbook/experiments

Empty file.
16 changes: 8 additions & 8 deletions recipes/accelerate_configs/deepspeed_zero2.yaml
@@ -1,15 +1,15 @@
compute_environment: LOCAL_MACHINE
debug: true
deepspeed_config:
deepspeed_config_file: /home/l069561/project/alignment-handbook/recipes/accelerate_configs/deepspeed_zs2.json
zero3_init_flag: true
# deepspeed_multinode_launcher: standard
# offload_optimizer_device: none
# offload_param_device: none
# deepspeed_config_file: '/home/l069561/project/alignment_handbook/recipes/accelerate_configs/ds_acc_conf.json'
# zero3_init_flag: true
# zero3_save_16bit_model: false
# zero_stage: 2
# mixed_precision: bf16
deepspeed_multinode_launcher: standard
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: true
zero3_save_16bit_model: false
zero_stage: 2
mixed_precision: bf16
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
2 changes: 1 addition & 1 deletion recipes/accelerate_configs/deepspeed_zs2.json
@@ -41,7 +41,7 @@
"reduce_bucket_size": "auto",
"contiguous_gradients": true
},
"gradient_accumulation_steps": 1,
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
23 changes: 23 additions & 0 deletions recipes/accelerate_configs/ds_acc_conf.json
@@ -0,0 +1,23 @@
{
    "bf16": {
        "enabled": true
    },
    "zero_optimization": {
        "stage": 2,
        "stage3_gather_16bit_weights_on_model_save": true,
        "offload_optimizer": {
            "device": "none"
        },
        "offload_param": {
            "device": "none"
        }
    },
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "steps_per_print": 200000,
    "fp16": {
        "enabled": false
    }
}
24 changes: 21 additions & 3 deletions recipes/accelerate_configs/readme.md
@@ -1,7 +1,6 @@
## deepspeed optimizers
- DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, OneBitLamb, FusedLamb, FusedAdam
- see https://deepspeed.readthedocs.io/en/latest/optimizers.html for details on how to configure them
-
```json
{
"optimizer": {
@@ -20,7 +19,7 @@
"factor_min": 0.5,
"factor_threshold": 0.1
}
},
}
}

{
@@ -33,6 +32,25 @@
"max_coeff": 0.3,
"min_coeff": 0.01
}
},
}
}
```

- fp16 vs bf16
```json
{
    "fp16": {
        "enabled": false,
        "loss_scale": 0,
        "auto_cast": false,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "consecutive_hysteresis": false,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": true
    }
}
```
3 changes: 2 additions & 1 deletion recipes/llama3-8b/sft/config_full.yaml
@@ -11,7 +11,7 @@ dataset_mixer:
dataset_splits:
- train_sft
- test_sft
preprocessing_num_workers: 8
preprocessing_num_workers: 32

# SFT trainer config
bf16: true
@@ -27,6 +27,7 @@ logging_steps: 5
logging_strategy: steps
learning_rate: 2.0e-05
optim: galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
optim_target_modules: all-linear
weight_decay: 0.01
lr_scheduler_type: cosine
max_seq_length: 4096
16 changes: 10 additions & 6 deletions recipes/llama3-8b/sft/config_qlora.yaml
@@ -5,8 +5,9 @@ torch_dtype: bfloat16
use_flash_attention_2: true

# LoRA arguments
load_in_4bit: true
use_unsloth: false # unsloth does not support deepspeed yet
use_peft: true
load_in_4bit: true
lora_r: 32
lora_alpha: 32
lora_dropout: 0.05
@@ -26,33 +27,36 @@ dataset_mixer:
dataset_splits:
- train_sft
- test_sft
preprocessing_num_workers: 16
preprocessing_num_workers: 32
auto_insert_empty_system_msg: true

# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: epoch
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
learning_rate: 1.0e-04
log_level: info
logging_steps: 5
logging_strategy: steps
optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
# optim_target_modules: all-linear
weight_decay: 0.01
lr_scheduler_type: cosine
max_seq_length: 4096
max_seq_length: 2048
max_steps: -1
num_train_epochs: 1
output_dir: /home/l069561/project/models/fine-tuned/demo-llama-3-8b-lora-ultrachat
overwrite_output_dir: true
per_device_eval_batch_size: 8
per_device_eval_batch_size: 2
gradient_accumulation_steps: 32
per_device_train_batch_size: 4
push_to_hub: false
report_to:
- tensorboard
# - wandb
- wandb
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
5 changes: 4 additions & 1 deletion requirements.txt
@@ -19,4 +19,7 @@ flash-attn>=2.1.0
pynvml>=11.4.0

# optional
galore-torch
galore-torch

# unsloth
# with NV pytorch container install -> pip install git+https://github.com/unslothai/unsloth.git --no-deps
20 changes: 16 additions & 4 deletions scripts/run_sft.py
@@ -141,6 +141,7 @@ def main():
if (
"<|im_start|>" in tokenizer.chat_template
and "gemma-tokenizer-chatml" not in tokenizer.name_or_path
and not model_args.use_unsloth
):
model = AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path, **model_kwargs
@@ -151,6 +152,7 @@
#####################
# Apply chat template
#####################
logger.info("*** apply chat template ***")
raw_datasets = raw_datasets.map(
apply_chat_template,
fn_kwargs={
@@ -192,22 +194,31 @@
########################
# Initialize the Trainer
########################

if model_args.use_unsloth:
logger.info("*** use unsloth ***")
from alignment.unsloth import get_unsloth_peft_model

peft_config = get_peft_config(model_args)
model = AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path, **model_kwargs
model, tokenizer = get_unsloth_peft_model(
model_args.model_name_or_path,
training_args.max_seq_length,
peft_config.to_dict(),
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_unsloth_peft_model(model, training_args.max_seq_length, peft_config)

if (
"<|im_start|>" in tokenizer.chat_template
and "gemma-tokenizer-chatml" not in tokenizer.name_or_path
):
model, tokenizer = setup_chat_format(model, tokenizer)

trainer = SFTTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
dataset_text_field="text",
dataset_num_proc=data_args.preprocessing_num_workers,
max_seq_length=training_args.max_seq_length,
tokenizer=tokenizer,
packing=True,
@@ -222,6 +233,7 @@ def main():
train_dataset=train_dataset,
eval_dataset=eval_dataset,
dataset_text_field="text",
dataset_num_proc=data_args.preprocessing_num_workers,
max_seq_length=training_args.max_seq_length,
tokenizer=tokenizer,
packing=True,
18 changes: 16 additions & 2 deletions src/alignment/unsloth.py
@@ -66,18 +66,32 @@ def load_unsloth_pretrained_model(config, model_args):
return model


def get_unsloth_peft_model(model, max_seq_length, peft_kwargs: Dict[str, Any]):
def get_unsloth_peft_model(model_name, max_seq_length, peft_kwargs: Dict[str, Any]):
r"""
Gets the peft model for the pretrained model with unsloth. Used in training.
"""
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
model_name=model_name,
max_seq_length=max_seq_length,
dtype=None,
load_in_4bit=True,
)

unsloth_peft_kwargs = {
"model": model,
"max_seq_length": max_seq_length,
"use_gradient_checkpointing": "unsloth",
}
return FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)

peft_kwargs["lora_dropout"] = 0.0
peft_kwargs.pop("task_type", None)

return (
FastLanguageModel.get_peft_model(**unsloth_peft_kwargs, **peft_kwargs),
tokenizer,
)


def load_unsloth_peft_model(config, model_args, is_trainable: bool):
