From 05563e1f39e12367c47ef86a921e3c4d887a4d32 Mon Sep 17 00:00:00 2001
From: Charles Tang <j316chuck@users.noreply.github.com>
Date: Thu, 5 Dec 2024 08:44:25 -0800
Subject: [PATCH] Add llama3 ft example yamls (#1686)

Co-authored-by: Chuck Tang <chuck.tang@databricks.com>
Co-authored-by: Saaketh Narayan <narayan.saaketh@gmail.com>
---
 mcli/mcli-llama2-finetune.yaml              |  19 ++-
 mcli/mcli-llama3-70b-instruct-finetune.yaml | 158 ++++++++++++++++++++
 2 files changed, 169 insertions(+), 8 deletions(-)
 create mode 100644 mcli/mcli-llama3-70b-instruct-finetune.yaml

diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml
index 8f3b6bac4e..d675cb9767 100644
--- a/mcli/mcli-llama2-finetune.yaml
+++ b/mcli/mcli-llama2-finetune.yaml
@@ -21,9 +21,12 @@ compute:
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
-  tokenizer_name: meta-llama/Llama-2-7b-hf
-  max_seq_len: 4096
-  global_seed: 17
+  variables:  # Shared values; referenced elsewhere in this file as ${variables.<name>}
+    tokenizer_name: meta-llama/Llama-2-7b-hf
+    global_seed: 17
+    max_seq_len: 4096
+
+  max_seq_len: ${variables.max_seq_len}  # Top-level key, resolved from variables.max_seq_len
 
   # Run Name
   run_name:  # If left blank, will be read from env var $RUN_NAME
@@ -42,9 +45,9 @@ parameters:
 
   # Tokenizer
   tokenizer:
-    name: ${tokenizer_name}
+    name: ${variables.tokenizer_name}
     kwargs:
-      model_max_length: ${max_seq_len}
+      model_max_length: ${variables.max_seq_len}
 
   # Dataloaders
   train_loader:
@@ -52,7 +55,7 @@ parameters:
     dataset:
       hf_name: mosaicml/dolly_hhrlhf
       split: train
-      max_seq_len: ${max_seq_len}
+      max_seq_len: ${variables.max_seq_len}
       allow_pad_trimming: false
       decoder_only_format: true
       shuffle: true
@@ -75,7 +78,7 @@ parameters:
     dataset:
       hf_name: mosaicml/dolly_hhrlhf
       split: test
-      max_seq_len: ${max_seq_len}
+      max_seq_len: ${variables.max_seq_len}
       allow_pad_trimming: false
       decoder_only_format: true
       # packing_ratio:
@@ -114,7 +117,7 @@ parameters:
   global_train_batch_size: 64
 
   # System
-  seed: ${global_seed}
+  seed: ${variables.global_seed}
   device_eval_batch_size: 8
   device_train_microbatch_size: auto
   precision: amp_bf16
diff --git a/mcli/mcli-llama3-70b-instruct-finetune.yaml b/mcli/mcli-llama3-70b-instruct-finetune.yaml
new file mode 100644
index 0000000000..1bb3f17b01
--- /dev/null
+++ b/mcli/mcli-llama3-70b-instruct-finetune.yaml
@@ -0,0 +1,158 @@
+integrations:
+- integration_type: git_repo
+  git_repo: mosaicml/llm-foundry
+  git_branch: v0.15.0
+  # git_commit:  # Alternatively, pin the checkout to a specific commit hash
+  pip_install: .[gpu]
+  ssh_clone: false  # Should be true if using a private repo
+
+command: |  # Runs train/train.py via composer against the injected /mnt/config/parameters.yaml
+  cd llm-foundry/scripts
+  composer train/train.py /mnt/config/parameters.yaml
+image: mosaicml/llm-foundry:2.5.1_cu124-latest
+name: llama3.1-70b-finetune
+
+compute:
+  # Note: Fine-tuning the 70B model requires at least 16 GPUs with 80GB of memory each
+  gpus: 16  # Number of GPUs to use
+  # The settings below are optional
+  # cluster: TODO  # Name of the cluster to use for this run
+  # gpu_type: h100_80gb  # Type of GPU to use. We use h100_80gb in our experiments
+
+# Everything under `parameters` is injected as a YAML file: /mnt/config/parameters.yaml
+parameters:
+  variables:  # Shared values; referenced below as ${variables.<name>}
+    tokenizer_name: meta-llama/Llama-3.1-70B-Instruct
+    global_seed: 17
+    max_seq_len: 4096
+
+  max_seq_len: ${variables.max_seq_len}
+  # Run Name
+  run_name:  # If left blank, will be read from env var $RUN_NAME
+
+  max_split_size_mb: 512
+
+  # Model
+  model:
+    name: hf_causal_lm
+    init_device: mixed
+    pretrained_model_name_or_path: meta-llama/Llama-3.1-70B-Instruct  # Same identifier as variables.tokenizer_name; keep in sync
+    pretrained: true
+    # Note: the HF_TOKEN environment variable must be set, and you must have access to the Llama 3.1 models
+    use_auth_token: true
+    use_flash_attention_2: true
+
+  # Tokenizer
+  tokenizer:
+    name: ${variables.tokenizer_name}
+    kwargs:
+      model_max_length: ${variables.max_seq_len}
+  # Dataloaders
+  train_loader:
+    name: finetuning
+    dataset:
+      hf_name: mosaicml/dolly_hhrlhf
+      split: train
+      max_seq_len: ${variables.max_seq_len}
+      allow_pad_trimming: false
+      decoder_only_format: true
+      shuffle: true
+      # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with
+      # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion
+      # of the dataset.
+      # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
+      # to profile this run's optimal packing_ratio, since it depends on GPU count,
+      # batch size, and sequence length.
+      # packing_ratio: auto
+    drop_last: true
+    num_workers: 8
+    pin_memory: false
+    prefetch_factor: 2
+    persistent_workers: true
+    timeout: 0
+
+  eval_loader:
+    name: finetuning
+    dataset:
+      hf_name: mosaicml/dolly_hhrlhf
+      split: test
+      max_seq_len: ${variables.max_seq_len}
+      allow_pad_trimming: false
+      decoder_only_format: true
+      # packing_ratio:
+      shuffle: false
+    drop_last: true
+    num_workers: 8
+    pin_memory: false
+    prefetch_factor: 2
+    persistent_workers: true
+    timeout: 0
+
+  # Optimization
+  scheduler:
+    name: cosine_with_warmup
+    t_warmup: 100ba
+    alpha_f: 0.1
+
+  # Note: You may want to tune the learning rate, betas, and weight decay
+  optimizer:
+    name: decoupled_lionw
+    lr: 5.0e-7
+    betas:
+    - 0.9
+    - 0.95
+    weight_decay: 0.0
+
+  algorithms:
+    gradient_clipping:
+      clipping_type: norm
+      clipping_threshold: 1.0
+
+  max_duration: 1ep
+  eval_first: false
+  eval_interval: 1ep
+  eval_subset_num_batches: -1
+  global_train_batch_size: 16
+
+  # System
+  seed: ${variables.global_seed}
+  device_eval_batch_size: 1
+  device_train_microbatch_size: 1
+  precision: amp_bf16
+
+  # FSDP
+  fsdp_config:
+    state_dict_type: sharded  # Note: we enable sharded checkpointing to avoid GPU OOM
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true
+
+  # Logging
+  progress_bar: false
+  log_to_console: true
+  console_log_interval: 1ba
+
+  callbacks:
+    speed_monitor:
+      window_size: 10
+    lr_monitor: {}
+    memory_monitor: {}
+    runtime_estimator: {}
+
+  load_weights_only: true  # Only load the weights, not the optimizer state, LR schedule, etc.
+
+#   loggers:
+#     wandb: {}
+
+#   Checkpoint to local filesystem or remote object store
+#   save_interval: 2000ba
+#   save_num_checkpoints_to_keep: 1  # Important, this cleans up checkpoints saved to DISK
+#   save_folder: ./{run_name}/checkpoints
+#   save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints
+
+#   Load from local filesystem or remote object store
+#   load_path: ./gpt-1b/checkpoints/latest-rank{rank}.pt
+#   load_path: s3://my-bucket/my-folder/gpt-1b/checkpoints/latest-rank{rank}.pt