sync
xiyang-aads-lilly committed Aug 2, 2024
2 parents 6e72b98 + 01f29c1 commit 9ff4599
Showing 38 changed files with 253 additions and 639 deletions.
29 changes: 29 additions & 0 deletions CITATION.cff
@@ -0,0 +1,29 @@
cff-version: 1.2.0
title: The Alignment Handbook
message: >-
  Robust recipes to align language models with human and AI
  preferences.
type: software
authors:
  - given-names: Lewis
    family-names: Tunstall
  - given-names: Edward
    family-names: Beeching
  - given-names: Nathan
    family-names: Lambert
  - given-names: Nazneen
    family-names: Rajani
  - given-names: Shengyi
    family-names: Huang
  - given-names: Kashif
    family-names: Rasul
  - given-names: Alvaro
    family-names: Bartolome
  - given-names: Alexander
    name-particle: M.
    family-names: Rush
  - given-names: Thomas
    family-names: Wolf
repository-code: 'https://github.com/huggingface/alignment-handbook'
license: Apache-2.0
version: 0.3.0.dev0
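
Since `CITATION.cff` is plain YAML that GitHub reads to power the "Cite this repository" button, a quick way to sanity-check the new file is to parse it and assemble a citation string. The sketch below is illustrative only (it is not part of the commit) and assumes PyYAML is installed and the file above is saved locally as `CITATION.cff`.

```python
# Minimal sketch: parse CITATION.cff and print a one-line citation.
import yaml

with open("CITATION.cff", encoding="utf-8") as f:
    cff = yaml.safe_load(f)

authors = "; ".join(
    f"{a['family-names']}, {a['given-names']}" for a in cff["authors"]
)
print(f"{authors}. {cff['title']} (v{cff['version']}). {cff['repository-code']}")
```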
19 changes: 9 additions & 10 deletions README.md
@@ -49,8 +49,8 @@ If you would like to train chat models on your own datasets, we recommend follow

The initial release of the handbook will focus on the following techniques:

-* **Continued pretraining:** adapt language models to a new language or domain, or simply improve it by continue pretraning (causal language modeling) on a new dataset.
-* **Supervised fine-tuning:** teach language models to follow instructions and tips on how to collect and curate your own training dataset.
+* **Continued pretraining:** adapt language models to a new language or domain, or simply improve it by continued pretraining (causal language modeling) on a new dataset.
+* **Supervised fine-tuning:** teach language models to follow instructions and tips on how to collect and curate your training dataset.
* **Reward modeling:** teach language models to distinguish model responses according to human or AI preferences.
* **Rejection sampling:** a simple, but powerful technique to boost the performance of your SFT model.
* **Direct preference optimisation (DPO):** a powerful and promising alternative to PPO.
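
The DPO objective in the last bullet is also what the `beta` values in the recipe configs further down control. A minimal, illustrative sketch of the standard "sigmoid" DPO loss — not the handbook's actual implementation, which relies on TRL's `DPOTrainer` — looks like this:

```python
# Illustrative sketch of the DPO "sigmoid" loss on summed per-completion log-probs.
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    pi_logratios = policy_chosen_logps - policy_rejected_logps
    ref_logratios = ref_chosen_logps - ref_rejected_logps
    logits = pi_logratios - ref_logratios  # implicit reward margin
    return -F.logsigmoid(beta * logits).mean()

# Toy usage with random log-probs for a batch of 8 preference pairs.
pc, pr, rc, rr = torch.randn(4, 8).unbind(0)
print(dpo_loss(pc, pr, rc, rr, beta=0.1))
```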
@@ -115,15 +115,14 @@ You can now check out the `scripts` and `recipes` directories for instructions o

## Citation

-If you find the content of this repo useful in your work, please cite it as follows:
+If you find the content of this repo useful in your work, please cite it as follows via `\usepackage{biblatex}`:

```bibtex
-@misc{alignment_handbook2023,
-author = {Lewis Tunstall and Edward Beeching and Nathan Lambert and Nazneen Rajani and Shengyi Huang and Kashif Rasul and Alexander M. Rush and Thomas Wolf},
-title = {The Alignment Handbook},
-year = {2023},
-publisher = {GitHub},
-journal = {GitHub repository},
-howpublished = {\url{https://github.com/huggingface/alignment-handbook}}
+@software{Tunstall_The_Alignment_Handbook,
+author = {Tunstall, Lewis and Beeching, Edward and Lambert, Nathan and Rajani, Nazneen and Huang, Shengyi and Rasul, Kashif and Bartolome, Alvaro and M. Rush, Alexander and Wolf, Thomas},
+license = {Apache-2.0},
+title = {{The Alignment Handbook}},
+url = {https://github.com/huggingface/alignment-handbook},
+version = {0.3.0.dev0}
}
```
2 changes: 1 addition & 1 deletion recipes/constitutional-ai/README.md
@@ -21,4 +21,4 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con

## Advanced: generating your own dataset

-To generate the constitutional AI dataset, see https://github.com/huggingface/llm-swarm/tree/main/examples/constitutional-ai for detailed instructions if you want build or customize the dataset.
+To generate the constitutional AI dataset, see https://github.com/huggingface/llm-swarm/tree/main/examples/constitutional-ai for detailed instructions if you want to build or customize the dataset.
2 changes: 1 addition & 1 deletion recipes/constitutional-ai/dpo/config_anthropic.yaml
@@ -17,7 +17,7 @@ bf16: true
beta: 0.1
do_eval: true
do_train: true
-evaluation_strategy: steps
+eval_strategy: steps
eval_steps: 1000
gradient_accumulation_steps: 1
gradient_checkpointing: true
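The `evaluation_strategy` → `eval_strategy` renames in this and the following configs track a deprecation in recent `transformers` releases, where the old argument name on `TrainingArguments` (and the TRL configs that subclass it) was replaced. A minimal sketch of the new spelling — the output directory is an assumed placeholder, not a value from the recipe:

```python
# Sketch only: the handbook builds these arguments from the YAML recipes via its
# own parser; this just shows the renamed keyword on recent transformers versions.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="outputs/constitutional-ai-dpo",  # assumed path
    do_eval=True,
    eval_strategy="steps",   # formerly `evaluation_strategy`
    eval_steps=1000,
    gradient_accumulation_steps=1,
)
print(args.eval_strategy)
```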
4 changes: 2 additions & 2 deletions recipes/constitutional-ai/sft/config_anthropic.yaml
@@ -2,7 +2,7 @@
model_name_or_path: mistralai/Mistral-7B-v0.1
model_revision: main
torch_dtype: bfloat16
-use_flash_attention_2: true
+attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
@@ -18,7 +18,7 @@ preprocessing_num_workers: 12
bf16: true
do_eval: true
do_train: true
-evaluation_strategy: epoch # One of ["no", "steps", "epoch"]
+eval_strategy: epoch # One of ["no", "steps", "epoch"]
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
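Likewise, `use_flash_attention_2: true` is replaced throughout by `attn_implementation: flash_attention_2`, mirroring the `transformers` model-loading API. A hedged sketch of what that maps to at load time (model name taken from the recipe above; the `flash-attn` package and a supported GPU are required, with `"sdpa"` as a fallback):

```python
# Sketch of how the recipe's model arguments translate to a from_pretrained call.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # or "sdpa" without flash-attn
)
```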
4 changes: 2 additions & 2 deletions recipes/constitutional-ai/sft/config_grok.yaml
@@ -2,7 +2,7 @@
model_name_or_path: mistralai/Mistral-7B-v0.1
model_revision: main
torch_dtype: bfloat16
-use_flash_attention_2: true
+attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
@@ -18,7 +18,7 @@ preprocessing_num_workers: 12
bf16: true
do_eval: true
do_train: true
-evaluation_strategy: epoch # One of ["no", "steps", "epoch"]
+eval_strategy: epoch # One of ["no", "steps", "epoch"]
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
4 changes: 2 additions & 2 deletions recipes/gpt2-nl/README.md
@@ -2,7 +2,7 @@

This directory shows a base example of how to use continued pretraining and further tuning to adapt a language model to new data (e.g. a new language or domain).

-Three steps are needed: continued pretraining (`cpt`), supervised finetuning (`sft`), and direct preference optimisation (`dpo`). In this dummy example we'll continue pretraining gpt2 on Dutch raw data, then sft-tuning it, and finally aligning it with DPO. Note that no extensive hyperparameters were tested in this example and that the output models are bad - it is just to show you how you can use the scripts for LM adaptation. The scripts work on 4x 3090s (24GB VRAM). If you have less powerful hardware you may need to reduce the batch size.
+Three steps are needed: continued pretraining (`cpt`), supervised finetuning (`sft`), and direct preference optimisation (`dpo`). In this dummy example, we'll continue pretraining gpt2 on Dutch raw data, then sft-tuning it, and finally aligning it with DPO. Note that no extensive hyperparameters were tested in this example and that the output models are bad - it is just to show you how you can use the scripts for LM adaptation. The scripts work on 4x 3090s (24GB VRAM). If you have less powerful hardware you may need to reduce the batch size.
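
Continued pretraining here is plain causal language modeling on raw text, so the data preparation boils down to tokenising documents and packing them into fixed-length blocks. A generic sketch of that packing step (the dataset name and block size are placeholders, not the recipe's actual values):

```python
# Generic packing sketch for continued pretraining: concatenate tokenised raw
# text and split it into fixed-length blocks for causal LM training.
from itertools import chain

from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
raw = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")  # placeholder corpus
block_size = 512  # placeholder

def tokenize(batch):
    return tokenizer(batch["text"])

def group_texts(batch):
    ids = list(chain.from_iterable(batch["input_ids"]))
    total = (len(ids) // block_size) * block_size
    chunks = [ids[i : i + block_size] for i in range(0, total, block_size)]
    return {"input_ids": chunks, "labels": [c[:] for c in chunks]}

tokenized = raw.map(tokenize, batched=True, remove_columns=raw.column_names)
packed = tokenized.map(group_texts, batched=True, remove_columns=tokenized.column_names)
print(packed)
```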

## Continued pretraining

@@ -18,7 +18,7 @@ ACCELERATE_LOG_LEVEL=info accelerate launch \

## Supervised finetuning

-As other recipes, such as the famous zephyr-7b-beta recipe, have shown, we can then teach our model how to hold a conversation by finetuning it on chat-formatted data. As a base model we'll make use of the output of the previous step.
+As other recipes, such as the famous zephyr-7b-beta recipe, have shown, we can then teach our model how to hold a conversation by finetuning it on chat-formatted data. As a base model, we'll make use of the output of the previous step.

```shell
ACCELERATE_LOG_LEVEL=info accelerate launch \
2 changes: 1 addition & 1 deletion recipes/gpt2-nl/cpt/config_full.yaml
@@ -15,7 +15,7 @@ preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: False
-evaluation_strategy: "no"
+eval_strategy: "no"
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
2 changes: 1 addition & 1 deletion recipes/gpt2-nl/dpo/config_full.yaml
@@ -16,7 +16,7 @@ preprocessing_num_workers: 12
bf16: true
beta: 0.1
do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 8
gradient_checkpointing: true
2 changes: 1 addition & 1 deletion recipes/gpt2-nl/sft/config_full.yaml
@@ -15,7 +15,7 @@ preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: true
-evaluation_strategy: epoch
+eval_strategy: epoch
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
5 changes: 3 additions & 2 deletions recipes/pref_align_scan/README.md
@@ -5,13 +5,14 @@ This directory contains various comparisons for three algorithms: DPO, IPO, and
- OpenHermes-2.5 and the OpenOrca datasets

We release a collection containing the datasets and models used for these experiments; if you require the other trained models, we can release them on request.
-You can find a longer decription of there results in our [blogpost](https://huggingface.co/blog/pref-tuning)
+You can find a longer description of these results in our [blogpost](https://huggingface.co/blog/pref-tuning)

## Comparisons
For each algorithm, we aim to tune the beta parameter for a fixed learning rate. We vary beta from 0.1 to 0.9 in steps of 0.1; we have also found that in certain configurations a tiny value of beta, 0.01, can be effective, so we have included this smaller value in all our comparisons.
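
Concretely, the sweep described above amounts to the following per-algorithm grid (a tiny illustrative snippet, not the launcher script from the repo, which appears below):

```python
# Beta grid swept for each algorithm (DPO, IPO, KTO): 0.01 plus 0.1..0.9.
betas = [0.01] + [round(0.1 * i, 1) for i in range(1, 10)]
print(betas)  # [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
```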

## Usage
The experiments can be launched with the following bash script:
-```
+```bash
#!/bin/bash

# Define an array containing the base configs we wish to fine tune
2 changes: 1 addition & 1 deletion recipes/pref_align_scan/dpo/config_openhermes.yaml
@@ -16,7 +16,7 @@ beta: 0.01
loss_type: sigmoid
do_eval: true
do_train: true
-evaluation_strategy: steps
+eval_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 2
gradient_checkpointing: true
2 changes: 1 addition & 1 deletion recipes/pref_align_scan/dpo/config_zephyr.yaml
@@ -15,7 +15,7 @@ bf16: true
beta: 0.01
loss_type: sigmoid
do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 2
gradient_checkpointing: true
2 changes: 1 addition & 1 deletion recipes/starchat2-15b/dpo/config_v0.1.yaml
@@ -16,7 +16,7 @@ preprocessing_num_workers: 12
bf16: true
beta: 0.05
do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 8
gradient_checkpointing: true
4 changes: 2 additions & 2 deletions recipes/starchat2-15b/sft/config_v0.1.yaml
@@ -2,7 +2,7 @@
model_name_or_path: bigcode/starcoder2-15b
model_revision: main
torch_dtype: bfloat16
-use_flash_attention_2: true
+attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
@@ -20,7 +20,7 @@ preprocessing_num_workers: 24
# SFT trainer config
bf16: true
do_eval: true
-evaluation_strategy: epoch
+eval_strategy: epoch
gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
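The ChatML-style `chat_template` in the StarChat2 config above is consumed by the tokenizer at training time. An illustrative sketch of how such a template is applied to a toy conversation — it uses the small `gpt2` tokenizer purely so the snippet runs anywhere, whereas the recipes attach the template to the model's own tokenizer:

```python
# Apply the ChatML chat template from the recipe to a toy conversation.
from transformers import AutoTokenizer

chatml = (
    "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}"
    "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)

tok = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer for the sketch
tok.chat_template = chatml

messages = [
    {"role": "system", "content": "You are a helpful coding assistant."},
    {"role": "user", "content": "Write a hello-world in Rust."},
]
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```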
2 changes: 1 addition & 1 deletion recipes/zephyr-141b-A35b/orpo/config_full.yaml
@@ -2,7 +2,7 @@
model_name_or_path: mistral-community/Mixtral-8x22B-v0.1
model_revision: main
torch_dtype: bfloat16
-use_flash_attention_2: true
+attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
10 changes: 5 additions & 5 deletions recipes/zephyr-7b-beta/README.md
@@ -4,9 +4,9 @@
As described in the Zephyr [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:

1. Apply SFT to fine-tune Mistral 7B on a filtered version of the UltraChat dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)). The result is an SFT model like [`zephyr-7b-sft-full`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) or [`zephyr-7b-sft-qlora`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-qlora).
-2. Align the SFT model to AI feedback via DPO on a preprocessed version of the UltraFeedback dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)). The result is an DPO model like [`zephyr-7b-dpo-full`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-full) or [`zephyr-7b-dpo-qlora`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-qlora).
+2. Align the SFT model to AI feedback via DPO on a preprocessed version of the UltraFeedback dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)). The result is a DPO model like [`zephyr-7b-dpo-full`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-full) or [`zephyr-7b-dpo-qlora`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-qlora).

-**Note:** after the release of Zephyr, the team at [Argilla](https://argilla.io) found that the source UltraFeedback dataset had a few thousand incorrect preference labels from GPT-4. Additionally, TRL's `SFTTrainer` had a bug in the learning rate scheduler which terminated training early. Accounting for these changes led us to find a better set of hyperparameters from those described in the technical report. In particular, for DPO training we found that training for 1 epoch with `beta=0.01` was suffucient to achieve comparable performance to `zephyr-7b-beta` (vs. 3 epochs with `beta=0.1`).
+**Note:** after the release of Zephyr, the team at [Argilla](https://argilla.io) found that the source UltraFeedback dataset had a few thousand incorrect preference labels from GPT-4. Additionally, TRL's `SFTTrainer` had a bug in the learning rate scheduler which terminated training early. Accounting for these changes led us to find a better set of hyperparameters from those described in the technical report. In particular, for DPO training we found that training for 1 epoch with `beta=0.01` was sufficient to achieve comparable performance to `zephyr-7b-beta` (vs. 3 epochs with `beta=0.1`).

See below for commands to train these models using either DeepSpeed ZeRO-3 or LoRA.

@@ -34,11 +34,11 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con

P.S. Using Flash Attention also allows you to drastically increase the batch size (x2 in my case)

-Train without flash-attention:
+Train without flash-attention (i.e. via PyTorch's scaled dot product attention):
```````shell
# Step 1 - SFT
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_qlora.yaml --load_in_4bit=true --use_flash_attention_2=false
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_qlora.yaml --load_in_4bit=true --attn_implementation=sdpa

# Step 2 - DPO
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_qlora.yaml --use_flash_attention_2=false
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_qlora.yaml --attn_implementation=sdpa
```````
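
For reference, the `--load_in_4bit=true --attn_implementation=sdpa` overrides in the QLoRA commands above roughly correspond to the following model-loading sketch. It is illustrative only (the actual scripts drive this through their own argument parser) and requires `bitsandbytes` plus a CUDA GPU:

```python
# Sketch of loading the base model in 4-bit with PyTorch SDPA attention,
# i.e. the non-flash-attention QLoRA path used in the commands above.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
)
```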
2 changes: 1 addition & 1 deletion recipes/zephyr-7b-beta/dpo/config_full.yaml
@@ -15,7 +15,7 @@ preprocessing_num_workers: 12
bf16: true
beta: 0.01
do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 2
gradient_checkpointing: true
6 changes: 3 additions & 3 deletions recipes/zephyr-7b-beta/dpo/config_qlora.yaml
@@ -1,7 +1,7 @@
# Model arguments
model_name_or_path: alignment-handbook/zephyr-7b-sft-qlora
torch_dtype: bfloat16
-use_flash_attention_2: true
+attn_implementation: flash_attention_2

# LoRA arguments
use_peft: true
@@ -31,7 +31,7 @@ preprocessing_num_workers: 12
bf16: true
beta: 0.01
do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 4
gradient_checkpointing: true
@@ -54,4 +54,4 @@ save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
-warmup_ratio: 0.1
+warmup_ratio: 0.1
4 changes: 2 additions & 2 deletions recipes/zephyr-7b-beta/sft/config_full.yaml
@@ -2,7 +2,7 @@
model_name_or_path: mistralai/Mistral-7B-v0.1
model_revision: main
torch_dtype: bfloat16
-use_flash_attention_2: true
+attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
@@ -16,7 +16,7 @@ preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: true
-evaluation_strategy: epoch
+eval_strategy: epoch
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
8 changes: 4 additions & 4 deletions recipes/zephyr-7b-beta/sft/config_qlora.yaml
@@ -2,7 +2,7 @@
model_name_or_path: mistralai/Mistral-7B-v0.1
model_revision: main
torch_dtype: bfloat16
-use_flash_attention_2: true
+attn_implementation: flash_attention_2

# LoRA arguments
load_in_4bit: true
@@ -31,7 +31,7 @@ preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: true
-evaluation_strategy: epoch
+eval_strategy: epoch
gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
@@ -40,7 +40,7 @@ hub_model_id: zephyr-7b-sft-qlora
hub_strategy: every_save
learning_rate: 2.0e-04
log_level: info
-logging_steps: 5
+logging_steps: 5
logging_strategy: steps
lr_scheduler_type: cosine
max_seq_length: 2048
@@ -57,4 +57,4 @@ save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1
warmup_ratio: 0.1
2 changes: 1 addition & 1 deletion recipes/zephyr-7b-gemma/dpo/config_full.yaml
@@ -15,7 +15,7 @@ preprocessing_num_workers: 12
bf16: true
beta: 0.05
do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 8
gradient_checkpointing: true
4 changes: 2 additions & 2 deletions recipes/zephyr-7b-gemma/sft/config_full.yaml
@@ -3,7 +3,7 @@ model_name_or_path: google/gemma-7b
model_revision: main
tokenizer_name_or_path: philschmid/gemma-tokenizer-chatml # Custom tokenizer with <|im_start|> and <|im_end|> tokens
torch_dtype: bfloat16
-use_flash_attention_2: true
+attn_implementation: flash_attention_2

# Data training arguments
dataset_mixer:
@@ -19,7 +19,7 @@ dataset_kwargs:
add_special_tokens: false # We already wrap <bos> and <eos> in the chat template
append_concat_token: false # No need to add <eos> across samples
do_eval: true
-evaluation_strategy: epoch
+eval_strategy: epoch
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs: