diff --git a/docs/source/dpo_trainer.mdx b/docs/source/dpo_trainer.mdx
index 0b5020dbad..068f18b312 100644
--- a/docs/source/dpo_trainer.mdx
+++ b/docs/source/dpo_trainer.mdx
@@ -150,6 +150,7 @@ The DPO algorithm supports several loss functions. The loss function can be set
 | `"sppo_hard"` | The [SPPO](https://huggingface.co/papers/2405.00675) authors claim that SPPO is capable of solving the Nash equilibrium iteratively by pushing the chosen rewards to be as large as 1/2 and the rejected rewards to be as small as -1/2 and can alleviate data sparsity issues. The implementation approximates this algorithm by employing hard label probabilities, assigning 1 to the winner and 0 to the loser. |
 | `"aot"` or `loss_type="aot_pair"` | The [AOT](https://huggingface.co/papers/2406.05882) authors propose to use Distributional Preference Alignment Via Optimal Transport. Traditionally, the alignment algorithms use paired preferences at a sample level, which does not ensure alignment on the distributional level. AOT, on the other hand, can align LLMs on paired or unpaired preference data by making the reward distribution of the positive samples stochastically dominant in the first order on the distribution of negative samples. Specifically, `loss_type="aot"` is appropriate for paired datasets, where each prompt has both chosen and rejected responses; `loss_type="aot_pair"` is for unpaired datasets. In a nutshell, `loss_type="aot"` ensures that the log-likelihood ratio of chosen to rejected of the aligned model has higher quantiles than that ratio for the reference model. `loss_type="aot_pair"` ensures that the chosen reward is higher on all quantiles than the rejected reward. Note that in both cases quantiles are obtained via sorting. To fully leverage the advantages of the AOT algorithm, it is important to maximize the per-GPU batch size. |
 | `"apo_zero"` or `loss_type="apo_down"` | The [APO](https://huggingface.co/papers/2408.06266) method introduces an "anchored" version of the alignment objective. There are two variants: `apo_zero` and `apo_down`. The `apo_zero` loss increases the likelihood of winning outputs while decreasing the likelihood of losing outputs, making it suitable when the model is less performant than the winning outputs. On the other hand, `apo_down` decreases the likelihood of both winning and losing outputs, but with a stronger emphasis on reducing the likelihood of losing outputs. This variant is more effective when the model is better than the winning outputs. |
+| `"discopop"` | The [DiscoPOP](https://huggingface.co/papers/2406.08414) paper uses LLMs to discover more efficient offline preference optimization losses. In the paper, the proposed DiscoPOP loss (a log-ratio modulated loss) outperformed other optimization losses on several tasks (IMDb positive text generation, Reddit TLDR summarization, and Alpaca Eval 2.0). |
 
 ### Label smoothing
 
diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py
index 084138ea34..b52b86b166 100644
--- a/tests/test_dpo_trainer.py
+++ b/tests/test_dpo_trainer.py
@@ -196,6 +196,7 @@ def setUp(self):
             ["t5", "exo_pair", True],
             ["gpt2", "apo_zero", True],
             ["t5", "apo_down", False],
+            ["gpt2", "discopop", False],
         ]
     )
     def test_dpo_trainer(self, name, loss_type, pre_compute):
diff --git a/tests/test_trainers_args.py b/tests/test_trainers_args.py
index 55802e9a92..2f62e658a7 100644
--- a/tests/test_trainers_args.py
+++ b/tests/test_trainers_args.py
@@ -163,6 +163,7 @@ def test_dpo(self):
             ref_model_mixup_alpha=0.5,
             ref_model_sync_steps=32,
             rpo_alpha=0.5,
+            discopop_tau=0.1,
         )
         trainer = DPOTrainer(
             model="gpt2", ref_model="gpt2", args=training_args, train_dataset=dataset, processing_class=tokenizer
@@ -193,6 +194,7 @@ def test_dpo(self):
         self.assertEqual(trainer.args.ref_model_mixup_alpha, 0.5)
         self.assertEqual(trainer.args.ref_model_sync_steps, 32)
         self.assertEqual(trainer.args.rpo_alpha, 0.5)
+        self.assertEqual(trainer.args.discopop_tau, 0.1)
 
     def test_kto(self):
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py
index 3e653fa3f8..80254a2b14 100644
--- a/trl/trainer/dpo_config.py
+++ b/trl/trainer/dpo_config.py
@@ -63,6 +63,7 @@ class DPOConfig(TrainingArguments):
             - `"sppo_hard"`: SPPO loss with hard label from the [SPPO](https://huggingface.co/papers/2405.00675) paper.
             - `"aot"`: AOT loss for paired datasets from the [AOT](https://huggingface.co/papers/2406.05882) paper.
             - `"aot_pair"`: AOT loss for unpaired datasets from the [AOT](https://huggingface.co/papers/2406.05882) paper.
+            - `"discopop"`: DiscoPOP (a.k.a. Log-Ratio Modulated Loss, LRML) loss from the [DiscoPOP](https://huggingface.co/papers/2406.08414) paper.
             - `"apo_zero"`: APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper.
             - `"apo_down"`: APO-down loss from the [APO](https://huggingface.co/papers/2408.06266) paper.
         use_weighting (`bool`, *optional*, defaults to `False`):
@@ -132,6 +133,9 @@ class DPOConfig(TrainingArguments):
             α parameter from the [RPO](https://huggingface.co/papers/2404.19733) paper (v3), which controls the
             weighting of the NLL term in the loss. If `None`, no weighting is applied and the loss is the same as the
             DPO loss. The paper recommends `rpo_alpha=1.0`.
+        discopop_tau (`float`, *optional*, defaults to `0.05`):
+            τ/temperature parameter from the [DiscoPOP](https://huggingface.co/papers/2406.08414) paper, which controls
+            the shape of the log-ratio modulated loss. The paper recommends the default value `discopop_tau=0.05`.
         use_num_logits_to_keep (`bool`, *optional*, defaults to `False`):
             If `True`, only a specified number of logits are computed in the forward pass of CausalLM.
             This can be useful for saving memory and speeding up training by not computing the logits for all tokens, especially in scenarios
@@ -153,6 +157,7 @@ class DPOConfig(TrainingArguments):
         "sppo_hard",
         "aot",
         "aot_pair",
+        "discopop",
         "apo_zero",
         "apo_down",
     ] = "sigmoid"
@@ -181,6 +186,7 @@ class DPOConfig(TrainingArguments):
     ref_model_mixup_alpha: float = 0.9
     ref_model_sync_steps: int = 64
     rpo_alpha: Optional[float] = None
+    discopop_tau: float = 0.05
     use_num_logits_to_keep: bool = False
 
     def __post_init__(self):
diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 21382db312..c699bce2ee 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -1022,10 +1022,24 @@ def dpo_loss(
             losses_rejected = 1 - F.sigmoid(self.beta * (chosen_logratios - rejected_logratios))
             losses = losses_chosen + losses_rejected
 
+        elif self.loss_type == "discopop":
+            # Eqn (5) of the DiscoPOP paper (https://huggingface.co/papers/2406.08414)
+            # This loss was discovered through LLM-driven objective discovery
+            logratios = chosen_logps - rejected_logps
+            ref_logratios = ref_chosen_logps - ref_rejected_logps
+            logits = logratios - ref_logratios
+            logits = logits * self.beta
+            # Modulate the mixing coefficient based on the log ratio magnitudes
+            log_ratio_modulation = torch.sigmoid(logits / self.args.discopop_tau)
+            logistic_component = -F.logsigmoid(logits)
+            exp_component = torch.exp(-logits)
+            # Blend the logistic and exponential components based on the log-ratio modulation
+            losses = logistic_component * (1 - log_ratio_modulation) + exp_component * log_ratio_modulation
+
         else:
             raise ValueError(
                 f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'exo_pair', "
-                "'nca_pair', 'robust', 'bco_pair', 'sppo_hard', 'aot', 'aot_pair', 'apo_zero', 'apo_down']"
+                "'nca_pair', 'robust', 'bco_pair', 'sppo_hard', 'aot', 'aot_pair', 'discopop', 'apo_zero', 'apo_down']"
             )
 
         chosen_rewards = self.beta * (chosen_logps.to(device) - ref_chosen_logps.to(device)).detach()
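For reviewers who want to sanity-check the new branch in isolation, here is a minimal standalone sketch of the log-ratio modulated loss added above. The function name `lrml_loss` and the toy log-probabilities are illustrative only; the PR itself computes the same quantities inside `DPOTrainer.dpo_loss` using `self.beta` and `self.args.discopop_tau`.

```python
import torch
import torch.nn.functional as F


def lrml_loss(chosen_logps, rejected_logps, ref_chosen_logps, ref_rejected_logps, beta=0.1, tau=0.05):
    # Scaled implicit-reward margin between the policy and the reference model
    logits = beta * ((chosen_logps - rejected_logps) - (ref_chosen_logps - ref_rejected_logps))
    # Mixing weight driven by the size of the margin; tau controls how sharply it switches
    log_ratio_modulation = torch.sigmoid(logits / tau)
    logistic_component = -F.logsigmoid(logits)  # DPO-style logistic term
    exp_component = torch.exp(-logits)          # exponential term
    # Blend the two terms according to the modulation weight (Eqn (5) of the paper)
    return logistic_component * (1 - log_ratio_modulation) + exp_component * log_ratio_modulation


# Toy check: the loss decreases as the policy's preference margin over the reference grows
chosen_logps = torch.tensor([-10.0, -10.0])
rejected_logps = torch.tensor([-10.5, -14.0])
ref_logps = torch.tensor([-10.0, -10.0])
print(lrml_loss(chosen_logps, rejected_logps, ref_logps, ref_logps))
```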
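On the usage side, nothing beyond the two new knobs is required. A minimal training sketch, assuming a preference dataset with `prompt`/`chosen`/`rejected` columns (the dataset name and `output_dir` below are illustrative, not part of this PR):

```python
from datasets import load_dataset
from transformers import AutoTokenizer
from trl import DPOConfig, DPOTrainer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Any preference dataset with "prompt"/"chosen"/"rejected" columns; this name is illustrative
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

training_args = DPOConfig(
    output_dir="gpt2-discopop",  # illustrative output path
    loss_type="discopop",        # select the new log-ratio modulated loss
    discopop_tau=0.05,           # paper-recommended temperature (the new default)
)
trainer = DPOTrainer(
    model="gpt2",
    ref_model="gpt2",
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)
trainer.train()
```

A smaller `discopop_tau` makes the blend switch more sharply between the logistic and exponential components; the test above only checks that a non-default value (`0.1`) round-trips through `DPOConfig`.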