Remove graph breaks for torch.compile() in padding free branch in DataCollatorForCompletionOnlyLM #2158

Open · wants to merge 25 commits into main

Commits
4472501  feat: Add info to batch in DataCollatorForCompletionOnlyLM (Abhishek-TAMU, Oct 2, 2024)
6cfa171  fix: formatting (Abhishek-TAMU, Oct 2, 2024)
a821ce0  feat: Add info to batch in DataCollatorForCompletionOnlyLM (Abhishek-TAMU, Oct 2, 2024)
fb669b6  fix: formatting (Abhishek-TAMU, Oct 2, 2024)
f4b1955  Merge branch 'huggingface:main' into collator_batch (Abhishek-TAMU, Oct 14, 2024)
1b7c060  Merge branch 'collator_batch' of github.com:Abhishek-TAMU/trl into co… (Abhishek-TAMU, Oct 21, 2024)
c3578f8  Merge branch 'main' into collator_batch (Abhishek-TAMU, Oct 21, 2024)
e83fc8a  fix: max_length_k to int (Abhishek-TAMU, Oct 21, 2024)
68554b1  fix: Added comments (Abhishek-TAMU, Oct 21, 2024)
2a7dd47  Merge remote-tracking branch 'trl/main' into collator_batch (Abhishek-TAMU, Oct 30, 2024)
b0a52e2  test cases (Abhishek-TAMU, Oct 30, 2024)
054a6ef  test cases (Abhishek-TAMU, Oct 30, 2024)
376ad21  test cases (Abhishek-TAMU, Oct 30, 2024)
9a08ea3  Merge remote-tracking branch 'trl/main' into collator_batch (Abhishek-TAMU, Nov 12, 2024)
a97045b  feat: Add info to batch in DataCollatorForCompletionOnlyLM (Abhishek-TAMU, Oct 2, 2024)
f31a780  fix: formatting (Abhishek-TAMU, Oct 2, 2024)
29ba8a3  feat: Add info to batch in DataCollatorForCompletionOnlyLM (Abhishek-TAMU, Oct 2, 2024)
d1441e1  test cases (Abhishek-TAMU, Oct 30, 2024)
d55a6e2  test cases (Abhishek-TAMU, Oct 30, 2024)
7dccc2d  test cases (Abhishek-TAMU, Oct 30, 2024)
5e5224e  unit test changes (Abhishek-TAMU, Nov 12, 2024)
1b434b0  unit test changes (Abhishek-TAMU, Nov 12, 2024)
ef1e304  Merge remote-tracking branch 'trl/main' into collator_batch (Abhishek-TAMU, Nov 18, 2024)
77894b1  style (qgallouedec, Nov 19, 2024)
911f60c  Merge branch 'main' into collator_batch (qgallouedec, Nov 19, 2024)
44 changes: 44 additions & 0 deletions tests/test_sft_trainer.py
@@ -654,6 +654,50 @@ def test_data_collator_completion_lm_with_multiple_text(self):
result_text = tokenizer.decode(batch["input_ids"][i, last_pad_idx + 1 :])
self.assertEqual(result_text, "I have not been masked correctly.")

def test_data_collator_completion_lm_without_padding(self):
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
Member: Does the issue only occur with a CUDA device? In other words, can we reproduce it on CPU?

Author: Due to the use of flash_attention_2, it would work only on GPU.
model_id = "trl-internal-testing/tiny-random-LlamaForCausalLM"
torch_dtype = getattr(torch, "bfloat16", None)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch_dtype, attn_implementation="flash_attention_2")
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

formatted_dataset = lambda example: {
"output": f"### prompt:\n{example['prompt'].strip()}\n\n### completion:\n{example['completion'].strip()}{tokenizer.eos_token}"
}
Member: Is dataset formatting required here, or can we drop it?

Author: Dataset formatting is required because SFTTrainer and DataCollatorForCompletionOnlyLM expect the dataset to have a specific format: a single text field that combines the prompt and the completion in a way the model can understand. This function includes both, ensuring the data collator can correctly identify where the completion starts using the response_template.

train_dataset = self.standard_prompt_completion_dataset["train"].map(formatted_dataset)

response_template = "### completion:\n"
data_collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer, padding_free=True)

with tempfile.TemporaryDirectory() as tmp_dir:
training_args = SFTConfig(
output_dir=tmp_dir,
dataloader_drop_last=True,
max_steps=2,
per_device_train_batch_size=2,
gradient_accumulation_steps=1,
save_steps=2,
learning_rate=1e-5,
dataset_text_field="output",
torch_compile=True,
torch_compile_backend="inductor",
torch_compile_mode="default"
)

trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=train_dataset,
data_collator=data_collator,
args=training_args,
)

trainer.train()
assert trainer.state.log_history[-1]["train_loss"] is not None
assert "model.safetensors" in os.listdir(tmp_dir + "/checkpoint-2")
del os.environ["CUDA_VISIBLE_DEVICES"]

def test_data_collator_chat_completion_lm(self):
instruction_template = "### Human:"
assistant_template = "### Assistant:"
19 changes: 19 additions & 0 deletions trl/trainer/utils.py
@@ -232,6 +232,25 @@ def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> D
batch["labels"] = batch["labels"][attn_mask.bool()].unsqueeze(0)
batch["labels"][batch["position_ids"] == 0] = self.ignore_index

# Calculate cumulative sequence lengths for queries and keys to prevent graph breaks during further computations.
flattened_position_ids = batch["position_ids"].flatten()
indices_q = torch.arange(
flattened_position_ids.size(0), device=flattened_position_ids.device, dtype=torch.int32
)
batch["cu_seq_lens_q"] = torch.cat(
(
indices_q[flattened_position_ids == 0],
torch.tensor(
flattened_position_ids.size(), device=flattened_position_ids.device, dtype=torch.int32
),
)
)
batch["cu_seq_lens_k"] = batch["cu_seq_lens_q"]

# Determine maximum sequence lengths to prevent graph breaks during further computations.
batch["max_length_k"] = flattened_position_ids.max().item() + 1
batch["max_length_q"] = batch["max_length_k"]

return batch
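To see what the added fields contain, here is a self-contained sketch on a toy flattened batch (illustrative values, not part of the diff):

import torch

# Two packed sequences of lengths 3 and 2, flattened into one row.
position_ids = torch.tensor([[0, 1, 2, 0, 1]])
flat = position_ids.flatten()
indices = torch.arange(flat.size(0), dtype=torch.int32)

# Sequence starts are where position_ids reset to 0; append the total length.
cu_seq_lens = torch.cat((indices[flat == 0], torch.tensor(flat.size(), dtype=torch.int32)))
print(cu_seq_lens)            # tensor([0, 3, 5], dtype=torch.int32)
print(flat.max().item() + 1)  # 3, the longest sequence in the batch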

