Skip to content

Commit

Permalink
pretrain: fix with sample_packing=false (#1841)
Browse files Browse the repository at this point in the history
  • Loading branch information
tmm1 authored Aug 21, 2024
1 parent 5aac4bc commit 649c19a
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
4 changes: 2 additions & 2 deletions src/axolotl/utils/data/pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@


def encode_pretraining(
- tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: List[str]
+ tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: Dict[str, List]
) -> Dict[str, List]:
res = tokenizer(
- examples,
+ examples["text"],
truncation=True,
max_length=max_tokens - 2,
add_special_tokens=True,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_encode_pretraining(self):
"hello, hello",
]
}
- result = encode_pretraining(self.tokenizer, self.max_tokens, examples["text"])
+ result = encode_pretraining(self.tokenizer, self.max_tokens, examples)

self.assertEqual(len(result["input_ids"]), 3)

Expand Down

0 comments on commit 649c19a

Please sign in to comment.