diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 54e663ebe3..8b7b3dae6a 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -44,6 +44,7 @@ def test_load_hub(self):
             prepared_path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 1024,
                     "datasets": [
                         {
@@ -80,6 +81,7 @@ def test_load_local_hub(self):
             # how to load it.
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 1024,
                     "datasets": [
                         {
@@ -113,6 +115,7 @@ def test_load_from_save_to_disk(self):
             prepared_path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -143,6 +146,7 @@ def test_load_from_dir_of_parquet(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -178,6 +182,7 @@ def test_load_from_dir_of_json(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -211,6 +216,7 @@ def test_load_from_single_parquet(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -240,6 +246,7 @@ def test_load_from_single_json(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {