From adf6d95897b2a36c554879274d77e23a0feb609a Mon Sep 17 00:00:00 2001 From: xiaohanzhangcmu Date: Fri, 11 Oct 2024 15:33:24 -0700 Subject: [PATCH] update --- llmfoundry/data/text_data.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 4476ad7304..8e81ab5e96 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -323,12 +323,26 @@ def build_text_dataloader( if 'streams' in dataset_cfg else None, ) + valid_streaming_text_dataset_parameters = inspect.signature( + StreamingTextDataset, + ).parameters + + valid_base_dataset_params = inspect.signature( + StreamingDataset, + ).parameters + + dataset_config_subset_for_streaming_text_dataset = { + k: v + for k, v in dataset_cfg.items() + if k in valid_streaming_text_dataset_parameters or k in valid_base_dataset_params + } + # build dataset potentially with streams text_dataset = StreamingTextDataset( tokenizer=tokenizer, streams=streams, batch_size=dataset_batch_size, - **dataset_cfg, + **dataset_config_subset_for_streaming_text_dataset, ) dataloader_cfg = {