Skip to content

Commit

Permalink
update
Browse files (browse the repository at this point in the history)
  • Loading branch information
XiaohanZhangCMU committed Oct 11, 2024
1 parent 3e50fbf commit adf6d95
Showing 1 changed file with 15 additions and 1 deletion.
16 changes: 15 additions & 1 deletion llmfoundry/data/text_data.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -323,12 +323,26 @@ def build_text_dataloader(
         if 'streams' in dataset_cfg else None,
     )

+    valid_streaming_text_dataset_parameters = inspect.signature(
+        StreamingTextDataset,
+    ).parameters
+
+    valid_base_dataset_params = inspect.signature(
+        StreamingDataset,
+    ).parameters
+
+    dataset_config_subset_for_streaming_text_dataset = {
+        k: v
+        for k, v in dataset_cfg.items()
+        if k in valid_streaming_text_dataset_parameters or k in valid_base_dataset_params
+    }
+
     # build dataset potentially with streams
     text_dataset = StreamingTextDataset(
         tokenizer=tokenizer,
         streams=streams,
         batch_size=dataset_batch_size,
-        **dataset_cfg,
+        **dataset_config_subset_for_streaming_text_dataset,
     )

     dataloader_cfg = {
Expand Down

0 comments on commit adf6d95

Please sign in to comment.