Skip to content

Commit

Permalink
making allow_unsafe_types and replication configurable through dataset configs
Browse files Browse the repository at this point in the history
  • Loading branch information
ShashankMosaicML committed Apr 9, 2024
1 parent cc6bdf1 commit 4238d0d
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 4 deletions.
2 changes: 2 additions & 0 deletions llmfoundry/data/finetuning/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ def build_finetuning_dataloader(cfg: DictConfig,
sampling_granularity=cfg.dataset.get('sampling_granularity', 1),
batching_method=cfg.dataset.get('batching_method', 'random'),
max_seq_len=cfg.dataset.max_seq_len,
allow_unsafe_types=cfg.dataset.get('allow_unsafe_types', False),
replication=cfg.dataset.get('replication', None),
)

else:
Expand Down
6 changes: 4 additions & 2 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,8 @@ def __init__(self,
sampling_granularity: int = 1,
batching_method: str = 'random',
max_seq_len: int = 2048,
allow_unsafe_types: bool = False,
replication: Optional[int] = None,
**kwargs: Any):

if len(kwargs) > 0:
Expand Down Expand Up @@ -552,8 +554,8 @@ def __init__(self,
sampling_method=sampling_method,
sampling_granularity=sampling_granularity,
batching_method=batching_method,
allow_unsafe_types=False,
replication=None,
allow_unsafe_types=allow_unsafe_types,
replication=replication,
)

self.tokenizer = tokenizer
Expand Down
6 changes: 4 additions & 2 deletions llmfoundry/data/text_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ def __init__(self,
sampling_method: str = 'balanced',
sampling_granularity: int = 1,
batching_method: str = 'random',
allow_unsafe_types: bool = False,
replication: Optional[int] = None,
**kwargs: Any):

if len(kwargs) > 0:
Expand Down Expand Up @@ -151,8 +153,8 @@ def __init__(self,
sampling_method=sampling_method,
sampling_granularity=sampling_granularity,
batching_method=batching_method,
allow_unsafe_types=False,
replication=None,
allow_unsafe_types=allow_unsafe_types,
replication=replication,
)
self.tokenizer = tokenizer
self.max_seq_len = max_seq_len
Expand Down

0 comments on commit 4238d0d

Please sign in to comment.