-
Notifications
You must be signed in to change notification settings - Fork 537
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Raise DatasetTooSmall exception if canonical nodes is less than num samples (#1518)
Co-authored-by: Saaketh Narayan <[email protected]> Co-authored-by: Daniel King <[email protected]>
- Loading branch information
1 parent
6d93260
commit 5465db4
Showing
4 changed files
with
69 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Copyright 2022 MosaicML LLM Foundry authors | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from contextlib import nullcontext | ||
from typing import Optional | ||
from unittest import mock | ||
|
||
import pytest | ||
|
||
from llmfoundry.data.finetuning.tasks import dataset_constructor | ||
from llmfoundry.utils.exceptions import DatasetTooSmallError | ||
|
||
|
||
@pytest.mark.parametrize('num_canonical_nodes', [None, 8, 2])
def test_finetuning_streaming_dataset_too_small(
    num_canonical_nodes: Optional[int],
):
    """Check when ``build_from_streaming`` rejects a tiny streaming dataset.

    The streaming dataset class and the ``dist`` module used by
    ``llmfoundry.data.finetuning.tasks`` are swapped for lightweight fakes.
    The fake dataset always reports 2 samples; the fake ``dist`` reports a
    world size of 32 and a local world size of 8.

    Expected outcome per parameter value:
        * ``2``: the call completes without raising.
        * ``None`` or ``8``: ``DatasetTooSmallError`` is raised.

    Args:
        num_canonical_nodes: Value exposed by the fake dataset's
            ``num_canonical_nodes`` attribute.
    """

    class _FakeStreamingDataset:
        """Stand-in dataset exposing only the two attributes set below."""

        def __init__(self):
            self.num_canonical_nodes = num_canonical_nodes
            self.num_samples = 2

    class _FakeDist:
        """Stand-in for the ``dist`` module with fixed world sizes."""

        def get_world_size(self):
            return 32

        def get_local_world_size(self):
            return 8

    # Only the case where the canonical node count equals 2 is expected to
    # succeed; every other parametrization must raise.
    if num_canonical_nodes == 2:
        expectation = nullcontext()
    else:
        expectation = pytest.raises(DatasetTooSmallError)

    # Context managers are entered left to right, matching the original
    # nesting: expectation first, then the dist patch, then the class patch.
    with expectation, mock.patch(
        'llmfoundry.data.finetuning.tasks.dist',
        new=_FakeDist(),
    ), mock.patch(
        'llmfoundry.data.finetuning.tasks.DatasetConstructor.streaming_dataset_class',
        new=_FakeStreamingDataset,
    ):
        dataset_constructor.build_from_streaming()