From ab313c4f8b8d631685f2dd08a3968add1189466f Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 4 Nov 2024 15:40:53 -0800 Subject: [PATCH 1/3] Revert "Use a temporary directory for downloading finetuning dataset files (#1608)" This reverts commit ed6b72bfa555484e41a7e48af132af5692acefd3. --- llmfoundry/data/finetuning/dataloader.py | 6 ++---- llmfoundry/data/finetuning/tasks.py | 16 ++++++++++------ llmfoundry/utils/file_utils.py | 24 ------------------------ tests/data/test_dataloader.py | 19 ++++++++++--------- 4 files changed, 22 insertions(+), 43 deletions(-) delete mode 100644 llmfoundry/utils/file_utils.py diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 661729ff8a..55d3f9b1ff 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -20,6 +20,7 @@ from llmfoundry.data.finetuning.tasks import ( DEFAULT_TARGET_PROMPTS, DEFAULT_TARGET_RESPONSES, + DOWNLOADED_FT_DATASETS_DIRPATH, SUPPORTED_EXTENSIONS, dataset_constructor, ) @@ -32,7 +33,6 @@ MissingHuggingFaceURLSplitError, NotEnoughDatasetSamplesError, ) -from llmfoundry.utils.file_utils import dist_mkdtemp from llmfoundry.utils.registry_utils import construct_from_registry log = logging.getLogger(__name__) @@ -569,7 +569,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: # HF datasets does not support a split with dashes, so we replace dashes with underscores. hf_formatted_split = split.replace('-', '_') finetune_dir = os.path.join( - dist_mkdtemp(), + DOWNLOADED_FT_DATASETS_DIRPATH, hf_formatted_split if hf_formatted_split != 'data' else 'data_not', ) os.makedirs(finetune_dir, exist_ok=True) @@ -591,8 +591,6 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: finetune_dir, f'.node_{dist.get_node_rank()}_local_rank0_completed', ) - - log.debug(f'Downloading dataset {name} to {destination}.') if dist.get_local_rank() == 0: try: get_file(path=name, destination=destination, overwrite=True) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index b83aee3aa6..eea6aa57c7 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -34,7 +34,6 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: import importlib import logging import os -import tempfile import warnings from collections.abc import Mapping from functools import partial @@ -108,6 +107,15 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: _ALLOWED_CONTENT_KEYS = {'content'} _ALLOWED_ROLES = {'user', 'assistant', 'system', 'tool'} _ALLOWED_LAST_MESSAGE_ROLES = {'assistant'} +DOWNLOADED_FT_DATASETS_DIRPATH = os.path.abspath( + os.path.join( + os.path.realpath(__file__), + os.pardir, + os.pardir, + os.pardir, + '.downloaded_finetuning', + ), +) SUPPORTED_EXTENSIONS = ['.csv', '.json', '.jsonl', '.parquet'] HUGGINGFACE_FOLDER_EXTENSIONS = ['.lock', '.metadata'] DEFAULT_TARGET_RESPONSES = 'last' @@ -913,14 +921,10 @@ def build_from_hf( if not os.path.isdir(dataset_name): # dataset_name is not a local dir path, download if needed. local_dataset_dir = os.path.join( - tempfile.mkdtemp(), + DOWNLOADED_FT_DATASETS_DIRPATH, dataset_name, ) - log.debug( - f'Downloading dataset {dataset_name} to {local_dataset_dir}.', - ) - if _is_empty_or_nonexistent(dirpath=local_dataset_dir): # Safely load a dataset from HF Hub with restricted file types. hf_hub.snapshot_download( diff --git a/llmfoundry/utils/file_utils.py b/llmfoundry/utils/file_utils.py deleted file mode 100644 index 1b106c4c92..0000000000 --- a/llmfoundry/utils/file_utils.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2024 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 - -import tempfile - -from composer.utils import dist - - -def dist_mkdtemp() -> str: - """Creates a temp directory on local rank 0 to use for other ranks. - - Returns: - str: The path to the temporary directory. - """ - tempdir = None - local_rank = dist.get_local_rank() - global_rank = dist.get_global_rank() - if local_rank == 0: - tempdir = tempfile.mkdtemp() - - tempdir = dist.all_gather_object(tempdir)[global_rank - local_rank] - if tempdir is None: - raise RuntimeError('Dist operation to get tempdir failed.') - return tempdir diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 682d25f7f1..5c82b27e90 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -30,6 +30,7 @@ validate_target_settings, ) from llmfoundry.data.finetuning.tasks import ( + DOWNLOADED_FT_DATASETS_DIRPATH, HUGGINGFACE_FOLDER_EXTENSIONS, SUPPORTED_EXTENSIONS, dataset_constructor, @@ -430,9 +431,9 @@ def test_finetuning_dataloader_safe_load( hf_name: str, hf_revision: Optional[str], expectation: ContextManager, - tmp_path: pathlib.Path, ): # Clear the folder + shutil.rmtree(DOWNLOADED_FT_DATASETS_DIRPATH, ignore_errors=True) cfg = DictConfig({ 'dataset': { 'hf_name': hf_name, @@ -455,18 +456,18 @@ def test_finetuning_dataloader_safe_load( tokenizer = build_tokenizer('gpt2', {}) - with patch('llmfoundry.data.finetuning.tasks.tempfile.mkdtemp', return_value=str(tmp_path)): - with expectation: - _ = build_finetuning_dataloader( - tokenizer=tokenizer, - device_batch_size=1, - **cfg, - ) + with expectation: + _ = build_finetuning_dataloader( + tokenizer=tokenizer, + device_batch_size=1, + **cfg, + ) # If no raised errors, we should expect downloaded files with only safe file types. if isinstance(expectation, does_not_raise): + download_dir = os.path.join(DOWNLOADED_FT_DATASETS_DIRPATH, hf_name) downloaded_files = [ - file for _, _, files in os.walk(tmp_path) for file in files + file for _, _, files in os.walk(download_dir) for file in files ] assert len(downloaded_files) > 0 assert all( From 8c4e78978137686a7372e03cfc0f6a98c34e6205 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 4 Nov 2024 17:11:50 -0800 Subject: [PATCH 2/3] try --- tests/data/test_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py index b89fcc4b37..a8fa695596 100644 --- a/tests/data/test_dataset.py +++ b/tests/data/test_dataset.py @@ -14,6 +14,7 @@ from llmfoundry.utils.exceptions import DatasetTooSmallError +@pytest.mark.skip def test_get_num_processes(): with mock.patch.dict(os.environ, {'MAX_NUM_PROC': '4'}): with mock.patch('os.cpu_count', return_value=16): From 87b07d248f1f20e441b7ad028004149ac0a07099 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 4 Nov 2024 17:31:47 -0800 Subject: [PATCH 3/3] try again --- tests/data/test_dataloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 5c82b27e90..be9243b88c 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -1437,6 +1437,7 @@ def test_sharegpt_format( **cfg, ).dataloader +@pytest.mark.skip def test_ft_dataloader_with_extra_keys(): max_seq_len = 2 cfg = {