diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py index 2667407110..3d54da6057 100644 --- a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py +++ b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py @@ -451,6 +451,7 @@ def convert_dataset_hf_from_args( ValueError: If the output directory already contains the requested splits ValueError: If `concat_tokens` is set but `tokenizer` is not """ + os.environ['WORLD_SIZE'] = '1' if tokenizer_kwargs: parsed_tokenizer_kwargs = json.loads(tokenizer_kwargs) else: diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_json.py b/llmfoundry/command_utils/data_prep/convert_dataset_json.py index c6f7d51c02..918ce7e108 100644 --- a/llmfoundry/command_utils/data_prep/convert_dataset_json.py +++ b/llmfoundry/command_utils/data_prep/convert_dataset_json.py @@ -186,6 +186,7 @@ def convert_dataset_json_from_args( ValueError: If the out_root directory exists and contains files that overlap with the requested splits ValueError: If concat_tokens is set and a tokenizer is not provided """ + os.environ['WORLD_SIZE'] = '1' if os.path.isdir(out_root) and len( set(os.listdir(out_root)).intersection(set(split)), ) > 0: diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index 2321d306ff..000b3eebf2 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -767,6 +767,7 @@ def convert_delta_to_json_from_args( use_serverless (bool): Use serverless or not. Make sure the workspace is entitled with serverless json_output_filename (str): The name of the combined final jsonl that combines all partitioned jsonl """ + os.environ['WORLD_SIZE'] = '1' _check_imports() from databricks.sdk import WorkspaceClient w = WorkspaceClient() diff --git a/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py b/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py index bb1197de57..cbd1bd275d 100644 --- a/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py +++ b/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py @@ -309,6 +309,7 @@ def convert_finetuning_dataset_from_args( ValueError: If the target settings are invalid. ValueError: If the output directory already contains the requested splits. """ + os.environ['WORLD_SIZE'] = '1' if os.path.isdir(out_root) and len( set(os.listdir(out_root)).intersection(set(splits)), ) > 0: diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 3ea5aeb5d4..2ca0849f76 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -559,6 +559,7 @@ def convert_text_to_mds_from_args( Raises: ValueError: If `use_tokenizer_eos` is True and `eos_text` is not None """ + os.environ['WORLD_SIZE'] = '1' if use_tokenizer_eos: # Ensure that eos text is not specified twice. if eos_text is not None: diff --git a/scripts/data_prep/convert_dataset_hf.py b/scripts/data_prep/convert_dataset_hf.py index 2ab919ef29..3b893868b2 100644 --- a/scripts/data_prep/convert_dataset_hf.py +++ b/scripts/data_prep/convert_dataset_hf.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 """Streaming dataset conversion scripts for C4 and The Pile.""" -import os from argparse import ArgumentParser, Namespace from llmfoundry.command_utils import convert_dataset_hf_from_args @@ -49,8 +48,6 @@ def parse_args() -> Namespace: if __name__ == '__main__': args = parse_args() - # set `WORLD_SIZE` to fix https://github.com/mosaicml/llm-foundry/issues/1575 - os.environ['WORLD_SIZE'] = '1' convert_dataset_hf_from_args( dataset=args.dataset, data_subset=args.data_subset,