From a43feb1143f897deb0d755811cdd122340766e5f Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Thu, 17 Oct 2024 12:50:07 -0700
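Subject: [PATCH] Set WORLD_SIZE to 1 in data prep conversion entry points

Move the `WORLD_SIZE=1` workaround for
https://github.com/mosaicml/llm-foundry/issues/1575 out of
scripts/data_prep/convert_dataset_hf.py and into the `*_from_args`
functions under llmfoundry/command_utils/data_prep.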

---
 llmfoundry/command_utils/data_prep/convert_dataset_hf.py       | 1 +
 llmfoundry/command_utils/data_prep/convert_dataset_json.py     | 1 +
 llmfoundry/command_utils/data_prep/convert_delta_to_json.py    | 1 +
 .../command_utils/data_prep/convert_finetuning_dataset.py      | 1 +
 llmfoundry/command_utils/data_prep/convert_text_to_mds.py      | 1 +
 scripts/data_prep/convert_dataset_hf.py                        | 3 ---
 6 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
index 2667407110..3d54da6057 100644
--- a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
+++ b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
@@ -451,6 +451,7 @@ def convert_dataset_hf_from_args(
         ValueError: If the output directory already contains the requested splits
         ValueError: If `concat_tokens` is set but `tokenizer` is not
     """
+    os.environ['WORLD_SIZE'] = '1'
     if tokenizer_kwargs:
         parsed_tokenizer_kwargs = json.loads(tokenizer_kwargs)
     else:
diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_json.py b/llmfoundry/command_utils/data_prep/convert_dataset_json.py
index c6f7d51c02..918ce7e108 100644
--- a/llmfoundry/command_utils/data_prep/convert_dataset_json.py
+++ b/llmfoundry/command_utils/data_prep/convert_dataset_json.py
@@ -186,6 +186,7 @@ def convert_dataset_json_from_args(
         ValueError: If the out_root directory exists and contains files that overlap with the requested splits
         ValueError: If concat_tokens is set and a tokenizer is not provided
     """
+    os.environ['WORLD_SIZE'] = '1'
     if os.path.isdir(out_root) and len(
         set(os.listdir(out_root)).intersection(set(split)),
     ) > 0:
diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
index 2321d306ff..000b3eebf2 100644
--- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
+++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -767,6 +767,7 @@ def convert_delta_to_json_from_args(
         use_serverless (bool): Use serverless or not. Make sure the workspace is entitled with serverless
         json_output_filename (str): The name of the combined final jsonl that combines all partitioned jsonl
     """
+    os.environ['WORLD_SIZE'] = '1'
     _check_imports()
     from databricks.sdk import WorkspaceClient
     w = WorkspaceClient()
diff --git a/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py b/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py
index bb1197de57..cbd1bd275d 100644
--- a/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py
+++ b/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py
@@ -309,6 +309,7 @@ def convert_finetuning_dataset_from_args(
         ValueError: If the target settings are invalid.
         ValueError: If the output directory already contains the requested splits.
     """
+    os.environ['WORLD_SIZE'] = '1'
     if os.path.isdir(out_root) and len(
         set(os.listdir(out_root)).intersection(set(splits)),
     ) > 0:
diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
index 3ea5aeb5d4..2ca0849f76 100644
--- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -559,6 +559,7 @@ def convert_text_to_mds_from_args(
     Raises:
         ValueError: If `use_tokenizer_eos` is True and `eos_text` is not None
     """
+    os.environ['WORLD_SIZE'] = '1'
     if use_tokenizer_eos:
         # Ensure that eos text is not specified twice.
         if eos_text is not None:
diff --git a/scripts/data_prep/convert_dataset_hf.py b/scripts/data_prep/convert_dataset_hf.py
index 2ab919ef29..3b893868b2 100644
--- a/scripts/data_prep/convert_dataset_hf.py
+++ b/scripts/data_prep/convert_dataset_hf.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 """Streaming dataset conversion scripts for C4 and The Pile."""
-import os
 from argparse import ArgumentParser, Namespace
 
 from llmfoundry.command_utils import convert_dataset_hf_from_args
@@ -49,8 +48,6 @@ def parse_args() -> Namespace:
 
 if __name__ == '__main__':
     args = parse_args()
-    # set `WORLD_SIZE` to fix https://github.com/mosaicml/llm-foundry/issues/1575
-    os.environ['WORLD_SIZE'] = '1'
     convert_dataset_hf_from_args(
         dataset=args.dataset,
         data_subset=args.data_subset,