set worldsize to 1
v-chen_data committed Oct 17, 2024
1 parent 73f31c9 commit a43feb1
Showing 6 changed files with 5 additions and 3 deletions.
llmfoundry/command_utils/data_prep/convert_dataset_hf.py (1 addition, 0 deletions)
@@ -451,6 +451,7 @@ def convert_dataset_hf_from_args(
        ValueError: If the output directory already contains the requested splits
        ValueError: If `concat_tokens` is set but `tokenizer` is not
    """
+    os.environ['WORLD_SIZE'] = '1'
    if tokenizer_kwargs:
        parsed_tokenizer_kwargs = json.loads(tokenizer_kwargs)
    else:
llmfoundry/command_utils/data_prep/convert_dataset_json.py (1 addition, 0 deletions)
@@ -186,6 +186,7 @@ def convert_dataset_json_from_args(
        ValueError: If the out_root directory exists and contains files that overlap with the requested splits
        ValueError: If concat_tokens is set and a tokenizer is not provided
    """
+    os.environ['WORLD_SIZE'] = '1'
    if os.path.isdir(out_root) and len(
        set(os.listdir(out_root)).intersection(set(split)),
    ) > 0:
@@ -767,6 +767,7 @@ def convert_delta_to_json_from_args(
        use_serverless (bool): Use serverless or not. Make sure the workspace is entitled with serverless
        json_output_filename (str): The name of the combined final jsonl that combines all partitioned jsonl
    """
+    os.environ['WORLD_SIZE'] = '1'
    _check_imports()
    from databricks.sdk import WorkspaceClient
    w = WorkspaceClient()
@@ -309,6 +309,7 @@ def convert_finetuning_dataset_from_args(
        ValueError: If the target settings are invalid.
        ValueError: If the output directory already contains the requested splits.
    """
+    os.environ['WORLD_SIZE'] = '1'
    if os.path.isdir(out_root) and len(
        set(os.listdir(out_root)).intersection(set(splits)),
    ) > 0:
llmfoundry/command_utils/data_prep/convert_text_to_mds.py (1 addition, 0 deletions)
@@ -559,6 +559,7 @@ def convert_text_to_mds_from_args(
    Raises:
        ValueError: If `use_tokenizer_eos` is True and `eos_text` is not None
    """
+    os.environ['WORLD_SIZE'] = '1'
    if use_tokenizer_eos:
        # Ensure that eos text is not specified twice.
        if eos_text is not None:
scripts/data_prep/convert_dataset_hf.py (0 additions, 3 deletions)
@@ -2,7 +2,6 @@
# SPDX-License-Identifier: Apache-2.0

"""Streaming dataset conversion scripts for C4 and The Pile."""
-import os
from argparse import ArgumentParser, Namespace

from llmfoundry.command_utils import convert_dataset_hf_from_args
@@ -49,8 +48,6 @@ def parse_args() -> Namespace:

if __name__ == '__main__':
    args = parse_args()
-    # set `WORLD_SIZE` to fix https://github.com/mosaicml/llm-foundry/issues/1575
-    os.environ['WORLD_SIZE'] = '1'
    convert_dataset_hf_from_args(
        dataset=args.dataset,
        data_subset=args.data_subset,
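The common thread across the five library entry points is that `WORLD_SIZE` is now pinned to `'1'` inside each `*_from_args` function, rather than in the calling script (the removed lines in scripts/data_prep/convert_dataset_hf.py reference https://github.com/mosaicml/llm-foundry/issues/1575). A minimal sketch of why this matters; the `_infer_world_size` helper below is hypothetical, standing in for how distributed-aware libraries typically read the launcher environment:

import os

def _infer_world_size() -> int:
    # Hypothetical helper: distributed-aware libraries commonly infer the
    # number of ranks from the WORLD_SIZE environment variable.
    return int(os.environ.get('WORLD_SIZE', '1'))

# A standalone data-prep run has no multi-process launcher, but a stale
# WORLD_SIZE inherited from a training environment would make the converter
# behave as if it were one rank of many. Pinning it inside the entry point
# keeps every caller on the single-process path:
os.environ['WORLD_SIZE'] = '1'
assert _infer_world_size() == 1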
