Commit 3c5498f

pc

dakinggg committed Jul 23, 2024
1 parent 73cd36f
Showing 2 changed files with 4 additions and 7 deletions.
llmfoundry/command_utils/data_prep/convert_text_to_mds.py (4 additions, 2 deletions)
@@ -21,8 +21,8 @@
 from tqdm import tqdm
 from transformers import AutoTokenizer, PreTrainedTokenizerBase

-from llmfoundry.utils.builders import build_tokenizer
 from llmfoundry.data.data import AbstractConcatTokensDataset
+from llmfoundry.utils.builders import build_tokenizer
 from llmfoundry.utils.data_prep_utils import (
     DownloadingIterable,
     download_file,
@@ -397,7 +397,9 @@ def convert_text_to_mds(
     """
     # Load the tokenizer once on the main process so that the files are cached to avoid race conditions
     # in the Hugging Face load code
-    AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=trust_remote_code)
+    AutoTokenizer.from_pretrained(
+        tokenizer_name, trust_remote_code=trust_remote_code
+    )

     is_remote_output = is_remote_path(output_folder)
     log.info(f'Output is remote: {is_remote_output}')
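Note: the second hunk only reflows a long call, but the comment it touches describes a real pattern: load the tokenizer once in the parent process so the Hugging Face cache is populated before any worker processes start, which avoids concurrent downloads of the same files. Below is a minimal sketch of that pattern, not the repository's code; convert, load_vocab_size, and num_workers are hypothetical names used only for illustration.

# Sketch: warm the Hugging Face cache once, then let workers read from it.
from multiprocessing import Pool

from transformers import AutoTokenizer


def load_vocab_size(tokenizer_name: str) -> int:
    # Each worker loads from the already-populated local cache; no concurrent download.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    return len(tokenizer)


def convert(tokenizer_name: str, num_workers: int = 4) -> None:
    # Download once up front so the cache is warm before workers start.
    AutoTokenizer.from_pretrained(tokenizer_name)

    with Pool(processes=num_workers) as pool:
        print(pool.map(load_vocab_size, [tokenizer_name] * num_workers))


if __name__ == '__main__':
    convert('gpt2')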
llmfoundry/utils/builders.py (0 additions, 5 deletions)
@@ -499,11 +499,6 @@ def build_tokenizer(

     signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup'

-    names = dist.all_gather_object(signal_file_path)
-    print("+"*30)
-    print(names)
-    print("+"*30)
-
     if dist.is_available() and dist.is_initialized(
     ) and dist.get_world_size() > 1:
         # Make sure the tokenizer files are downloaded and cached first by local rank 0
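Note: this hunk removes leftover debug prints (the all-gathered signal file paths) around the signal-file handshake in build_tokenizer, where local rank 0 downloads and caches the tokenizer files while the other ranks wait before loading from the cache. Below is a minimal sketch of that handshake, not llm-foundry's implementation; it assumes a torchrun-style LOCAL_RANK environment variable, and load_tokenizer_rank_zero_first and get_local_rank are hypothetical helpers.

# Sketch: local rank 0 downloads first and drops a marker file; other ranks wait for it.
import os
import time

from transformers import AutoTokenizer


def get_local_rank() -> int:
    # torchrun-style launchers export LOCAL_RANK for every process on a node.
    return int(os.environ.get('LOCAL_RANK', '0'))


def load_tokenizer_rank_zero_first(tokenizer_name: str, node_rank: int = 0):
    signal_file_path = f'.node_{node_rank}_local_rank0_completed_tokenizer_setup'

    if get_local_rank() == 0:
        # Only one process per node downloads; the files land in the shared cache.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        with open(signal_file_path, 'wb') as f:
            f.write(b'local_rank0_completed_tokenizer_setup')
    else:
        # The remaining ranks wait for the marker, then load from the warm cache.
        while not os.path.exists(signal_file_path):
            time.sleep(1)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    return tokenizer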
