diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
index 1de3115592..6d11a4b2b9 100644
--- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -21,8 +21,8 @@
 from tqdm import tqdm
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
-from llmfoundry.utils.builders import build_tokenizer
 from llmfoundry.data.data import AbstractConcatTokensDataset
+from llmfoundry.utils.builders import build_tokenizer
 from llmfoundry.utils.data_prep_utils import (
     DownloadingIterable,
     download_file,
@@ -397,7 +397,9 @@ def convert_text_to_mds(
     """
     # Load the tokenizer once on the main process so that the files are cached to avoid race conditions
     # in the Hugging Face load code
-    AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=trust_remote_code)
+    AutoTokenizer.from_pretrained(
+        tokenizer_name, trust_remote_code=trust_remote_code
+    )
 
     is_remote_output = is_remote_path(output_folder)
     log.info(f'Output is remote: {is_remote_output}')
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index de9ef4b05c..9f18c31ec6 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -499,11 +499,6 @@ def build_tokenizer(
 
     signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup'
 
-    names = dist.all_gather_object(signal_file_path)
-    print("+"*30)
-    print(names)
-    print("+"*30)
-
     if dist.is_available() and dist.is_initialized(
     ) and dist.get_world_size() > 1:
         # Make sure the tokenizer files are downloaded and cached first by local rank 0
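
Reviewer note: the builders.py hunk deletes leftover debug prints (the all_gather_object call and the surrounding print statements) around the signal-file handshake that build_tokenizer uses so that local rank 0 downloads and caches the tokenizer files before the other ranks touch them. For readers unfamiliar with that pattern, here is a minimal sketch of a rank-0-first load, written against plain torch.distributed with a collective barrier instead of llmfoundry's signal file; the function name load_tokenizer_rank0_first and the reliance on the LOCAL_RANK environment variable are illustrative assumptions, not the library's API.

# A minimal sketch (not the llmfoundry implementation) of the
# "local rank 0 downloads first" pattern that build_tokenizer relies on.
# Assumptions: torch.distributed has been initialized by the launcher,
# and LOCAL_RANK is set in the environment (as torchrun does).
import os

import torch.distributed as dist
from transformers import AutoTokenizer


def load_tokenizer_rank0_first(tokenizer_name: str, trust_remote_code: bool = False):
    multi_process = (
        dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1
    )
    local_rank = int(os.environ.get('LOCAL_RANK', 0))

    if multi_process and local_rank != 0:
        # Non-zero local ranks wait here until each node's local rank 0 has
        # populated the Hugging Face cache, avoiding concurrent-download races.
        dist.barrier()

    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name, trust_remote_code=trust_remote_code
    )

    if multi_process and local_rank == 0:
        # Local rank 0 reaches the barrier only after the cache is warm,
        # releasing the waiting ranks to load from the local cache.
        dist.barrier()

    return tokenizer

llmfoundry itself signals completion through a per-node file (note the .node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup path in the diff) rather than a bare barrier, presumably so the wait can be scoped to each node instead of requiring every rank to reach the same collective call.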