Commit 3c5498f

pc

dakinggg committed Jul 23, 2024
1 parent 73cd36f
Showing 2 changed files with 4 additions and 7 deletions.
llmfoundry/command_utils/data_prep/convert_text_to_mds.py (4 additions, 2 deletions)
@@ -21,8 +21,8 @@
 from tqdm import tqdm
 from transformers import AutoTokenizer, PreTrainedTokenizerBase

-from llmfoundry.utils.builders import build_tokenizer
 from llmfoundry.data.data import AbstractConcatTokensDataset
+from llmfoundry.utils.builders import build_tokenizer
 from llmfoundry.utils.data_prep_utils import (
     DownloadingIterable,
     download_file,
@@ -397,7 +397,9 @@ def convert_text_to_mds(
     """
     # Load the tokenizer once on the main process so that the files are cached to avoid race conditions
     # in the Hugging Face load code
-    AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=trust_remote_code)
+    AutoTokenizer.from_pretrained(
+        tokenizer_name, trust_remote_code=trust_remote_code
+    )

     is_remote_output = is_remote_path(output_folder)
     log.info(f'Output is remote: {is_remote_output}')
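Note: the second hunk only reflows a long call, but the comment it touches describes a real pattern: load the tokenizer once in the parent process so the Hugging Face cache is populated before any worker processes start, which avoids concurrent downloads of the same files. Below is a minimal sketch of that pattern, not the repository's code; convert, load_vocab_size, and num_workers are hypothetical names used only for illustration.

# Sketch: warm the Hugging Face cache once, then let workers read from it.
from multiprocessing import Pool

from transformers import AutoTokenizer


def load_vocab_size(tokenizer_name: str) -> int:
    # Each worker loads from the already-populated local cache; no concurrent download.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    return len(tokenizer)


def convert(tokenizer_name: str, num_workers: int = 4) -> None:
    # Download once up front so the cache is warm before workers start.
    AutoTokenizer.from_pretrained(tokenizer_name)

    with Pool(processes=num_workers) as pool:
        print(pool.map(load_vocab_size, [tokenizer_name] * num_workers))


if __name__ == '__main__':
    convert('gpt2')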
llmfoundry/utils/builders.py (0 additions, 5 deletions)
@@ -499,11 +499,6 @@ def build_tokenizer(

     signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup'

-    names = dist.all_gather_object(signal_file_path)
-    print("+"*30)
-    print(names)
-    print("+"*30)
-
     if dist.is_available() and dist.is_initialized(
     ) and dist.get_world_size() > 1:
         # Make sure the tokenizer files are downloaded and cached first by local rank 0
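Note: this hunk removes leftover debug prints (the all-gathered signal file paths) around the signal-file handshake in build_tokenizer, where local rank 0 downloads and caches the tokenizer files while the other ranks wait before loading from the cache. Below is a minimal sketch of that handshake, not llm-foundry's implementation; it assumes a torchrun-style LOCAL_RANK environment variable, and load_tokenizer_rank_zero_first and get_local_rank are hypothetical helpers.

# Sketch: local rank 0 downloads first and drops a marker file; other ranks wait for it.
import os
import time

from transformers import AutoTokenizer


def get_local_rank() -> int:
    # torchrun-style launchers export LOCAL_RANK for every process on a node.
    return int(os.environ.get('LOCAL_RANK', '0'))


def load_tokenizer_rank_zero_first(tokenizer_name: str, node_rank: int = 0):
    signal_file_path = f'.node_{node_rank}_local_rank0_completed_tokenizer_setup'

    if get_local_rank() == 0:
        # Only one process per node downloads; the files land in the shared cache.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        with open(signal_file_path, 'wb') as f:
            f.write(b'local_rank0_completed_tokenizer_setup')
    else:
        # The remaining ranks wait for the marker, then load from the warm cache.
        while not os.path.exists(signal_file_path):
            time.sleep(1)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    return tokenizer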
