From 73cd36f465d15a69f15dfa4fdb454abb8e3ab93c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 23 Jul 2024 13:26:26 -0700 Subject: [PATCH] add comment --- llmfoundry/command_utils/data_prep/convert_text_to_mds.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 83bb415314..1de3115592 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -395,6 +395,8 @@ def convert_text_to_mds( reprocess (bool): Whether to always reprocess the given folder of text files trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer """ + # Load the tokenizer once on the main process so that the files are cached to avoid race conditions + # in the Hugging Face load code AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=trust_remote_code) is_remote_output = is_remote_path(output_folder)