From 7b160fcca23ee5a5591704e22c76e4c24962e924 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Tue, 23 Jul 2024 14:11:24 -0700
Subject: [PATCH] Avoid race condition in convert text to mds script (#1390)

---
 llmfoundry/command_utils/data_prep/convert_text_to_mds.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
index 14afe279fd..336c82a5e7 100644
--- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -394,6 +394,13 @@ def convert_text_to_mds(
         reprocess (bool): Whether to always reprocess the given folder of text files
         trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
     """
+    # Load the tokenizer once on the main process so that the files are cached to avoid race conditions
+    # in the Hugging Face load code
+    AutoTokenizer.from_pretrained(
+        tokenizer_name,
+        trust_remote_code=trust_remote_code,
+    )
+
     is_remote_output = is_remote_path(output_folder)
     log.info(f'Output is remote: {is_remote_output}')
 