diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 83bb415314..1de3115592 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -395,6 +395,8 @@ def convert_text_to_mds( reprocess (bool): Whether to always reprocess the given folder of text files trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer """ + # Load the tokenizer once on the main process so that the files are cached to avoid race conditions + # in the Hugging Face load code AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=trust_remote_code) is_remote_output = is_remote_path(output_folder)