diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 83cd1c900d..3673a48217 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -335,6 +335,9 @@ def build_from_hf( signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_data_prep_completed' + # Non local rank 0 ranks will wait here for local rank 0 to finish the data processing. + # Once local rank 0 is done, the datasets are all cached on disk, and all other ranks + # can just read them. if dist.get_local_rank() != 0: log.debug('Waiting for local_rank 0 to finish data prep') with dist.local_rank_zero_download_and_wait(signal_file_path): @@ -389,13 +392,16 @@ def dataset_mapper(example: Dict): f'Dropped {empty_examples_removed} examples where the prompt or response was empty, ' + 'or the response was only padding tokens.') + # Now local rank 0 indicates to the other ranks that it is done if dist.get_local_rank() == 0: log.debug('Local rank 0 finished data prep') with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_data_prep') + # All ranks sync up at this barrier, having completed data processing dist.barrier() + # Last, local rank 0 cleans up the signal file if dist.get_local_rank() == 0: os.remove(signal_file_path)