From a50ae9ec01f241b5fb5135f0e1c86c458c10507a Mon Sep 17 00:00:00 2001 From: Manuel Burger Date: Wed, 6 Nov 2024 12:08:40 +0100 Subject: [PATCH] Adapt continuation under mp --- src/nanotron/data/petagraph_dataset.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/nanotron/data/petagraph_dataset.py b/src/nanotron/data/petagraph_dataset.py index 735e50ef..40f73f5b 100644 --- a/src/nanotron/data/petagraph_dataset.py +++ b/src/nanotron/data/petagraph_dataset.py @@ -618,7 +618,11 @@ def __init__(self, restart_consumed_files_set = set(restart_consumed_files) for f in restart_consumed_files_set: url_list.remove(f) - url_list.extend(restart_consumed_files) + + # For now we don't append the consumed files to the end of the url_list + # As in multiprocessing setting we index into arbirary positions + # and we don't want to index into the consumed files + # url_list.extend(restart_consumed_files) # Add the consumed files to the consumed files set self.consumed_files = set(restart_consumed_files) @@ -626,7 +630,7 @@ def __init__(self, # Set the current epoch to the restart epoch self.current_epoch = restart_epoch - log_msg = f"[PetaGAdd lockaphStreamDataset:{self.rank}] Restarting from epoch {self.current_epoch} with {len(self.consumed_files)} files" + log_msg = f"[PetaGraphStreamDataset:{self.rank}] Restarting from epoch {self.current_epoch} with {len(self.consumed_files)} files, {len(url_list)} files left" log_rank(log_msg, logger=logger, level=logging.INFO, rank=self.rank) else: self.consumed_files = set()