From a50ae9ec01f241b5fb5135f0e1c86c458c10507a Mon Sep 17 00:00:00 2001
From: Manuel Burger <manuel.burger@inf.ethz.ch>
Date: Wed, 6 Nov 2024 12:08:40 +0100
Subject: [PATCH] Adapt continuation under mp

---
 src/nanotron/data/petagraph_dataset.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/nanotron/data/petagraph_dataset.py b/src/nanotron/data/petagraph_dataset.py
index 735e50ef..40f73f5b 100644
--- a/src/nanotron/data/petagraph_dataset.py
+++ b/src/nanotron/data/petagraph_dataset.py
@@ -618,7 +618,11 @@ def __init__(self,
             restart_consumed_files_set = set(restart_consumed_files)
             for f in restart_consumed_files_set:
                 url_list.remove(f)
-            url_list.extend(restart_consumed_files)
+            
+            # For now we don't append the consumed files to the end of the url_list
+            # As in multiprocessing setting we index into arbirary positions
+            # and we don't want to index into the consumed files
+            # url_list.extend(restart_consumed_files)
 
             # Add the consumed files to the consumed files set
             self.consumed_files = set(restart_consumed_files)
@@ -626,7 +630,7 @@ def __init__(self,
             # Set the current epoch to the restart epoch
             self.current_epoch = restart_epoch
 
-            log_msg = f"[PetaGAdd lockaphStreamDataset:{self.rank}] Restarting from epoch {self.current_epoch} with {len(self.consumed_files)} files"
+            log_msg = f"[PetaGraphStreamDataset:{self.rank}] Restarting from epoch {self.current_epoch} with {len(self.consumed_files)} files, {len(url_list)} files left"
             log_rank(log_msg, logger=logger, level=logging.INFO, rank=self.rank)
         else:
             self.consumed_files = set()