From eab4770632621238e96c9013227869020e3cd25c Mon Sep 17 00:00:00 2001 From: Lauler Date: Sun, 24 Nov 2024 14:29:31 +0100 Subject: [PATCH 1/2] Add shuffling for subsequent epochs when data is repeated --- src/nanotron/data/nanoset.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index 90200967..7974437f 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -111,14 +111,26 @@ def build_nanoset_index(self) -> np.ndarray: dataset_index, dataset_sample_index = build_nanoset_index_helper( n_samples=samples_per_epoch, weights=self.dataset_weights, dataset_sizes=self.dataset_lengths ) - # Shuffle the indexes the same way - numpy_random_state = np.random.RandomState(self.random_seed) - numpy_random_state.shuffle(dataset_index) - numpy_random_state = np.random.RandomState(self.random_seed) - numpy_random_state.shuffle(dataset_sample_index) - # Concatenate num_epochs the shuffled indexes - dataset_index = np.concatenate([dataset_index for _ in range(num_epochs)]) - dataset_sample_index = np.concatenate([dataset_sample_index for _ in range(num_epochs)]) + + # Shuffle indices in each epoch with different random seeds and concatenate them + r = np.random.RandomState(self.random_seed) + epoch_random_seeds = r.randint(0, 2**32 - 1, num_epochs) + dataset_indices = [] + dataset_sample_indices = [] + for i in range(num_epochs): + # Shuffle the sample and dataset indices in epoch with a given seed + numpy_random_state = np.random.RandomState(epoch_random_seeds[i]) + numpy_random_state.shuffle(dataset_index) + numpy_random_state = np.random.RandomState(epoch_random_seeds[i]) + numpy_random_state.shuffle(dataset_sample_index) + + dataset_indices.append(dataset_index) + dataset_sample_indices.append(dataset_sample_index) + + # Concatenate the within-epoch shuffled indices + dataset_index = np.concatenate(dataset_indices) + dataset_sample_index = np.concatenate(dataset_sample_indices) + # Just keep the necessary samples dataset_index = dataset_index[: self.train_split_num_samples] dataset_sample_index = dataset_sample_index[: self.train_split_num_samples] From f060414194e02c2e22e8fa0ffd8326363917a796 Mon Sep 17 00:00:00 2001 From: Lauler Date: Thu, 28 Nov 2024 09:04:17 +0100 Subject: [PATCH 2/2] Simplify random seed in epoch data for reproducibility --- src/nanotron/data/nanoset.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py index 7974437f..13c5f90b 100644 --- a/src/nanotron/data/nanoset.py +++ b/src/nanotron/data/nanoset.py @@ -113,15 +113,13 @@ def build_nanoset_index(self) -> np.ndarray: ) # Shuffle indices in each epoch with different random seeds and concatenate them - r = np.random.RandomState(self.random_seed) - epoch_random_seeds = r.randint(0, 2**32 - 1, num_epochs) dataset_indices = [] dataset_sample_indices = [] - for i in range(num_epochs): + for num_epoch in range(num_epochs): # Shuffle the sample and dataset indices in epoch with a given seed - numpy_random_state = np.random.RandomState(epoch_random_seeds[i]) + numpy_random_state = np.random.RandomState(self.random_seed + num_epoch) numpy_random_state.shuffle(dataset_index) - numpy_random_state = np.random.RandomState(epoch_random_seeds[i]) + numpy_random_state = np.random.RandomState(self.random_seed + num_epoch) numpy_random_state.shuffle(dataset_sample_index) dataset_indices.append(dataset_index)