From 2e5164948a74bae807feea8c20bb88935981b96a Mon Sep 17 00:00:00 2001 From: Nick Rossenbach Date: Thu, 21 Dec 2023 15:06:22 +0900 Subject: [PATCH] Avoid unnecessary re-init of control dataset ... when using multiple workers --- returnn/datasets/meta.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/returnn/datasets/meta.py b/returnn/datasets/meta.py index 501baecebb..43f9bbfb00 100644 --- a/returnn/datasets/meta.py +++ b/returnn/datasets/meta.py @@ -424,10 +424,16 @@ def get_seq_len(s): for dataset_key, dataset in self.datasets.items(): assert isinstance(dataset, Dataset) - if dataset is seq_order_dataset and not sharding_in_meta: - # only skip if we did not do sharding here, otherwise the sequence list - # of the control dataset needs to be rebuilt as well - continue + if dataset is seq_order_dataset: + if not sharding_in_meta: + # only skip if we did not do sharding here, otherwise the sequence list + # of the control dataset needs to be rebuilt as well + continue + else: + # we can do a faster init using the seq_order directly, + # as the seq_index directly relates to the seq_order_dataset + dataset.init_seq_order(epoch=epoch, seq_order=seq_index) + continue dataset.init_seq_order(epoch=epoch, seq_list=self.seq_list_ordered[dataset_key]) return True