diff --git a/paddlenlp/data/causal_dataset.py b/paddlenlp/data/causal_dataset.py
index c365aa67aad6..a27c07d8d2d7 100644
--- a/paddlenlp/data/causal_dataset.py
+++ b/paddlenlp/data/causal_dataset.py
@@ -297,10 +297,7 @@ def __init__(
         self.return_doc_ids = return_doc_ids
 
         # Build index mappings.
-        if need_data:
-            # Checks
-            if len(documents) == 0:
-                return
+        if need_data and len(documents) > 0:
             assert np.min(documents) >= 0
             assert np.max(documents) < indexed_dataset.sizes.shape[0]
 
@@ -328,7 +325,7 @@ def __init__(
             paddle.distributed.barrier()
 
         # Load mappings.
-        if need_data:
+        if need_data and len(documents) > 0:
             start_time = time.time()
             print_rank_0(f" > loading doc-idx mapping from {doc_idx_filename}")
             self.doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode="r")