pageserver: coalesce index uploads when possible (#10248)
## Problem

With upload queue reordering in #10218, we can easily get into a
situation where multiple index uploads are queued back to back, and
these can't be parallelized. This happens e.g. when multiple layer
flushes enqueue layer/index/layer/index/... and the layers skip the
queue and upload in parallel, leaving the indexes stacked behind each
other.

These index uploads then incur serial S3 round-trip latencies, and may
block later operations.

Touches #10096.

## Summary of changes

When multiple back-to-back index uploads are ready to upload, upload
only the most recent index and drop the rest, recording the dropped ops
on the task so their metrics are still closed out.
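
As a rough sketch of the rule (illustrative only — the `Op` enum and free-standing `next_ready` below are stand-ins, not the pageserver's real `UploadOp`/`UploadQueue` API): when the next ready operation is an index upload, drain any index uploads queued directly behind it and submit only the newest one, keeping the superseded ops around so their completion can still be accounted for.

```rust
use std::collections::VecDeque;

#[derive(Debug, Clone, PartialEq)]
enum Op {
    UploadLayer(&'static str),
    UploadIndex(u64), // stand-in for UploadOp::UploadMetadata
}

/// Pop the next op; if it's an index upload, coalesce any index uploads
/// queued directly behind it and return them alongside the winner.
fn next_ready(queue: &mut VecDeque<Op>) -> Option<(Op, Vec<Op>)> {
    let mut op = queue.pop_front()?;
    let mut coalesced = Vec::new();
    if matches!(op, Op::UploadIndex(_)) {
        while matches!(queue.front(), Some(Op::UploadIndex(_))) {
            // The index behind `op` is newer and supersedes it: stash the
            // superseded op and promote the newer index.
            coalesced.push(op);
            op = queue.pop_front().unwrap();
        }
    }
    Some((op, coalesced))
}

fn main() {
    // Three back-to-back index uploads, then a layer upload.
    let mut queue = VecDeque::from([
        Op::UploadIndex(1),
        Op::UploadIndex(2),
        Op::UploadIndex(3),
        Op::UploadLayer("layer-a"),
    ]);
    let (op, coalesced) = next_ready(&mut queue).unwrap();
    assert_eq!(op, Op::UploadIndex(3)); // only the newest index is uploaded
    assert_eq!(coalesced, [Op::UploadIndex(1), Op::UploadIndex(2)]);
    assert_eq!(queue.front(), Some(&Op::UploadLayer("layer-a")));
}
```

The real `UploadQueueInitialized::next_ready()` additionally checks `is_ready()` at every step, so an index never coalesces past an unready or conflicting operation, and it honors the `DISABLE_UPLOAD_QUEUE_INDEX_COALESCING` kill switch.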
erikgrinaker authored Jan 14, 2025
1 parent e58e29e commit 6debb49
Showing 3 changed files with 67 additions and 26 deletions.
1 change: 1 addition & 0 deletions pageserver/benches/upload_queue.rs
@@ -67,6 +67,7 @@ fn bench_upload_queue_next_ready(c: &mut Criterion) {
task_id,
retries: AtomicU32::new(0),
op: delete.clone(),
coalesced_ops: Vec::new(),
}),
);
}
6 changes: 5 additions & 1 deletion pageserver/src/tenant/remote_timeline_client.rs
@@ -1880,7 +1880,7 @@ impl RemoteTimelineClient {
/// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
/// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
while let Some(mut next_op) = upload_queue.next_ready() {
while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
debug!("starting op: {next_op}");

// Prepare upload.
@@ -1918,6 +1918,7 @@
let task = Arc::new(UploadTask {
task_id: upload_task_id,
op: next_op,
coalesced_ops,
retries: AtomicU32::new(0),
});
upload_queue
@@ -2285,6 +2286,9 @@ impl RemoteTimelineClient {
}

self.metric_end(&task.op);
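// Also close out metrics for any ops that were coalesced into this
// task, since they never ran (or reached this path) on their own.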
for coalesced_op in &task.coalesced_ops {
self.metric_end(coalesced_op);
}
}

fn metric_impl(
86 changes: 61 additions & 25 deletions pageserver/src/tenant/upload_queue.rs
@@ -22,6 +22,11 @@ use tracing::info;
static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy<bool> =
Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true"));

/// Kill switch for index upload coalescing in case it causes problems.
/// TODO: remove this once we have confidence in it.
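/// NB: `Lazy` reads the env var once, on first use, so toggling the switch requires a restart.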
static DISABLE_UPLOAD_QUEUE_INDEX_COALESCING: Lazy<bool> =
Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_INDEX_COALESCING").as_deref() == Ok("true"));

// clippy warns that Uninitialized is much smaller than Initialized, which wastes
// memory for Uninitialized variants. Doesn't matter in practice, there are not
// that many upload queues in a running pageserver, and most of them are initialized
@@ -130,10 +135,12 @@ impl UploadQueueInitialized {
/// the first operation in the queue, to avoid head-of-line blocking -- an operation can jump
/// the queue if it doesn't conflict with operations ahead of it.
///
/// Also returns any operations that were coalesced into this one, e.g. multiple index uploads.
///
/// None may be returned even if the queue isn't empty, if no operations are ready yet.
///
/// NB: this is quadratic, but queues are expected to be small, and bounded by inprogress_limit.
pub fn next_ready(&mut self) -> Option<UploadOp> {
pub fn next_ready(&mut self) -> Option<(UploadOp, Vec<UploadOp>)> {
// If inprogress_tasks is already at limit, don't schedule anything more.
if self.inprogress_limit > 0 && self.inprogress_tasks.len() >= self.inprogress_limit {
return None;
@@ -151,7 +158,36 @@
return None;
}

return self.queued_operations.remove(i);
let mut op = self.queued_operations.remove(i).expect("i can't disappear");

// Coalesce any back-to-back index uploads by only uploading the newest one that's
// ready. This typically happens with layer/index/layer/index/... sequences, where
// the layers bypass the indexes, leaving the indexes queued.
//
// If other operations are interleaved between index uploads we don't try to
// coalesce them, since we may as well update the index concurrently with them.
// This keeps the index fresh and avoids starvation.
//
// NB: we assume that all uploaded indexes have the same remote path. This
// is true at the time of writing: the path only depends on the tenant,
// timeline and generation, all of which are static for a timeline instance.
// Otherwise, we must be careful not to coalesce different paths.
let mut coalesced_ops = Vec::new();
if matches!(op, UploadOp::UploadMetadata { .. }) {
while let Some(UploadOp::UploadMetadata { .. }) = self.queued_operations.get(i)
{
if *DISABLE_UPLOAD_QUEUE_INDEX_COALESCING {
break;
}
if !self.is_ready(i) {
break;
}
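// `op` is superseded by the newer index at slot i: stash it and
// promote the newer index. After remove(i), any op behind it
// shifts into slot i for the next iteration.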
coalesced_ops.push(op);
op = self.queued_operations.remove(i).expect("i can't disappear");
}
}

return Some((op, coalesced_ops));
}

// Nothing can bypass a barrier or shutdown. If it wasn't scheduled above, give up.
@@ -225,11 +261,12 @@ impl UploadQueueInitialized {
fn schedule_ready(&mut self) -> Vec<Arc<UploadTask>> {
let mut tasks = Vec::new();
// NB: schedule operations one by one, to handle conflicts with inprogress_tasks.
while let Some(op) = self.next_ready() {
while let Some((op, coalesced_ops)) = self.next_ready() {
self.task_counter += 1;
let task = Arc::new(UploadTask {
task_id: self.task_counter,
op,
coalesced_ops,
retries: 0.into(),
});
self.inprogress_tasks.insert(task.task_id, task.clone());
@@ -407,9 +444,13 @@ impl UploadQueue {
pub struct UploadTask {
/// Unique ID of this task. Used as the key in `inprogress_tasks` above.
pub task_id: u64,
/// Number of task retries.
pub retries: AtomicU32,

/// The upload operation.
pub op: UploadOp,
/// Any upload operations that were coalesced into this operation. This typically happens with
/// back-to-back index uploads, see `UploadQueueInitialized::next_ready()`.
pub coalesced_ops: Vec<UploadOp>,
}

/// A deletion of some layers within the lifetime of a timeline. This is not used
@@ -512,9 +553,8 @@ impl UploadOp {
})
}

// Indexes can never bypass each other.
// TODO: we could coalesce them though, by only uploading the newest ready index. This
// is left for later, out of caution.
// Indexes can never bypass each other. They can coalesce though, and
// `UploadQueue::next_ready()` currently does this when possible.
(UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => false,
}
}
@@ -897,9 +937,9 @@ mod tests {
Ok(())
}

/// Index uploads are serialized.
/// Index uploads are coalesced.
#[test]
fn schedule_index_serial() -> anyhow::Result<()> {
fn schedule_index_coalesce() -> anyhow::Result<()> {
let mut queue = UploadQueue::Uninitialized;
let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;

@@ -920,13 +960,11 @@

queue.queued_operations.extend(ops.clone());

// The uploads should run serially.
for op in ops {
let tasks = queue.schedule_ready();
assert_eq!(tasks.len(), 1);
assert_same_op(&tasks[0].op, &op);
queue.complete(tasks[0].task_id);
}
// The index uploads are coalesced into a single operation.
let tasks = queue.schedule_ready();
assert_eq!(tasks.len(), 1);
assert_same_op(&tasks[0].op, &ops[2]);
assert_same_ops(&tasks[0].coalesced_ops, &ops[0..2]);

assert!(queue.queued_operations.is_empty());

@@ -985,18 +1023,14 @@
assert_same_op(&index_tasks[0].op, &ops[1]);
queue.complete(index_tasks[0].task_id);

// layer 1 completes. This unblocks index 1 then index 2.
// layer 1 completes. This unblocks index 1 and 2, which coalesce into
// a single upload for index 2.
queue.complete(upload_tasks[1].task_id);

let index_tasks = queue.schedule_ready();
assert_eq!(index_tasks.len(), 1);
assert_same_op(&index_tasks[0].op, &ops[3]);
queue.complete(index_tasks[0].task_id);

let index_tasks = queue.schedule_ready();
assert_eq!(index_tasks.len(), 1);
assert_same_op(&index_tasks[0].op, &ops[5]);
queue.complete(index_tasks[0].task_id);
assert_same_ops(&index_tasks[0].coalesced_ops, &ops[3..4]);

assert!(queue.queued_operations.is_empty());

@@ -1018,11 +1052,12 @@
let index_deref = index_without(&index_upload, &layer);

let ops = [
// Initial upload.
// Initial upload, with a barrier to prevent index coalescing.
UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
UploadOp::UploadMetadata {
uploaded: index_upload.clone(),
},
UploadOp::Barrier(tokio::sync::watch::channel(()).0),
// Dereference the layer and delete it.
UploadOp::UploadMetadata {
uploaded: index_deref.clone(),
@@ -1063,11 +1098,12 @@
let index_ref = index_with(&index_deref, &layer);

let ops = [
// Initial upload.
// Initial upload, with a barrier to prevent index coalescing.
UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
UploadOp::UploadMetadata {
uploaded: index_upload.clone(),
},
UploadOp::Barrier(tokio::sync::watch::channel(()).0),
// Dereference the layer.
UploadOp::UploadMetadata {
uploaded: index_deref.clone(),

1 comment on commit 6debb49

@github-actions

7455 tests run: 7073 passed, 1 failed, 381 skipped (full report)


Failures on Postgres 16

# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_layer_map[release-pg16-github-actions-selfhosted]"
Flaky tests (1)

Postgres 16

Code coverage* (full report)

  • functions: 32.9% (8144 of 24779 functions)
  • lines: 48.1% (68254 of 141935 lines)

* collected from Rust tests only


6debb49 at 2025-01-14T23:28:15.974Z :recycle:
