From 8bdaee35f3dec86b37bb6b91be57a88a86d9ad33 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 20 Jan 2025 09:20:31 +0000 Subject: [PATCH] pageserver: safety checks on validity of uploaded indices (#10403) ## Problem Occasionally, we encounter bugs in test environments that can be detected at the point of uploading an index, but we proceed to upload it anyway and leave a tenant in a broken state that's awkward to handle. ## Summary of changes - Validate index when submitting it for upload, so that we can see the issue quickly e.g. in an API invoking compaction - Validate index before executing the upload, so that we have a hard enforcement that any code path that tries to upload an index will not overwrite a valid index with an invalid one. --- .../src/tenant/remote_timeline_client.rs | 6 ++ .../tenant/remote_timeline_client/index.rs | 15 +++++ .../tenant/remote_timeline_client/upload.rs | 4 ++ .../src/tenant/storage_layer/layer/tests.rs | 56 ++++++++++++++++--- pageserver/src/tenant/timeline.rs | 20 ++++++- 5 files changed, 91 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index a0066477859f..bcba6d1f62ea 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -803,6 +803,12 @@ impl RemoteTimelineClient { upload_queue.dirty.metadata.apply(update); + // Defense in depth: if we somehow generated invalid metadata, do not persist it. + upload_queue + .dirty + .validate() + .map_err(|e| anyhow::anyhow!(e))?; + self.schedule_index_upload(upload_queue); Ok(()) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 244be5bbb7a2..08e94ae19715 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -152,6 +152,21 @@ impl IndexPart { }; is_same_remote_layer_path(name, metadata, name, index_metadata) } + + /// Check for invariants in the index: this is useful when uploading an index to ensure that if + /// we encounter a bug, we do not persist buggy metadata. + pub(crate) fn validate(&self) -> Result<(), String> { + if self.import_pgdata.is_none() + && self.metadata.ancestor_timeline().is_none() + && self.layer_metadata.is_empty() + { + // Unless we're in the middle of a raw pgdata import, or this is a child timeline,the index must + // always have at least one layer. + return Err("Index has no ancestor and no layers".to_string()); + } + + Ok(()) + } } /// Metadata gathered for each of the layer files. diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index e434d24e5f9c..af4dbbbfb654 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -40,6 +40,10 @@ pub(crate) async fn upload_index_part( }); pausable_failpoint!("before-upload-index-pausable"); + // Safety: refuse to persist invalid index metadata, to mitigate the impact of any bug that produces this + // (this should never happen) + index_part.validate().map_err(|e| anyhow::anyhow!(e))?; + // FIXME: this error comes too late let serialized = index_part.to_json_bytes()?; let serialized = Bytes::from(serialized); diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 36dcc8d8050c..fcb73ad20d59 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,6 +1,6 @@ use std::time::UNIX_EPOCH; -use pageserver_api::key::CONTROLFILE_KEY; +use pageserver_api::key::{Key, CONTROLFILE_KEY}; use tokio::task::JoinSet; use utils::{ completion::{self, Completion}, @@ -9,7 +9,10 @@ use utils::{ use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint}; +use crate::{ + context::DownloadBehavior, + tenant::{harness::test_img, storage_layer::LayerVisibilityHint}, +}; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; /// Used in tests to advance a future to wanted await point, and not futher. @@ -31,20 +34,51 @@ async fn smoke_test() { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + let image_layers = vec![( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("foo"), + )], + )]; + + // Create a test timeline with one real layer, and one synthetic test layer. The synthetic + // one is only there so that we can GC the real one without leaving the timeline's metadata + // empty, which is an illegal state (see [`IndexPart::validate`]). let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + Default::default(), + image_layers, + Lsn(0x100), + ) .await .unwrap(); - let layer = { + // Grab one of the timeline's layers to exercise in the test, and the other layer that is just + // there to avoid the timeline being illegally empty + let (layer, dummy_layer) = { let mut layers = { let layers = timeline.layers.read().await; layers.likely_resident_layers().cloned().collect::>() }; - assert_eq!(layers.len(), 1); + assert_eq!(layers.len(), 2); - layers.swap_remove(0) + layers.sort_by_key(|l| l.layer_desc().get_key_range().start); + let synthetic_layer = layers.pop().unwrap(); + let real_layer = layers.pop().unwrap(); + tracing::info!( + "real_layer={:?} ({}), synthetic_layer={:?} ({})", + real_layer, + real_layer.layer_desc().file_size, + synthetic_layer, + synthetic_layer.layer_desc().file_size + ); + (real_layer, synthetic_layer) }; // all layers created at pageserver are like `layer`, initialized with strong @@ -173,10 +207,13 @@ async fn smoke_test() { let rtc = &timeline.remote_client; + // Simulate GC removing our test layer. { - let layers = &[layer]; let mut g = timeline.layers.write().await; + + let layers = &[layer]; g.open_mut().unwrap().finish_gc_timeline(layers); + // this just updates the remote_physical_size for demonstration purposes rtc.schedule_gc_update(layers).unwrap(); } @@ -191,7 +228,10 @@ async fn smoke_test() { rtc.wait_completion().await.unwrap(); - assert_eq!(rtc.get_remote_physical_size(), 0); + assert_eq!( + rtc.get_remote_physical_size(), + dummy_layer.metadata().file_size + ); assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2ba71416b869..5f4272fb2b8b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5678,9 +5678,17 @@ impl Timeline { info!("force created image layer {}", image_layer.local_path()); { let mut guard = self.layers.write().await; - guard.open_mut().unwrap().force_insert_layer(image_layer); + guard + .open_mut() + .unwrap() + .force_insert_layer(image_layer.clone()); } + // Update remote_timeline_client state to reflect existence of this layer + self.remote_client + .schedule_layer_file_upload(image_layer) + .unwrap(); + Ok(()) } @@ -5731,9 +5739,17 @@ impl Timeline { info!("force created delta layer {}", delta_layer.local_path()); { let mut guard = self.layers.write().await; - guard.open_mut().unwrap().force_insert_layer(delta_layer); + guard + .open_mut() + .unwrap() + .force_insert_layer(delta_layer.clone()); } + // Update remote_timeline_client state to reflect existence of this layer + self.remote_client + .schedule_layer_file_upload(delta_layer) + .unwrap(); + Ok(()) }