Skip to content

Commit

Permalink
pageserver: safety checks on validity of uploaded indices (#10403)
Browse files Browse the repository at this point in the history
## Problem

Occasionally, we encounter bugs in test environments that can be
detected at the point of uploading an index, but we proceed to upload it
anyway and leave a tenant in a broken state that's awkward to handle.

## Summary of changes

- Validate index when submitting it for upload, so that we can see the
issue quickly e.g. in an API invoking compaction
- Validate index before executing the upload, so that we have a hard
enforcement that any code path that tries to upload an index will not
overwrite a valid index with an invalid one.
  • Loading branch information
jcsp authored Jan 20, 2025
1 parent b0f3409 commit 8bdaee3
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 10 deletions.
6 changes: 6 additions & 0 deletions pageserver/src/tenant/remote_timeline_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,12 @@ impl RemoteTimelineClient {

upload_queue.dirty.metadata.apply(update);

// Defense in depth: if we somehow generated invalid metadata, do not persist it.
upload_queue
.dirty
.validate()
.map_err(|e| anyhow::anyhow!(e))?;

self.schedule_index_upload(upload_queue);

Ok(())
Expand Down
15 changes: 15 additions & 0 deletions pageserver/src/tenant/remote_timeline_client/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,21 @@ impl IndexPart {
};
is_same_remote_layer_path(name, metadata, name, index_metadata)
}

/// Check for invariants in the index: this is useful when uploading an index to ensure that if
/// we encounter a bug, we do not persist buggy metadata.
pub(crate) fn validate(&self) -> Result<(), String> {
if self.import_pgdata.is_none()
&& self.metadata.ancestor_timeline().is_none()
&& self.layer_metadata.is_empty()
{
// Unless we're in the middle of a raw pgdata import, or this is a child timeline,the index must
// always have at least one layer.
return Err("Index has no ancestor and no layers".to_string());
}

Ok(())
}
}

/// Metadata gathered for each of the layer files.
Expand Down
4 changes: 4 additions & 0 deletions pageserver/src/tenant/remote_timeline_client/upload.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ pub(crate) async fn upload_index_part(
});
pausable_failpoint!("before-upload-index-pausable");

// Safety: refuse to persist invalid index metadata, to mitigate the impact of any bug that produces this
// (this should never happen)
index_part.validate().map_err(|e| anyhow::anyhow!(e))?;

// FIXME: this error comes too late
let serialized = index_part.to_json_bytes()?;
let serialized = Bytes::from(serialized);
Expand Down
56 changes: 48 additions & 8 deletions pageserver/src/tenant/storage_layer/layer/tests.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::time::UNIX_EPOCH;

use pageserver_api::key::CONTROLFILE_KEY;
use pageserver_api::key::{Key, CONTROLFILE_KEY};
use tokio::task::JoinSet;
use utils::{
completion::{self, Completion},
Expand All @@ -9,7 +9,10 @@ use utils::{

use super::failpoints::{Failpoint, FailpointKind};
use super::*;
use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint};
use crate::{
context::DownloadBehavior,
tenant::{harness::test_img, storage_layer::LayerVisibilityHint},
};
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};

/// Used in tests to advance a future to wanted await point, and not futher.
Expand All @@ -31,20 +34,51 @@ async fn smoke_test() {

let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);

let image_layers = vec![(
Lsn(0x40),
vec![(
Key::from_hex("620000000033333333444444445500000000").unwrap(),
test_img("foo"),
)],
)];

// Create a test timeline with one real layer, and one synthetic test layer. The synthetic
// one is only there so that we can GC the real one without leaving the timeline's metadata
// empty, which is an illegal state (see [`IndexPart::validate`]).
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.create_test_timeline_with_layers(
TimelineId::generate(),
Lsn(0x10),
14,
&ctx,
Default::default(),
image_layers,
Lsn(0x100),
)
.await
.unwrap();

let layer = {
// Grab one of the timeline's layers to exercise in the test, and the other layer that is just
// there to avoid the timeline being illegally empty
let (layer, dummy_layer) = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};

assert_eq!(layers.len(), 1);
assert_eq!(layers.len(), 2);

layers.swap_remove(0)
layers.sort_by_key(|l| l.layer_desc().get_key_range().start);
let synthetic_layer = layers.pop().unwrap();
let real_layer = layers.pop().unwrap();
tracing::info!(
"real_layer={:?} ({}), synthetic_layer={:?} ({})",
real_layer,
real_layer.layer_desc().file_size,
synthetic_layer,
synthetic_layer.layer_desc().file_size
);
(real_layer, synthetic_layer)
};

// all layers created at pageserver are like `layer`, initialized with strong
Expand Down Expand Up @@ -173,10 +207,13 @@ async fn smoke_test() {

let rtc = &timeline.remote_client;

// Simulate GC removing our test layer.
{
let layers = &[layer];
let mut g = timeline.layers.write().await;

let layers = &[layer];
g.open_mut().unwrap().finish_gc_timeline(layers);

// this just updates the remote_physical_size for demonstration purposes
rtc.schedule_gc_update(layers).unwrap();
}
Expand All @@ -191,7 +228,10 @@ async fn smoke_test() {

rtc.wait_completion().await.unwrap();

assert_eq!(rtc.get_remote_physical_size(), 0);
assert_eq!(
rtc.get_remote_physical_size(),
dummy_layer.metadata().file_size
);
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}

Expand Down
20 changes: 18 additions & 2 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5678,9 +5678,17 @@ impl Timeline {
info!("force created image layer {}", image_layer.local_path());
{
let mut guard = self.layers.write().await;
guard.open_mut().unwrap().force_insert_layer(image_layer);
guard
.open_mut()
.unwrap()
.force_insert_layer(image_layer.clone());
}

// Update remote_timeline_client state to reflect existence of this layer
self.remote_client
.schedule_layer_file_upload(image_layer)
.unwrap();

Ok(())
}

Expand Down Expand Up @@ -5731,9 +5739,17 @@ impl Timeline {
info!("force created delta layer {}", delta_layer.local_path());
{
let mut guard = self.layers.write().await;
guard.open_mut().unwrap().force_insert_layer(delta_layer);
guard
.open_mut()
.unwrap()
.force_insert_layer(delta_layer.clone());
}

// Update remote_timeline_client state to reflect existence of this layer
self.remote_client
.schedule_layer_file_upload(delta_layer)
.unwrap();

Ok(())
}

Expand Down

1 comment on commit 8bdaee3

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7499 tests run: 7111 passed, 1 failed, 387 skipped (full report)


Failures on Postgres 16

  • test_ingest_logical_message[github-actions-selfhosted-nofsync-131072]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_ingest_logical_message[release-pg16-github-actions-selfhosted-nofsync-131072]"
Flaky tests (6)

Postgres 17

Postgres 16

Postgres 15

Code coverage* (full report)

  • functions: 33.6% (8436 of 25093 functions)
  • lines: 49.1% (70615 of 143736 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
8bdaee3 at 2025-01-20T11:47:29.347Z :recycle:

Please sign in to comment.