Skip to content

Commit

Permalink
fix: long timeline create cancelled by tenant delete (#5917)
Browse files Browse the repository at this point in the history
Fix the fallible vs. infallible check order with
`UninitTimeline::finish_creation` so that the incomplete timeline can be
removed. Currently the order of drop guard unwrapping causes uninit
files to be left on pageserver, blocking the tenant deletion.

Cc: #5914
Cc: #investigation-2023-11-23-stuck-tenant-deletion
  • Loading branch information
koivunej authored Nov 24, 2023
1 parent 831fad4 commit 6b1c4cc
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 8 deletions.
1 change: 1 addition & 0 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1858,6 +1858,7 @@ impl Tenant {
});
})
};
// test_long_timeline_create_then_tenant_delete is leaning on this message
tracing::info!("Waiting for timelines...");
while let Some(res) = js.join_next().await {
match res {
Expand Down
2 changes: 2 additions & 0 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2600,6 +2600,8 @@ impl Timeline {
)
};

pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");

if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}
Expand Down
27 changes: 21 additions & 6 deletions pageserver/src/tenant/timeline/uninit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,20 @@ impl<'t> UninitializedTimeline<'t> {
let timeline_id = self.timeline_id;
let tenant_id = self.owning_tenant.tenant_id;

let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
})?;
if self.raw_timeline.is_none() {
return Err(anyhow::anyhow!(
"No timeline for initialization found for {tenant_id}/{timeline_id}"
));
}

// Check that the caller initialized disk_consistent_lsn
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
let new_disk_consistent_lsn = self
.raw_timeline
.as_ref()
.expect("checked above")
.0
.get_disk_consistent_lsn();

anyhow::ensure!(
new_disk_consistent_lsn.is_valid(),
"new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
Expand All @@ -62,6 +70,13 @@ impl<'t> UninitializedTimeline<'t> {
"Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
),
Entry::Vacant(v) => {
// after taking here should be no fallible operations, because the drop guard will not
// cleanup after and would block for example the tenant deletion
let (new_timeline, uninit_mark) =
self.raw_timeline.take().expect("already checked");

// this is the mutual exclusion between different retries to create the timeline;
// this should be an assertion.
uninit_mark.remove_uninit_mark().with_context(|| {
format!(
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
Expand All @@ -70,10 +85,10 @@ impl<'t> UninitializedTimeline<'t> {
v.insert(Arc::clone(&new_timeline));

new_timeline.maybe_spawn_flush_loop();

Ok(new_timeline)
}
}

Ok(new_timeline)
}

/// Prepares timeline data by loading it from the basebackup archive.
Expand Down
2 changes: 1 addition & 1 deletion test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -1646,7 +1646,7 @@ def __init__(
# env.pageserver.allowed_errors.append(".*could not open garage door.*")
#
# The entries in the list are regular experessions.
self.allowed_errors = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)

def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
"""Get a timeline directory's path based on the repo directory of the test environment"""
Expand Down
77 changes: 76 additions & 1 deletion test_runner/regress/test_tenant_delete.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import enum
import os
import shutil
from threading import Thread

import pytest
from fixtures.log_helper import log
Expand All @@ -27,7 +28,7 @@
available_s3_storages,
)
from fixtures.types import TenantId
from fixtures.utils import run_pg_bench_small
from fixtures.utils import run_pg_bench_small, wait_until


@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
Expand Down Expand Up @@ -399,4 +400,78 @@ def test_tenant_delete_is_resumed_on_attach(
)


def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
"""Reproduction of 2023-11-23 stuck tenants investigation"""

# do not use default tenant/timeline creation because it would output the failpoint log message too early
env = neon_env_builder.init_configs()
env.start()
pageserver_http = env.pageserver.http_client()

# happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero
env.pageserver.allowed_errors.append(
".*Timeline got dropped without initializing, cleaning its files"
)
# the response hit_pausable_failpoint_and_later_fail
env.pageserver.allowed_errors.append(
f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn"
)

pageserver_http.tenant_create(env.initial_tenant)

failpoint = "flush-layer-cancel-after-writing-layer-out-pausable"
pageserver_http.configure_failpoints((failpoint, "pause"))

def hit_pausable_failpoint_and_later_fail():
with pytest.raises(
PageserverApiException, match="new timeline \\S+ has invalid disk_consistent_lsn"
):
pageserver_http.timeline_create(
env.pg_version, env.initial_tenant, env.initial_timeline
)

def start_deletion():
pageserver_http.tenant_delete(env.initial_tenant)

def has_hit_failpoint():
assert env.pageserver.log_contains(f"at failpoint {failpoint}") is not None

def deletion_has_started_waiting_for_timelines():
assert env.pageserver.log_contains("Waiting for timelines...") is not None

def tenant_is_deleted():
try:
pageserver_http.tenant_status(env.initial_tenant)
except PageserverApiException as e:
assert e.status_code == 404
else:
raise RuntimeError("tenant was still accessible")

creation = Thread(target=hit_pausable_failpoint_and_later_fail)
creation.start()

deletion = None

try:
wait_until(10, 1, has_hit_failpoint)

# it should start ok, sync up with the stuck creation, then fail because disk_consistent_lsn was not updated
# then deletion should fail and set the tenant broken
deletion = Thread(target=start_deletion)
deletion.start()

wait_until(10, 1, deletion_has_started_waiting_for_timelines)

pageserver_http.configure_failpoints((failpoint, "off"))

creation.join()
deletion.join()

wait_until(10, 1, tenant_is_deleted)
finally:
creation.join()
if deletion is not None:
deletion.join()


# TODO test concurrent deletions with "hang" failpoint

1 comment on commit 6b1c4cc

@github-actions
Copy link

@github-actions github-actions bot commented on 6b1c4cc Nov 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2492 tests run: 2356 passed, 0 failed, 136 skipped (full report)


Flaky tests (6)

Postgres 16

  • test_branching_with_pgbench[flat-1-10]: debug
  • test_crafted_wal_end[last_wal_record_crossing_segment]: debug
  • test_lfc_resize: debug
  • test_pitr_gc: debug

Postgres 15

  • test_crafted_wal_end[last_wal_record_crossing_segment]: release

Postgres 14

Code coverage (full report)

  • functions: 54.1% (8999 of 16637 functions)
  • lines: 81.5% (52493 of 64406 lines)

The comment gets automatically updated with the latest test results
6b1c4cc at 2023-11-24T17:39:24.728Z :recycle:

Please sign in to comment.