Skip to content

Commit

Permalink
Saga pantry attach/detach: fail if pantry is removed from DNS (#6866)
Browse files Browse the repository at this point in the history
Removal from DNS should correspond with the pantry zone's expungement,
which should result in a saga node failure.
  • Loading branch information
jgallagher authored Nov 13, 2024
1 parent 45f5f1c commit 69de8b6
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 50 deletions.
9 changes: 6 additions & 3 deletions common/src/progenitor_operation_retry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,20 @@ use crate::backoff::retry_notify;
use crate::backoff::retry_policy_internal_service;
use crate::backoff::BackoffError;

#[derive(Debug)]
#[derive(Debug, thiserror::Error)]
pub enum ProgenitorOperationRetryError<E> {
/// Nexus determined that the operation will never return a known result
/// because the remote server is gone.
#[error("remote server is gone")]
Gone,

/// Attempting to check if the retry loop should be stopped failed
GoneCheckError(Error),
#[error("failed to determine whether remote server is gone")]
GoneCheckError(#[source] Error),

/// The retry loop progenitor operation saw a permanent client error
ProgenitorError(progenitor_client::Error<E>),
#[error("permanent error")]
ProgenitorError(#[source] progenitor_client::Error<E>),
}

impl<E> ProgenitorOperationRetryError<E> {
Expand Down
81 changes: 62 additions & 19 deletions nexus/src/app/sagas/common_storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,16 @@
use super::*;

use crate::Nexus;
use crucible_pantry_client::types::Error as CruciblePantryClientError;
use crucible_pantry_client::types::VolumeConstructionRequest;
use internal_dns_types::names::ServiceName;
use nexus_db_queries::authz;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db;
use nexus_db_queries::db::lookup::LookupPath;
use omicron_common::api::external::Error;
use omicron_common::retry_until_known_result;
use omicron_common::progenitor_operation_retry::ProgenitorOperationRetry;
use omicron_common::progenitor_operation_retry::ProgenitorOperationRetryError;
use slog::Logger;
use slog_error_chain::InlineErrorChain;
use std::net::SocketAddrV6;
Expand All @@ -26,7 +29,7 @@ pub(crate) use pantry_pool::PooledPantryClient;
// Common Pantry operations

pub(crate) async fn get_pantry_address(
nexus: &Arc<Nexus>,
nexus: &Nexus,
) -> Result<SocketAddrV6, ActionError> {
let client = nexus.pantry_connection_pool().claim().await.map_err(|e| {
ActionError::action_failed(format!(
Expand All @@ -37,10 +40,42 @@ pub(crate) async fn get_pantry_address(
Ok(client.address())
}

// Helper function for attach/detach below: we retry as long as the pantry isn't
// gone, and we detect "gone" by seeing whether the pantry address we've chosen
// is still present when we resolve all the crucible pantry records in DNS.
//
// This function never returns an error because it's expected to be used with
// `ProgenitorOperationRetry`, which treats an error in the "gone check" as a
// fatal error. We don't want to amplify failures: if something is wrong with
// DNS, we can't go back and choose another pantry anyway, so we'll just keep
// retrying until DNS comes back. All that to say: a failure to resolve DNS is
// treated as "the pantry is not gone".
pub(super) async fn is_pantry_gone(
nexus: &Nexus,
pantry_address: SocketAddrV6,
log: &Logger,
) -> bool {
let all_pantry_dns_entries = match nexus
.resolver()
.lookup_all_socket_v6(ServiceName::CruciblePantry)
.await
{
Ok(entries) => entries,
Err(err) => {
warn!(
log, "Failed to query DNS for Crucible pantry";
InlineErrorChain::new(&err),
);
return false;
}
};
!all_pantry_dns_entries.contains(&pantry_address)
}

pub(crate) async fn call_pantry_attach_for_disk(
log: &slog::Logger,
opctx: &OpContext,
nexus: &Arc<Nexus>,
nexus: &Nexus,
disk_id: Uuid,
pantry_address: SocketAddrV6,
) -> Result<(), ActionError> {
Expand Down Expand Up @@ -82,37 +117,45 @@ pub(crate) async fn call_pantry_attach_for_disk(
volume_construction_request,
};

retry_until_known_result(log, || async {
client.attach(&disk_id.to_string(), &attach_request).await
})
.await
.map_err(|e| {
ActionError::action_failed(format!("pantry attach failed with {:?}", e))
})?;
let attach_operation =
|| async { client.attach(&disk_id.to_string(), &attach_request).await };
let gone_check =
|| async { Ok(is_pantry_gone(nexus, pantry_address, log).await) };

ProgenitorOperationRetry::new(attach_operation, gone_check)
.run(log)
.await
.map_err(|e| {
ActionError::action_failed(format!(
"pantry attach failed: {}",
InlineErrorChain::new(&e)
))
})?;

Ok(())
}

pub(crate) async fn call_pantry_detach_for_disk(
nexus: &Nexus,
log: &slog::Logger,
disk_id: Uuid,
pantry_address: SocketAddrV6,
) -> Result<(), ActionError> {
) -> Result<(), ProgenitorOperationRetryError<CruciblePantryClientError>> {
let endpoint = format!("http://{}", pantry_address);

info!(log, "sending detach for disk {disk_id} to endpoint {endpoint}");

let client = crucible_pantry_client::Client::new(&endpoint);

retry_until_known_result(log, || async {
client.detach(&disk_id.to_string()).await
})
.await
.map_err(|e| {
ActionError::action_failed(format!("pantry detach failed with {:?}", e))
})?;
let detach_operation =
|| async { client.detach(&disk_id.to_string()).await };
let gone_check =
|| async { Ok(is_pantry_gone(nexus, pantry_address, log).await) };

Ok(())
ProgenitorOperationRetry::new(detach_operation, gone_check)
.run(log)
.await
.map(|_response| ())
}

pub(crate) fn find_only_new_region(
Expand Down
11 changes: 3 additions & 8 deletions nexus/src/app/sagas/common_storage/pantry_pool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,9 @@ impl backend::Connector for PantryConnector {
};
let client =
crucible_pantry_client::Client::new(&format!("http://{address}"));
Ok(PooledPantryClient { client, address })
let mut conn = PooledPantryClient { client, address };
self.is_valid(&mut conn).await?;
Ok(conn)
}

async fn is_valid(
Expand All @@ -72,13 +74,6 @@ impl backend::Connector for PantryConnector {

Ok(())
}

async fn on_acquire(
&self,
conn: &mut Self::Connection,
) -> Result<(), backend::Error> {
self.is_valid(conn).await
}
}

pub(crate) fn make_pantry_connection_pool(
Expand Down
8 changes: 7 additions & 1 deletion nexus/src/app/sagas/disk_create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,13 @@ async fn sdc_call_pantry_attach_for_disk_undo(

let pantry_address = sagactx.lookup::<SocketAddrV6>("pantry_address")?;

call_pantry_detach_for_disk(&log, disk_id, pantry_address).await?;
call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
disk_id,
pantry_address,
)
.await?;

Ok(())
}
Expand Down
15 changes: 14 additions & 1 deletion nexus/src/app/sagas/finalize_disk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use omicron_common::api::external::Error;
use omicron_common::api::external::Name;
use serde::Deserialize;
use serde::Serialize;
use slog_error_chain::InlineErrorChain;
use std::net::SocketAddrV6;
use steno::ActionError;
use steno::Node;
Expand Down Expand Up @@ -286,7 +287,19 @@ async fn sfd_call_pantry_detach_for_disk(
let params = sagactx.saga_params::<Params>()?;
let pantry_address = sagactx.lookup::<SocketAddrV6>("pantry_address")?;

call_pantry_detach_for_disk(&log, params.disk_id, pantry_address).await
call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
params.disk_id,
pantry_address,
)
.await
.map_err(|e| {
ActionError::action_failed(format!(
"pantry detach failed: {}",
InlineErrorChain::new(&e)
))
})
}

async fn sfd_clear_pantry_address(
Expand Down
8 changes: 3 additions & 5 deletions nexus/src/app/sagas/region_replacement_drive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1284,11 +1284,9 @@ async fn execute_pantry_drive_action(
volume_id: Uuid,
job_id: Uuid,
) -> Result<(), ActionError> {
// Importantly, _do not use `call_pantry_attach_for_disk`_! That uses
// `retry_until_known_result`, which we _do not want here_. The Pantry
// attach can fail if there's a racing Volume checkout to be sent to
// Propolis. Additionally, that call uses `attach` instead of
// `attach_activate_background`, which means it will hang on the activation.
// Importantly, _do not use `call_pantry_attach_for_disk`_! That call uses
// `attach` instead of `attach_activate_background`, which means it will
// hang on the activation.

let endpoint = format!("http://{}", pantry_address);
let client = crucible_pantry_client::Client::new(&endpoint);
Expand Down
66 changes: 53 additions & 13 deletions nexus/src/app/sagas/snapshot_create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
use super::{
common_storage::{
call_pantry_attach_for_disk, call_pantry_detach_for_disk,
get_pantry_address,
get_pantry_address, is_pantry_gone,
},
ActionRegistry, NexusActionContext, NexusSaga, SagaInitError,
ACTION_GENERATE_ID,
Expand All @@ -103,9 +103,14 @@ use anyhow::anyhow;
use nexus_db_model::Generation;
use nexus_db_queries::db::identity::{Asset, Resource};
use nexus_db_queries::db::lookup::LookupPath;
use omicron_common::api::external;
use omicron_common::api::external::Error;
use omicron_common::retry_until_known_result;
use omicron_common::{
api::external, progenitor_operation_retry::ProgenitorOperationRetry,
};
use omicron_common::{
api::external::Error,
progenitor_operation_retry::ProgenitorOperationRetryError,
};
use omicron_uuid_kinds::{GenericUuid, PropolisUuid, SledUuid};
use rand::{rngs::StdRng, RngCore, SeedableRng};
use serde::Deserialize;
Expand All @@ -114,6 +119,7 @@ use sled_agent_client::types::CrucibleOpts;
use sled_agent_client::types::VmmIssueDiskSnapshotRequestBody;
use sled_agent_client::types::VolumeConstructionRequest;
use slog::info;
use slog_error_chain::InlineErrorChain;
use std::collections::BTreeMap;
use std::collections::VecDeque;
use std::net::SocketAddrV6;
Expand Down Expand Up @@ -1145,8 +1151,25 @@ async fn ssc_call_pantry_attach_for_disk_undo(
pantry_address
);

call_pantry_detach_for_disk(&log, params.disk_id, pantry_address)
.await?;
match call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
params.disk_id,
pantry_address,
)
.await
{
// We can treat the pantry being permanently gone as success.
Ok(()) | Err(ProgenitorOperationRetryError::Gone) => (),
Err(err) => {
return Err(anyhow!(
"failed to detach disk {} from pantry at {}: {}",
params.disk_id,
pantry_address,
InlineErrorChain::new(&err)
))
}
}
} else {
info!(
log,
Expand All @@ -1162,6 +1185,7 @@ async fn ssc_call_pantry_snapshot_for_disk(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let log = sagactx.user_data().log();
let nexus = sagactx.user_data().nexus();
let params = sagactx.saga_params::<Params>()?;

let (pantry_address, _) =
Expand All @@ -1180,7 +1204,7 @@ async fn ssc_call_pantry_snapshot_for_disk(

let client = crucible_pantry_client::Client::new(&endpoint);

retry_until_known_result(log, || async {
let snapshot_operation = || async {
client
.snapshot(
&params.disk_id.to_string(),
Expand All @@ -1189,11 +1213,16 @@ async fn ssc_call_pantry_snapshot_for_disk(
},
)
.await
})
.await
.map_err(|e| {
ActionError::action_failed(Error::internal_error(&e.to_string()))
})?;
};
let gone_check =
|| async { Ok(is_pantry_gone(nexus, pantry_address, log).await) };

ProgenitorOperationRetry::new(snapshot_operation, gone_check)
.run(log)
.await
.map_err(|e| {
ActionError::action_failed(Error::internal_error(&e.to_string()))
})?;

Ok(())
}
Expand Down Expand Up @@ -1248,8 +1277,19 @@ async fn ssc_call_pantry_detach_for_disk(
params.disk_id,
pantry_address
);
call_pantry_detach_for_disk(&log, params.disk_id, pantry_address)
.await?;
call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
params.disk_id,
pantry_address,
)
.await
.map_err(|e| {
ActionError::action_failed(format!(
"pantry detach failed: {}",
InlineErrorChain::new(&e)
))
})?;
}

Ok(())
Expand Down

0 comments on commit 69de8b6

Please sign in to comment.