From 2406d9d93258cc33b57d03c7abcd90a95ca487c4 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 14 Mar 2024 12:44:49 -0400 Subject: [PATCH] Accept notifications from Crucible (#5135) Allow any Upstairs to notify Nexus about the start or completion (plus status) of live repairs. The immediate motivation is the final stage of region replacement, where Nexus must be notified that the replacement has finished; more generally, this can be used to keep track of how many times repair occurs for each region. Also accept notifications for: - when a downstairs client is requested to stop - when a downstairs client stops These will be used as breadcrumbs to determine when downstairs were having problems, why repairs started in the first place, and more. Fixes #5120 --- Cargo.lock | 3 + clients/nexus-client/Cargo.toml | 1 + clients/nexus-client/src/lib.rs | 4 + common/Cargo.toml | 1 + common/src/api/internal/nexus.rs | 83 ++ common/src/lib.rs | 81 ++ nexus/Cargo.toml | 1 + nexus/db-model/src/downstairs.rs | 133 +++ nexus/db-model/src/lib.rs | 4 + nexus/db-model/src/schema.rs | 46 +- nexus/db-model/src/upstairs_repair.rs | 154 ++++ nexus/db-queries/src/db/datastore/volume.rs | 249 ++++++ nexus/db-queries/src/db/pool_connection.rs | 4 + nexus/src/app/instance_network.rs | 2 +- nexus/src/app/sagas/common_storage.rs | 2 +- .../src/app/sagas/loopback_address_create.rs | 2 +- .../src/app/sagas/loopback_address_delete.rs | 2 +- nexus/src/app/sagas/mod.rs | 82 -- nexus/src/app/sagas/snapshot_create.rs | 2 +- .../app/sagas/switch_port_settings_apply.rs | 2 +- .../app/sagas/switch_port_settings_clear.rs | 2 +- nexus/src/app/volume.rs | 162 ++++ nexus/src/internal_api/http_entrypoints.rs | 180 ++++ .../integration_tests/volume_management.rs | 768 ++++++++++++++++++ openapi/nexus-internal.json | 401 +++++++++ schema/crdb/43.0.0/up01.sql | 5 + schema/crdb/43.0.0/up02.sql | 4 + schema/crdb/43.0.0/up03.sql | 21 + schema/crdb/43.0.0/up04.sql | 8 + schema/crdb/43.0.0/up05.sql | 11 + schema/crdb/43.0.0/up06.sql | 8 + schema/crdb/43.0.0/up07.sql | 11 + schema/crdb/43.0.0/up08.sql | 8 + schema/crdb/dbinit.sql | 86 +- uuid-kinds/src/lib.rs | 5 + 35 files changed, 2447 insertions(+), 91 deletions(-) create mode 100644 nexus/db-model/src/downstairs.rs create mode 100644 nexus/db-model/src/upstairs_repair.rs create mode 100644 schema/crdb/43.0.0/up01.sql create mode 100644 schema/crdb/43.0.0/up02.sql create mode 100644 schema/crdb/43.0.0/up03.sql create mode 100644 schema/crdb/43.0.0/up04.sql create mode 100644 schema/crdb/43.0.0/up05.sql create mode 100644 schema/crdb/43.0.0/up06.sql create mode 100644 schema/crdb/43.0.0/up07.sql create mode 100644 schema/crdb/43.0.0/up08.sql diff --git a/Cargo.lock b/Cargo.lock index 88c78a83d4..84f365e2f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4369,6 +4369,7 @@ dependencies = [ "nexus-types", "omicron-common", "omicron-passwords", + "omicron-uuid-kinds", "omicron-workspace-hack", "progenitor", "regress 0.9.0", @@ -5057,6 +5058,7 @@ dependencies = [ "once_cell", "parse-display", "progenitor", + "progenitor-client", "proptest", "rand 0.8.5", "regress 0.9.0", @@ -5216,6 +5218,7 @@ dependencies = [ "omicron-rpaths", "omicron-sled-agent", "omicron-test-utils", + "omicron-uuid-kinds", "omicron-workspace-hack", "once_cell", "openapi-lint", diff --git a/clients/nexus-client/Cargo.toml b/clients/nexus-client/Cargo.toml index 965e2a7dfb..fd6df6919f 100644 --- a/clients/nexus-client/Cargo.toml +++ b/clients/nexus-client/Cargo.toml @@ -20,3 +20,4 @@ serde_json.workspace = true 
slog.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true +omicron-uuid-kinds.workspace = true diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 85c67ddbfd..ad8269e675 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -35,6 +35,10 @@ progenitor::generate_api!( NewPasswordHash = omicron_passwords::NewPasswordHash, NetworkInterface = omicron_common::api::internal::shared::NetworkInterface, NetworkInterfaceKind = omicron_common::api::internal::shared::NetworkInterfaceKind, + TypedUuidForDownstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::DownstairsKind>, + TypedUuidForUpstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsKind>, + TypedUuidForUpstairsRepairKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsRepairKind>, + TypedUuidForUpstairsSessionKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsSessionKind>, }, patch = { SledAgentInfo = { derives = [PartialEq, Eq] }, diff --git a/common/Cargo.toml b/common/Cargo.toml index 4451d92bdb..b16415b828 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -41,6 +41,7 @@ tokio = { workspace = true, features = ["full"] } uuid.workspace = true parse-display.workspace = true progenitor.workspace = true +progenitor-client.workspace = true omicron-workspace-hack.workspace = true once_cell.workspace = true regress.workspace = true diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 3972e011cf..24ef9a16aa 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -9,6 +9,10 @@ use crate::api::external::{ InstanceState, IpNet, SemverVersion, Vni, }; use chrono::{DateTime, Utc}; +use omicron_uuid_kinds::DownstairsRegionKind; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsRepairKind; +use omicron_uuid_kinds::UpstairsSessionKind; use parse_display::{Display, FromStr}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -251,3 +255,82 @@ pub enum HostIdentifier { Ip(IpNet), Vpc(Vni), } + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone, Copy)] +#[serde(rename_all = "snake_case")] +pub enum UpstairsRepairType { + Live, + Reconciliation, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct DownstairsUnderRepair { + pub region_uuid: TypedUuid<DownstairsRegionKind>, + pub target_addr: std::net::SocketAddrV6, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct RepairStartInfo { + pub time: DateTime<Utc>, + pub session_id: TypedUuid<UpstairsSessionKind>, + pub repair_id: TypedUuid<UpstairsRepairKind>, + pub repair_type: UpstairsRepairType, + pub repairs: Vec<DownstairsUnderRepair>, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct RepairFinishInfo { + pub time: DateTime<Utc>, + pub session_id: TypedUuid<UpstairsSessionKind>, + pub repair_id: TypedUuid<UpstairsRepairKind>, + pub repair_type: UpstairsRepairType, + pub repairs: Vec<DownstairsUnderRepair>, + pub aborted: bool, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct RepairProgress { + pub time: DateTime<Utc>, + pub current_item: i64, + pub total_items: i64, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +#[serde(rename_all = "snake_case")] +pub enum DownstairsClientStopRequestReason { + Replacing, + Disabled, + FailedReconcile, + IOError, + BadNegotiationOrder, + Incompatible, + FailedLiveRepair, + TooManyOutstandingJobs, + Deactivated, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct DownstairsClientStopRequest { + pub time: DateTime<Utc>, + pub reason: DownstairsClientStopRequestReason, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +#[serde(rename_all = "snake_case")] +pub enum 
DownstairsClientStoppedReason { + ConnectionTimeout, + ConnectionFailed, + Timeout, + WriteFailed, + ReadFailed, + RequestedStop, + Finished, + QueueClosed, + ReceiveTaskCancelled, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct DownstairsClientStopped { + pub time: DateTime<Utc>, + pub reason: DownstairsClientStoppedReason, +} diff --git a/common/src/lib.rs b/common/src/lib.rs index 411bc3e426..24fa4dfba0 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -77,3 +77,84 @@ impl slog::KV for FileKv { } pub const OMICRON_DPD_TAG: &str = "omicron"; + +use futures::Future; +use slog::warn; + +/// Retry a progenitor client operation until a known result is returned. +/// +/// Saga execution relies on the outcome of an external call being known: since +/// they are idempotent, reissue the external call until a known result comes +/// back. Retry if a communication error is seen, or if another retryable error +/// is seen. +/// +/// Note that retrying is only valid if the call itself is idempotent. +pub async fn retry_until_known_result<F, Fut, T, E>( + log: &slog::Logger, + mut f: F, +) -> Result<T, progenitor_client::Error<E>> +where + F: FnMut() -> Fut, + Fut: Future<Output = Result<T, progenitor_client::Error<E>>>, + E: std::fmt::Debug, +{ + backoff::retry_notify( + backoff::retry_policy_internal_service(), + move || { + let fut = f(); + async move { + match fut.await { + Err(progenitor_client::Error::CommunicationError(e)) => { + warn!( + log, + "saw transient communication error {}, retrying...", + e, + ); + + Err(backoff::BackoffError::transient( + progenitor_client::Error::CommunicationError(e), + )) + } + + Err(progenitor_client::Error::ErrorResponse( + response_value, + )) => { + match response_value.status() { + // Retry on 503 or 429 + http::StatusCode::SERVICE_UNAVAILABLE + | http::StatusCode::TOO_MANY_REQUESTS => { + Err(backoff::BackoffError::transient( + progenitor_client::Error::ErrorResponse( + response_value, + ), + )) + } + + // Anything else is a permanent error + _ => Err(backoff::BackoffError::Permanent( + progenitor_client::Error::ErrorResponse( + response_value, + ), + )), + } + } + + Err(e) => { + warn!(log, "saw permanent error {}, aborting", e,); + + Err(backoff::BackoffError::Permanent(e)) + } + + Ok(v) => Ok(v), + } + } + }, + |error: progenitor_client::Error<_>, delay| { + warn!( + log, + "failed external call ({:?}), will retry in {:?}", error, delay, + ); + }, + ) + .await +} diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 2b8845f6f5..57d929d44d 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -97,6 +97,7 @@ rustls = { workspace = true } rustls-pemfile = { workspace = true } update-common.workspace = true omicron-workspace-hack.workspace = true +omicron-uuid-kinds.workspace = true [dev-dependencies] async-bb8-diesel.workspace = true diff --git a/nexus/db-model/src/downstairs.rs b/nexus/db-model/src/downstairs.rs new file mode 100644 index 0000000000..6aaeadc810 --- /dev/null +++ b/nexus/db-model/src/downstairs.rs @@ -0,0 +1,133 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use super::impl_enum_type; +use crate::schema::downstairs_client_stop_request_notification; +use crate::schema::downstairs_client_stopped_notification; +use crate::typed_uuid::DbTypedUuid; +use chrono::{DateTime, Utc}; +use omicron_common::api::internal; +use omicron_uuid_kinds::DownstairsKind; +use omicron_uuid_kinds::UpstairsKind; +use serde::{Deserialize, Serialize}; + +// Types for stop request notification + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "downstairs_client_stop_request_reason_type", schema = "public"))] + pub struct DownstairsClientStopRequestReasonEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = DownstairsClientStopRequestReasonEnum)] + pub enum DownstairsClientStopRequestReason; + + // Reason types + Replacing => b"replacing" + Disabled => b"disabled" + FailedReconcile => b"failed_reconcile" + IOError => b"io_error" + BadNegotiationOrder => b"bad_negotiation_order" + Incompatible => b"incompatible" + FailedLiveRepair => b"failed_live_repair" + TooManyOutstandingJobs => b"too_many_outstanding_jobs" + Deactivated => b"deactivated" +); + +impl From<internal::nexus::DownstairsClientStopRequestReason> + for DownstairsClientStopRequestReason +{ + fn from( + v: internal::nexus::DownstairsClientStopRequestReason, + ) -> DownstairsClientStopRequestReason { + match v { + internal::nexus::DownstairsClientStopRequestReason::Replacing => DownstairsClientStopRequestReason::Replacing, + internal::nexus::DownstairsClientStopRequestReason::Disabled => DownstairsClientStopRequestReason::Disabled, + internal::nexus::DownstairsClientStopRequestReason::FailedReconcile => DownstairsClientStopRequestReason::FailedReconcile, + internal::nexus::DownstairsClientStopRequestReason::IOError => DownstairsClientStopRequestReason::IOError, + internal::nexus::DownstairsClientStopRequestReason::BadNegotiationOrder => DownstairsClientStopRequestReason::BadNegotiationOrder, + internal::nexus::DownstairsClientStopRequestReason::Incompatible => DownstairsClientStopRequestReason::Incompatible, + internal::nexus::DownstairsClientStopRequestReason::FailedLiveRepair => DownstairsClientStopRequestReason::FailedLiveRepair, + internal::nexus::DownstairsClientStopRequestReason::TooManyOutstandingJobs => DownstairsClientStopRequestReason::TooManyOutstandingJobs, + internal::nexus::DownstairsClientStopRequestReason::Deactivated => DownstairsClientStopRequestReason::Deactivated, + } + } +} + +/// A record of when an Upstairs requested a Downstairs client task stop +#[derive(Queryable, Insertable, Debug, Clone, Selectable)] +#[diesel(table_name = downstairs_client_stop_request_notification)] +pub struct DownstairsClientStopRequestNotification { + // Importantly, this is client time, not Nexus' time that it received the + // notification. + pub time: DateTime<Utc>, + + // Which Upstairs sent this notification? + pub upstairs_id: DbTypedUuid<UpstairsKind>, + + // Which Downstairs client was requested to stop? 
+ pub downstairs_id: DbTypedUuid<DownstairsKind>, + + pub reason: DownstairsClientStopRequestReason, +} + +// Types for stopped notification + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "downstairs_client_stopped_reason_type", schema = "public"))] + pub struct DownstairsClientStoppedReasonEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = DownstairsClientStoppedReasonEnum)] + pub enum DownstairsClientStoppedReason; + + // Reason types + ConnectionTimeout => b"connection_timeout" + ConnectionFailed => b"connection_failed" + Timeout => b"timeout" + WriteFailed => b"write_failed" + ReadFailed => b"read_failed" + RequestedStop => b"requested_stop" + Finished => b"finished" + QueueClosed => b"queue_closed" + ReceiveTaskCancelled => b"receive_task_cancelled" +); + +impl From<internal::nexus::DownstairsClientStoppedReason> + for DownstairsClientStoppedReason +{ + fn from( + v: internal::nexus::DownstairsClientStoppedReason, + ) -> DownstairsClientStoppedReason { + match v { + internal::nexus::DownstairsClientStoppedReason::ConnectionTimeout => DownstairsClientStoppedReason::ConnectionTimeout, + internal::nexus::DownstairsClientStoppedReason::ConnectionFailed => DownstairsClientStoppedReason::ConnectionFailed, + internal::nexus::DownstairsClientStoppedReason::Timeout => DownstairsClientStoppedReason::Timeout, + internal::nexus::DownstairsClientStoppedReason::WriteFailed => DownstairsClientStoppedReason::WriteFailed, + internal::nexus::DownstairsClientStoppedReason::ReadFailed => DownstairsClientStoppedReason::ReadFailed, + internal::nexus::DownstairsClientStoppedReason::RequestedStop => DownstairsClientStoppedReason::RequestedStop, + internal::nexus::DownstairsClientStoppedReason::Finished => DownstairsClientStoppedReason::Finished, + internal::nexus::DownstairsClientStoppedReason::QueueClosed => DownstairsClientStoppedReason::QueueClosed, + internal::nexus::DownstairsClientStoppedReason::ReceiveTaskCancelled => DownstairsClientStoppedReason::ReceiveTaskCancelled, + } + } +} + +/// A record of when a Downstairs client task stopped +#[derive(Queryable, Insertable, Debug, Clone, Selectable)] +#[diesel(table_name = downstairs_client_stopped_notification)] +pub struct DownstairsClientStoppedNotification { + // Importantly, this is client time, not Nexus' time that it received the + // notification. + pub time: DateTime<Utc>, + + // Which Upstairs sent this notification? + pub upstairs_id: DbTypedUuid<UpstairsKind>, + + // Which Downstairs client was stopped? 
+ pub downstairs_id: DbTypedUuid<DownstairsKind>, + + pub reason: DownstairsClientStoppedReason, +} diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 5c89134b78..d2b676a3da 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -26,6 +26,7 @@ mod digest; mod disk; mod disk_state; mod dns; +mod downstairs; mod external_ip; mod generation; mod identity_provider; @@ -85,6 +86,7 @@ mod switch; mod tuf_repo; mod typed_uuid; mod unsigned; +mod upstairs_repair; mod user_builtin; mod utilization; mod virtual_provisioning_collection; @@ -127,6 +129,7 @@ pub use digest::*; pub use disk::*; pub use disk_state::*; pub use dns::*; +pub use downstairs::*; pub use external_ip::*; pub use generation::*; pub use identity_provider::*; @@ -176,6 +179,7 @@ pub use switch_interface::*; pub use switch_port::*; pub use tuf_repo::*; pub use typed_uuid::to_db_typed_uuid; +pub use upstairs_repair::*; pub use user_builtin::*; pub use utilization::*; pub use virtual_provisioning_collection::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index bcbf7fa88f..c31ce16775 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion; /// /// This should be updated whenever the schema is changed. For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(42, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(43, 0, 0); table! { disk (id) { @@ -1549,6 +1549,50 @@ table! { } } +table! { + upstairs_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) { + time -> Timestamptz, + + repair_id -> Uuid, + repair_type -> crate::UpstairsRepairTypeEnum, + upstairs_id -> Uuid, + session_id -> Uuid, + + region_id -> Uuid, + target_ip -> Inet, + target_port -> Int4, + + notification_type -> crate::UpstairsRepairNotificationTypeEnum, + } +} + +table! { + upstairs_repair_progress (repair_id, time, current_item, total_items) { + repair_id -> Uuid, + time -> Timestamptz, + current_item -> Int8, + total_items -> Int8, + } +} + +table! { + downstairs_client_stop_request_notification (time, upstairs_id, downstairs_id, reason) { + time -> Timestamptz, + upstairs_id -> Uuid, + downstairs_id -> Uuid, + reason -> crate::DownstairsClientStopRequestReasonEnum, + } +} + +table! { + downstairs_client_stopped_notification (time, upstairs_id, downstairs_id, reason) { + time -> Timestamptz, + upstairs_id -> Uuid, + downstairs_id -> Uuid, + reason -> crate::DownstairsClientStoppedReasonEnum, + } +} + table! { db_metadata (singleton) { singleton -> Bool, diff --git a/nexus/db-model/src/upstairs_repair.rs b/nexus/db-model/src/upstairs_repair.rs new file mode 100644 index 0000000000..311592f8e4 --- /dev/null +++ b/nexus/db-model/src/upstairs_repair.rs @@ -0,0 +1,154 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use super::impl_enum_type; +use crate::ipv6; +use crate::schema::upstairs_repair_notification; +use crate::schema::upstairs_repair_progress; +use crate::typed_uuid::DbTypedUuid; +use crate::SqlU16; +use chrono::{DateTime, Utc}; +use omicron_common::api::internal; +use omicron_uuid_kinds::DownstairsRegionKind; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsRepairKind; +use omicron_uuid_kinds::UpstairsSessionKind; +use serde::{Deserialize, Serialize}; +use std::net::SocketAddrV6; + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "upstairs_repair_notification_type", schema = "public"))] + pub struct UpstairsRepairNotificationTypeEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = UpstairsRepairNotificationTypeEnum)] + pub enum UpstairsRepairNotificationType; + + // Notification types + Started => b"started" + Succeeded => b"succeeded" + Failed => b"failed" +); + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "upstairs_repair_type", schema = "public"))] + pub struct UpstairsRepairTypeEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq, Eq, Hash)] + #[diesel(sql_type = UpstairsRepairTypeEnum)] + pub enum UpstairsRepairType; + + // Types of repair a Crucible Upstairs can do + Live => b"live" + Reconciliation => b"reconciliation" +); + +impl From<internal::nexus::UpstairsRepairType> for UpstairsRepairType { + fn from(v: internal::nexus::UpstairsRepairType) -> UpstairsRepairType { + match v { + internal::nexus::UpstairsRepairType::Live => { + UpstairsRepairType::Live + } + internal::nexus::UpstairsRepairType::Reconciliation => { + UpstairsRepairType::Reconciliation + } + } + } +} + +/// A record of Crucible Upstairs repair notifications: when a repair started, +/// succeeded, failed, etc. +/// +/// Each repair attempt is uniquely identified by the repair ID, upstairs ID, +/// session ID, and region ID. How those change tells Nexus about what is going +/// on: +/// +/// - if all IDs are the same for different requests, Nexus knows that the +/// client is retrying the notification. +/// +/// - if the upstairs ID, session ID, and region ID are all the same, but the +/// repair ID is different, then the same Upstairs is trying to repair that +/// region again. This could be due to a failed first attempt, or that +/// downstairs may have been kicked out again. +/// +/// - if the upstairs ID and region ID are the same, but the session ID and +/// repair ID are different, then a different session of the same Upstairs is +/// trying to repair that Downstairs. Session IDs change each time the +/// Upstairs is created, so it could have crashed, or it could have been +/// migrated and the destination Propolis' Upstairs is attempting to repair +/// the same region. +#[derive(Queryable, Insertable, Debug, Clone, Selectable)] +#[diesel(table_name = upstairs_repair_notification)] +pub struct UpstairsRepairNotification { + // Importantly, this is client time, not Nexus' time that it received the + // notification. + pub time: DateTime<Utc>, + + pub repair_id: DbTypedUuid<UpstairsRepairKind>, + + // There's a difference between live repair and reconciliation: the + // Upstairs can go through reconciliation without there being any error from + // a downstairs, or any region replacement request from Nexus. 
One example + // is if the rack power is pulled: when everything is powered back up again, + // reconciliation may be required, but this isn't the fault of any problem + // with a physical disk, or of any error that was returned. + // + // In contrast, any record of a live repair means that there was a problem: + // currently, either an Upstairs kicked out a Downstairs (or two) due to + // some error or because it lagged behind the others, or Nexus has + // instructed an Upstairs to perform a region replacement. + pub repair_type: UpstairsRepairType, + + pub upstairs_id: DbTypedUuid<UpstairsKind>, + pub session_id: DbTypedUuid<UpstairsSessionKind>, + + pub region_id: DbTypedUuid<DownstairsRegionKind>, + pub target_ip: ipv6::Ipv6Addr, + pub target_port: SqlU16, + + pub notification_type: UpstairsRepairNotificationType, +} + +impl UpstairsRepairNotification { + #[allow(clippy::too_many_arguments)] + pub fn new( + time: DateTime<Utc>, + repair_id: TypedUuid<UpstairsRepairKind>, + repair_type: UpstairsRepairType, + upstairs_id: TypedUuid<UpstairsKind>, + session_id: TypedUuid<UpstairsSessionKind>, + region_id: TypedUuid<DownstairsRegionKind>, + target_addr: SocketAddrV6, + notification_type: UpstairsRepairNotificationType, + ) -> Self { + Self { + time, + repair_id: repair_id.into(), + repair_type, + upstairs_id: upstairs_id.into(), + session_id: session_id.into(), + region_id: region_id.into(), + target_ip: target_addr.ip().into(), + target_port: target_addr.port().into(), + notification_type, + } + } + + pub fn address(&self) -> SocketAddrV6 { + SocketAddrV6::new(*self.target_ip, *self.target_port, 0, 0) + } +} + +/// A record of Crucible Upstairs repair progress. +#[derive(Queryable, Insertable, Debug, Clone, Selectable)] +#[diesel(table_name = upstairs_repair_progress)] +pub struct UpstairsRepairProgress { + pub repair_id: DbTypedUuid<UpstairsRepairKind>, + pub time: DateTime<Utc>, + pub current_item: i64, + pub total_items: i64, +} diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 374ef2cf73..a9646b9ef6 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -6,12 +6,18 @@ use super::DataStore; use crate::db; +use crate::db::datastore::OpContext; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::Dataset; +use crate::db::model::DownstairsClientStopRequestNotification; +use crate::db::model::DownstairsClientStoppedNotification; use crate::db::model::Region; use crate::db::model::RegionSnapshot; +use crate::db::model::UpstairsRepairNotification; +use crate::db::model::UpstairsRepairNotificationType; +use crate::db::model::UpstairsRepairProgress; use crate::db::model::Volume; use crate::db::queries::volume::DecreaseCrucibleResourceCountAndSoftDeleteVolume; use crate::transaction_retry::OptionalError; @@ -25,6 +31,13 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::ResourceType; +use omicron_common::api::internal::nexus::DownstairsClientStopRequest; +use omicron_common::api::internal::nexus::DownstairsClientStopped; +use omicron_common::api::internal::nexus::RepairProgress; +use omicron_uuid_kinds::DownstairsKind; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsRepairKind; use serde::Deserialize; use serde::Deserializer; use serde::Serialize; @@ -809,6 +822,242 @@ impl DataStore { public_error_from_diesel(e, ErrorHandler::Server) }) } + + // An Upstairs is created as part of 
a Volume hierarchy if the Volume + // Construction Request includes a "Region" variant. This may be at any + // layer of the Volume, and some notifications will come from an Upstairs + // instead of the top level of the Volume. The following functions have an + // Upstairs ID instead of a Volume ID for this reason. + + /// Record when an Upstairs notifies us about a repair. If that record + /// (uniquely identified by the four IDs passed in plus the notification + /// type) exists already, do nothing. + pub async fn upstairs_repair_notification( + &self, + opctx: &OpContext, + record: UpstairsRepairNotification, + ) -> Result<(), Error> { + use db::schema::upstairs_repair_notification::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + let err = OptionalError::new(); + + self.transaction_retry_wrapper("upstairs_repair_notification") + .transaction(&conn, |conn| { + let record = record.clone(); + let err = err.clone(); + + async move { + // Return 409 if a repair ID does not match types + let mismatched_record_type_count: usize = + dsl::upstairs_repair_notification + .filter(dsl::repair_id.eq(record.repair_id)) + .filter(dsl::repair_type.ne(record.repair_type)) + .execute_async(&conn) + .await?; + + if mismatched_record_type_count > 0 { + return Err(err.bail(Error::conflict(&format!( + "existing repair type for id {} does not match {:?}!", + record.repair_id, + record.repair_type, + )))); + } + + match &record.notification_type { + UpstairsRepairNotificationType::Started => { + // Proceed - the insertion can succeed or fail below + // based on the table's primary key + } + + UpstairsRepairNotificationType::Succeeded + | UpstairsRepairNotificationType::Failed => { + // However, Nexus must accept only one "finished" + // status - an Upstairs cannot change this and must + // instead perform another repair with a new repair + // ID. 
+ let maybe_existing_finish_record: Option< + UpstairsRepairNotification, + > = dsl::upstairs_repair_notification + .filter(dsl::repair_id.eq(record.repair_id)) + .filter(dsl::upstairs_id.eq(record.upstairs_id)) + .filter(dsl::session_id.eq(record.session_id)) + .filter(dsl::region_id.eq(record.region_id)) + .filter(dsl::notification_type.eq_any(vec![ + UpstairsRepairNotificationType::Succeeded, + UpstairsRepairNotificationType::Failed, + ])) + .get_result_async(&conn) + .await + .optional()?; + + if let Some(existing_finish_record) = + maybe_existing_finish_record + { + if existing_finish_record.notification_type + != record.notification_type + { + return Err(err.bail(Error::conflict( + "existing finish record does not match", + ))); + } else { + // inserting the same record, bypass + return Ok(()); + } + } + } + } + + diesel::insert_into(dsl::upstairs_repair_notification) + .values(record) + .on_conflict(( + dsl::repair_id, + dsl::upstairs_id, + dsl::session_id, + dsl::region_id, + dsl::notification_type, + )) + .do_nothing() + .execute_async(&conn) + .await?; + + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } + + /// Record Upstairs repair progress + pub async fn upstairs_repair_progress( + &self, + opctx: &OpContext, + upstairs_id: TypedUuid<UpstairsKind>, + repair_id: TypedUuid<UpstairsRepairKind>, + repair_progress: RepairProgress, + ) -> Result<(), Error> { + use db::schema::upstairs_repair_notification::dsl as notification_dsl; + use db::schema::upstairs_repair_progress::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + let err = OptionalError::new(); + + self.transaction_retry_wrapper("upstairs_repair_progress") + .transaction(&conn, |conn| { + let repair_progress = repair_progress.clone(); + let err = err.clone(); + + async move { + // Check that there is a repair id for the upstairs id + let matching_repair: Option<UpstairsRepairNotification> = + notification_dsl::upstairs_repair_notification + .filter(notification_dsl::repair_id.eq(nexus_db_model::to_db_typed_uuid(repair_id))) + .filter(notification_dsl::upstairs_id.eq(nexus_db_model::to_db_typed_uuid(upstairs_id))) + .filter(notification_dsl::notification_type.eq(UpstairsRepairNotificationType::Started)) + .get_result_async(&conn) + .await + .optional()?; + + if matching_repair.is_none() { + return Err(err.bail(Error::non_resourcetype_not_found(&format!( + "upstairs {upstairs_id} repair {repair_id} not found" + )))); + } + + diesel::insert_into(dsl::upstairs_repair_progress) + .values(UpstairsRepairProgress { + repair_id: repair_id.into(), + time: repair_progress.time, + current_item: repair_progress.current_item, + total_items: repair_progress.total_items, + }) + .execute_async(&conn) + .await?; + + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } + + /// Record when a Downstairs client is requested to stop, and why + pub async fn downstairs_client_stop_request_notification( + &self, + opctx: &OpContext, + upstairs_id: TypedUuid<UpstairsKind>, + downstairs_id: TypedUuid<DownstairsKind>, + downstairs_client_stop_request: DownstairsClientStopRequest, + ) -> Result<(), Error> { + use db::schema::downstairs_client_stop_request_notification::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + + diesel::insert_into(dsl::downstairs_client_stop_request_notification) + .values(DownstairsClientStopRequestNotification { + time: downstairs_client_stop_request.time, + 
upstairs_id: upstairs_id.into(), + downstairs_id: downstairs_id.into(), + reason: downstairs_client_stop_request.reason.into(), + }) + .on_conflict(( + dsl::time, + dsl::upstairs_id, + dsl::downstairs_id, + dsl::reason, + )) + .do_nothing() + .execute_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } + + /// Record when a Downstairs client is stopped, and why + pub async fn downstairs_client_stopped_notification( + &self, + opctx: &OpContext, + upstairs_id: TypedUuid<UpstairsKind>, + downstairs_id: TypedUuid<DownstairsKind>, + downstairs_client_stopped: DownstairsClientStopped, + ) -> Result<(), Error> { + use db::schema::downstairs_client_stopped_notification::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + + diesel::insert_into(dsl::downstairs_client_stopped_notification) + .values(DownstairsClientStoppedNotification { + time: downstairs_client_stopped.time, + upstairs_id: upstairs_id.into(), + downstairs_id: downstairs_id.into(), + reason: downstairs_client_stopped.reason.into(), + }) + .on_conflict(( + dsl::time, + dsl::upstairs_id, + dsl::downstairs_id, + dsl::reason, + )) + .do_nothing() + .execute_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } } #[derive(Default, Clone, Debug, Serialize, Deserialize)] diff --git a/nexus/db-queries/src/db/pool_connection.rs b/nexus/db-queries/src/db/pool_connection.rs index f419ba6852..0331a3a103 100644 --- a/nexus/db-queries/src/db/pool_connection.rs +++ b/nexus/db-queries/src/db/pool_connection.rs @@ -44,6 +44,8 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[ "caboose_which", "dataset_kind", "dns_group", + "downstairs_client_stop_request_reason_type", + "downstairs_client_stopped_reason_type", "hw_power_state", "hw_rot_slot", "identity_type", @@ -69,6 +71,8 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[ "switch_link_fec", "switch_link_speed", "switch_port_geometry", + "upstairs_repair_notification_type", + "upstairs_repair_type", "user_provision_type", "vpc_firewall_rule_action", "vpc_firewall_rule_direction", diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index eb5f83470f..741b5b8b6d 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -4,7 +4,6 @@ //! Routines that manage instance-related networking state. 
-use crate::app::sagas::retry_until_known_result; use ipnetwork::IpNetwork; use ipnetwork::Ipv6Network; use nexus_db_model::ExternalIp; @@ -24,6 +23,7 @@ use omicron_common::api::external::Ipv6Net; use omicron_common::api::internal::nexus; use omicron_common::api::internal::shared::NetworkInterface; use omicron_common::api::internal::shared::SwitchLocation; +use omicron_common::retry_until_known_result; use sled_agent_client::types::DeleteVirtualNetworkInterfaceHost; use sled_agent_client::types::SetVirtualNetworkInterfaceHost; use std::collections::HashSet; diff --git a/nexus/src/app/sagas/common_storage.rs b/nexus/src/app/sagas/common_storage.rs index a7350d91fd..3b590f6205 100644 --- a/nexus/src/app/sagas/common_storage.rs +++ b/nexus/src/app/sagas/common_storage.rs @@ -6,7 +6,6 @@ use super::*; -use crate::app::sagas::retry_until_known_result; use crate::Nexus; use anyhow::anyhow; use crucible_agent_client::{ @@ -22,6 +21,7 @@ use nexus_db_queries::db::identity::Asset; use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external::Error; use omicron_common::backoff::{self, BackoffError}; +use omicron_common::retry_until_known_result; use slog::Logger; use std::net::SocketAddrV6; diff --git a/nexus/src/app/sagas/loopback_address_create.rs b/nexus/src/app/sagas/loopback_address_create.rs index a5d89f202c..c32a5f387d 100644 --- a/nexus/src/app/sagas/loopback_address_create.rs +++ b/nexus/src/app/sagas/loopback_address_create.rs @@ -3,7 +3,6 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::{NexusActionContext, NEXUS_DPD_TAG}; -use crate::app::sagas::retry_until_known_result; use crate::app::sagas::{ declare_saga_actions, ActionRegistry, NexusSaga, SagaInitError, }; @@ -13,6 +12,7 @@ use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::db::model::LoopbackAddress; use omicron_common::api::internal::shared::SwitchLocation; +use omicron_common::retry_until_known_result; use serde::{Deserialize, Serialize}; use std::sync::Arc; use steno::ActionError; diff --git a/nexus/src/app/sagas/loopback_address_delete.rs b/nexus/src/app/sagas/loopback_address_delete.rs index a030178d27..822a360acf 100644 --- a/nexus/src/app/sagas/loopback_address_delete.rs +++ b/nexus/src/app/sagas/loopback_address_delete.rs @@ -3,7 +3,6 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::NexusActionContext; -use crate::app::sagas::retry_until_known_result; use crate::app::sagas::{ declare_saga_actions, ActionRegistry, NexusSaga, SagaInitError, }; @@ -14,6 +13,7 @@ use nexus_db_queries::authz; use nexus_db_queries::db::model::{LoopbackAddress, Name}; use nexus_types::identity::Asset; use omicron_common::api::external::{IpNet, NameOrId}; +use omicron_common::retry_until_known_result; use serde::{Deserialize, Serialize}; use std::sync::Arc; use steno::ActionError; diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index aef8442090..01b01c4571 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -328,88 +328,6 @@ pub(crate) use __emit_action; pub(crate) use __stringify_ident; pub(crate) use declare_saga_actions; -use futures::Future; - -/// Retry a progenitor client operation until a known result is returned. -/// -/// Saga execution relies on the outcome of an external call being known: since -/// they are idempotent, reissue the external call until a known result comes -/// back. Retry if a communication error is seen, or if another retryable error -/// is seen. 
-/// -/// Note that retrying is only valid if the call itself is idempotent. -pub(crate) async fn retry_until_known_result<F, Fut, T, E>( - log: &slog::Logger, - mut f: F, -) -> Result<T, progenitor_client::Error<E>> -where - F: FnMut() -> Fut, - Fut: Future<Output = Result<T, progenitor_client::Error<E>>>, - E: std::fmt::Debug, -{ - use omicron_common::backoff; - - backoff::retry_notify( - backoff::retry_policy_internal_service(), - move || { - let fut = f(); - async move { - match fut.await { - Err(progenitor_client::Error::CommunicationError(e)) => { - warn!( - log, - "saw transient communication error {}, retrying...", - e, - ); - - Err(backoff::BackoffError::transient( - progenitor_client::Error::CommunicationError(e), - )) - } - - Err(progenitor_client::Error::ErrorResponse( - response_value, - )) => { - match response_value.status() { - // Retry on 503 or 429 - http::StatusCode::SERVICE_UNAVAILABLE - | http::StatusCode::TOO_MANY_REQUESTS => { - Err(backoff::BackoffError::transient( - progenitor_client::Error::ErrorResponse( - response_value, - ), - )) - } - - // Anything else is a permanent error - _ => Err(backoff::BackoffError::Permanent( - progenitor_client::Error::ErrorResponse( - response_value, - ), - )), - } - } - - Err(e) => { - warn!(log, "saw permanent error {}, aborting", e,); - - Err(backoff::BackoffError::Permanent(e)) - } - - Ok(v) => Ok(v), - } - } - }, - |error: progenitor_client::Error<_>, delay| { - warn!( - log, - "failed external call ({:?}), will retry in {:?}", error, delay, - ); - }, - ) - .await -} - /// Reliable persistent workflows can request that sagas be run as part of their /// activation by sending a SagaRequest through a supplied channel to Nexus. pub enum SagaRequest { diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index e017ab377b..f1d1a2bd02 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -99,7 +99,6 @@ use super::{ ACTION_GENERATE_ID, }; use crate::app::sagas::declare_saga_actions; -use crate::app::sagas::retry_until_known_result; use crate::app::{authn, authz, db}; use crate::external_api::params; use anyhow::anyhow; @@ -109,6 +108,7 @@ use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external; use omicron_common::api::external::Error; +use omicron_common::retry_until_known_result; use rand::{rngs::StdRng, RngCore, SeedableRng}; use serde::Deserialize; use serde::Serialize; diff --git a/nexus/src/app/sagas/switch_port_settings_apply.rs b/nexus/src/app/sagas/switch_port_settings_apply.rs index 9e2331f416..44f2f77ea1 100644 --- a/nexus/src/app/sagas/switch_port_settings_apply.rs +++ b/nexus/src/app/sagas/switch_port_settings_apply.rs @@ -3,7 +3,6 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
use super::{NexusActionContext, NEXUS_DPD_TAG}; -use crate::app::sagas::retry_until_known_result; use crate::app::sagas::switch_port_settings_common::{ api_to_dpd_port_settings, ensure_switch_port_bgp_settings, ensure_switch_port_uplink, select_dendrite_client, select_mg_client, @@ -24,6 +23,7 @@ use nexus_db_queries::db::datastore::UpdatePrecondition; use nexus_db_queries::{authn, db}; use omicron_common::api::external::{self, NameOrId}; use omicron_common::api::internal::shared::SwitchLocation; +use omicron_common::retry_until_known_result; use serde::{Deserialize, Serialize}; use std::net::IpAddr; use std::str::FromStr; diff --git a/nexus/src/app/sagas/switch_port_settings_clear.rs b/nexus/src/app/sagas/switch_port_settings_clear.rs index 15290dd75b..2e35530ef1 100644 --- a/nexus/src/app/sagas/switch_port_settings_clear.rs +++ b/nexus/src/app/sagas/switch_port_settings_clear.rs @@ -3,7 +3,6 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::{NexusActionContext, NEXUS_DPD_TAG}; -use crate::app::sagas::retry_until_known_result; use crate::app::sagas::switch_port_settings_common::{ api_to_dpd_port_settings, apply_bootstore_update, bootstore_update, ensure_switch_port_bgp_settings, ensure_switch_port_uplink, @@ -23,6 +22,7 @@ use nexus_db_model::NETWORK_KEY; use nexus_db_queries::authn; use nexus_db_queries::db::datastore::UpdatePrecondition; use omicron_common::api::external::{self, NameOrId, SwitchLocation}; +use omicron_common::retry_until_known_result; use serde::{Deserialize, Serialize}; use std::net::IpAddr; use std::str::FromStr; diff --git a/nexus/src/app/volume.rs b/nexus/src/app/volume.rs index c36c4524c1..8cfffdb686 100644 --- a/nexus/src/app/volume.rs +++ b/nexus/src/app/volume.rs @@ -5,9 +5,20 @@ //! Volumes use crate::app::sagas; +use nexus_db_model::UpstairsRepairNotification; +use nexus_db_model::UpstairsRepairNotificationType; use nexus_db_queries::authn; use nexus_db_queries::context::OpContext; use omicron_common::api::external::DeleteResult; +use omicron_common::api::internal::nexus::DownstairsClientStopRequest; +use omicron_common::api::internal::nexus::DownstairsClientStopped; +use omicron_common::api::internal::nexus::RepairFinishInfo; +use omicron_common::api::internal::nexus::RepairProgress; +use omicron_common::api::internal::nexus::RepairStartInfo; +use omicron_uuid_kinds::DownstairsKind; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsRepairKind; use std::sync::Arc; use uuid::Uuid; @@ -30,4 +41,155 @@ impl super::Nexus { Ok(()) } + + /// An Upstairs is telling us when a repair is starting. + pub(crate) async fn upstairs_repair_start( + self: &Arc<Self>, + opctx: &OpContext, + upstairs_id: TypedUuid<UpstairsKind>, + repair_start_info: RepairStartInfo, + ) -> DeleteResult { + info!( + self.log, + "received upstairs_repair_start from upstairs {upstairs_id}: {:?}", + repair_start_info, + ); + + for repaired_downstairs in repair_start_info.repairs { + self.db_datastore + .upstairs_repair_notification( + opctx, + UpstairsRepairNotification::new( + repair_start_info.time, + repair_start_info.repair_id, + repair_start_info.repair_type.into(), + upstairs_id, + repair_start_info.session_id, + repaired_downstairs.region_uuid, + repaired_downstairs.target_addr, + UpstairsRepairNotificationType::Started, + ), + ) + .await?; + } + + Ok(()) + } + + /// An Upstairs is telling us when a repair is finished, and the result. 
+ pub(crate) async fn upstairs_repair_finish( + self: &Arc<Self>, + opctx: &OpContext, + upstairs_id: TypedUuid<UpstairsKind>, + repair_finish_info: RepairFinishInfo, + ) -> DeleteResult { + info!( + self.log, + "received upstairs_repair_finish from upstairs {upstairs_id}: {:?}", + repair_finish_info, + ); + + for repaired_downstairs in repair_finish_info.repairs { + self.db_datastore + .upstairs_repair_notification( + opctx, + UpstairsRepairNotification::new( + repair_finish_info.time, + repair_finish_info.repair_id, + repair_finish_info.repair_type.into(), + upstairs_id, + repair_finish_info.session_id, + repaired_downstairs.region_uuid, + repaired_downstairs.target_addr, + if repair_finish_info.aborted { + UpstairsRepairNotificationType::Failed + } else { + UpstairsRepairNotificationType::Succeeded + }, + ), + ) + .await?; + + if !repair_finish_info.aborted { + // TODO-followup if there's an active region replacement + // occurring, a successfully completed live repair can trigger a + // saga to destroy the original region. + } + } + + Ok(()) + } + + /// An Upstairs is updating us with repair progress + pub(crate) async fn upstairs_repair_progress( + self: &Arc<Self>, + opctx: &OpContext, + upstairs_id: TypedUuid<UpstairsKind>, + repair_id: TypedUuid<UpstairsRepairKind>, + repair_progress: RepairProgress, + ) -> DeleteResult { + info!( + self.log, + "received upstairs_repair_progress from upstairs {upstairs_id} for repair {repair_id}: {:?}", + repair_progress, + ); + + self.db_datastore + .upstairs_repair_progress( + opctx, + upstairs_id, + repair_id, + repair_progress, + ) + .await + } + + /// An Upstairs is telling us that a Downstairs client task was requested to + /// stop + pub(crate) async fn downstairs_client_stop_request_notification( + self: &Arc<Self>, + opctx: &OpContext, + upstairs_id: TypedUuid<UpstairsKind>, + downstairs_id: TypedUuid<DownstairsKind>, + downstairs_client_stop_request: DownstairsClientStopRequest, + ) -> DeleteResult { + info!( + self.log, + "received downstairs_client_stop_request_notification from upstairs {upstairs_id} for downstairs {downstairs_id}: {:?}", + downstairs_client_stop_request, + ); + + self.db_datastore + .downstairs_client_stop_request_notification( + opctx, + upstairs_id, + downstairs_id, + downstairs_client_stop_request, + ) + .await + } + + /// An Upstairs is telling us that a Downstairs client task was stopped + pub(crate) async fn downstairs_client_stopped_notification( + self: &Arc<Self>, + opctx: &OpContext, + upstairs_id: TypedUuid<UpstairsKind>, + downstairs_id: TypedUuid<DownstairsKind>, + downstairs_client_stopped: DownstairsClientStopped, + ) -> DeleteResult { + info!( + self.log, + "received downstairs_client_stopped_notification from upstairs {upstairs_id} for downstairs {downstairs_id}: {:?}", + downstairs_client_stopped, + ); + + self.db_datastore + .downstairs_client_stopped_notification( + opctx, + upstairs_id, + downstairs_id, + downstairs_client_stopped, + ) + .await + } } diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 9871c38117..0676ace70c 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -45,9 +45,18 @@ use omicron_common::api::external::http_pagination::ScanById; use omicron_common::api::external::http_pagination::ScanParams; use omicron_common::api::external::Error; use omicron_common::api::internal::nexus::DiskRuntimeState; +use omicron_common::api::internal::nexus::DownstairsClientStopRequest; +use omicron_common::api::internal::nexus::DownstairsClientStopped; use omicron_common::api::internal::nexus::ProducerEndpoint; +use 
omicron_common::api::internal::nexus::RepairFinishInfo; +use omicron_common::api::internal::nexus::RepairProgress; +use omicron_common::api::internal::nexus::RepairStartInfo; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_common::update::ArtifactId; +use omicron_uuid_kinds::DownstairsKind; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsRepairKind; use oximeter::types::ProducerResults; use oximeter_producer::{collect, ProducerIdPathParams}; use schemars::JsonSchema; @@ -78,6 +87,12 @@ pub(crate) fn internal_api() -> NexusApiDescription { api.register(cpapi_metrics_collect)?; api.register(cpapi_artifact_download)?; + api.register(cpapi_upstairs_repair_start)?; + api.register(cpapi_upstairs_repair_finish)?; + api.register(cpapi_upstairs_repair_progress)?; + api.register(cpapi_downstairs_client_stop_request)?; + api.register(cpapi_downstairs_client_stopped)?; + api.register(saga_list)?; api.register(saga_view)?; @@ -513,6 +528,171 @@ async fn cpapi_artifact_download( Ok(HttpResponseOk(Body::from(body).into())) } +/// Path parameters for Upstairs requests (internal API) +#[derive(Deserialize, JsonSchema)] +struct UpstairsPathParam { + upstairs_id: TypedUuid<UpstairsKind>, +} + +/// An Upstairs will notify this endpoint when a repair starts +#[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/repair-start", + }] +async fn cpapi_upstairs_repair_start( + rqctx: RequestContext<Arc<ServerContext>>, + path_params: Path<UpstairsPathParam>, + repair_start_info: TypedBody<RepairStartInfo>, +) -> Result<HttpResponseUpdatedNoContent, HttpError> { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .upstairs_repair_start( + &opctx, + path.upstairs_id, + repair_start_info.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// An Upstairs will notify this endpoint when a repair finishes. 
+#[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/repair-finish", + }] +async fn cpapi_upstairs_repair_finish( + rqctx: RequestContext<Arc<ServerContext>>, + path_params: Path<UpstairsPathParam>, + repair_finish_info: TypedBody<RepairFinishInfo>, +) -> Result<HttpResponseUpdatedNoContent, HttpError> { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .upstairs_repair_finish( + &opctx, + path.upstairs_id, + repair_finish_info.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// Path parameters for Upstairs requests (internal API) +#[derive(Deserialize, JsonSchema)] +struct UpstairsRepairPathParam { + upstairs_id: TypedUuid<UpstairsKind>, + repair_id: TypedUuid<UpstairsRepairKind>, +} + +/// An Upstairs will update this endpoint with the progress of a repair +#[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress", + }] +async fn cpapi_upstairs_repair_progress( + rqctx: RequestContext<Arc<ServerContext>>, + path_params: Path<UpstairsRepairPathParam>, + repair_progress: TypedBody<RepairProgress>, +) -> Result<HttpResponseUpdatedNoContent, HttpError> { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .upstairs_repair_progress( + &opctx, + path.upstairs_id, + path.repair_id, + repair_progress.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// Path parameters for Downstairs requests (internal API) +#[derive(Deserialize, JsonSchema)] +struct UpstairsDownstairsPathParam { + upstairs_id: TypedUuid<UpstairsKind>, + downstairs_id: TypedUuid<DownstairsKind>, +} + +/// An Upstairs will update this endpoint if a Downstairs client task is +/// requested to stop +#[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stop-request", + }] +async fn cpapi_downstairs_client_stop_request( + rqctx: RequestContext<Arc<ServerContext>>, + path_params: Path<UpstairsDownstairsPathParam>, + downstairs_client_stop_request: TypedBody<DownstairsClientStopRequest>, +) -> Result<HttpResponseUpdatedNoContent, HttpError> { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .downstairs_client_stop_request_notification( + &opctx, + path.upstairs_id, + path.downstairs_id, + downstairs_client_stop_request.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// An Upstairs will update this endpoint if a Downstairs client task stops for +/// any reason (not just after being requested to) +#[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped", + }] +async fn cpapi_downstairs_client_stopped( + rqctx: RequestContext<Arc<ServerContext>>, + path_params: Path<UpstairsDownstairsPathParam>, + downstairs_client_stopped: TypedBody<DownstairsClientStopped>, +) -> Result<HttpResponseUpdatedNoContent, HttpError> { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .downstairs_client_stopped_notification( + &opctx, + path.upstairs_id, + path.downstairs_id, + downstairs_client_stopped.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, 
handler).await +} + // Sagas /// List sagas diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 34f037ee8c..289446fe85 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -5,6 +5,7 @@ //! Tests that Nexus properly manages and cleans up Crucible resources //! associated with Volumes +use chrono::Utc; use dropshot::test_util::ClientTestContext; use http::method::Method; use http::StatusCode; @@ -24,6 +25,13 @@ use omicron_common::api::external::ByteCount; use omicron_common::api::external::Disk; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Name; +use omicron_common::api::internal; +use omicron_uuid_kinds::DownstairsKind; +use omicron_uuid_kinds::DownstairsRegionKind; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsRepairKind; +use omicron_uuid_kinds::UpstairsSessionKind; use rand::prelude::SliceRandom; use rand::{rngs::StdRng, SeedableRng}; use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; @@ -2552,3 +2560,763 @@ async fn test_volume_hard_delete_idempotent( datastore.volume_hard_delete(volume_id).await.unwrap(); datastore.volume_hard_delete(volume_id).await.unwrap(); } + +// internal API related tests + +/// Test that an Upstairs can reissue live repair notifications +#[nexus_test] +async fn test_upstairs_repair_notify_idempotent( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let session_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + let region_id: TypedUuid = TypedUuid::new_v4(); + + // Send the same start request. + let notify_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + + let request = internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345".parse().unwrap(), + }], + }; + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(request.clone()), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // Send the same finish request. + let notify_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); + + let request = internal::nexus::RepairFinishInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345".parse().unwrap(), + }], + aborted: false, + }; + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(request.clone()), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} + +/// Test that an Upstairs cannot issue different finish statuses for the same +/// repair. 
+#[nexus_test] +async fn test_upstairs_repair_notify_different_finish_status( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let session_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + let region_id: TypedUuid = TypedUuid::new_v4(); + + let notify_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: false, // live repair was ok + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: true, // live repair failed? + }), + StatusCode::CONFLICT, + ) + .await + .unwrap_err(); +} + +/// Test that the same Upstairs can rerun a repair again. +#[nexus_test] +async fn test_upstairs_repair_same_upstairs_retry( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let session_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + let region_id: TypedUuid = TypedUuid::new_v4(); + + // Simulate one failed repair + + let notify_start_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let notify_finish_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); + + int_client + .make_request( + Method::POST, + ¬ify_start_url, + Some(internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_finish_url, + Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: true, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // Simulate the same Upstairs restarting the repair, which passes this time + + let repair_id: TypedUuid = TypedUuid::new_v4(); + + int_client + .make_request( + Method::POST, + ¬ify_start_url, + Some(internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_finish_url, + 
+
+/// Test that the same Upstairs can rerun a failed repair.
+#[nexus_test]
+async fn test_upstairs_repair_same_upstairs_retry(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Simulate one failed repair
+
+    let notify_start_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-start");
+    let notify_finish_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: true,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Simulate the same Upstairs restarting the repair, which passes this time
+
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
+
+/// Test that a different Upstairs session can rerun a failed repair.
+#[nexus_test]
+async fn test_upstairs_repair_different_upstairs_retry(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Simulate one failed repair by one Upstairs
+
+    let notify_start_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-start");
+    let notify_finish_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: true,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Simulate a different Upstairs session restarting the repair, which
+    // passes this time
+
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
+
+/// Test that a different Upstairs session can rerun an interrupted repair
+#[nexus_test]
+async fn test_upstairs_repair_different_upstairs_retry_interrupted(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Simulate one failed repair by one Upstairs, which was interrupted (which
+    // leads to no finish message).
+
+    let notify_start_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-start");
+    let notify_finish_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Simulate a different Upstairs session restarting the interrupted repair,
+    // which passes this time
+
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
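+
+// Note: an interrupted repair never produces a repair-finish message, and
+// Nexus does not require one before it will accept a repair-start from a new
+// session.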
+
+/// Test that the same repair ID cannot be used for different repair types
+#[nexus_test]
+async fn test_upstairs_repair_repair_id_and_type_conflict(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    let notify_start_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-start");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type:
+                    internal::nexus::UpstairsRepairType::Reconciliation,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::CONFLICT,
+        )
+        .await
+        .unwrap_err();
+}
+
+/// Test that an Upstairs can submit progress for a repair
+#[nexus_test]
+async fn test_upstairs_repair_submit_progress(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // A repair must be started before progress can be submitted
+
+    let notify_start_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-start");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    let progress_url = format!(
+        "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress"
+    );
+
+    for i in 0..100 {
+        int_client
+            .make_request(
+                Method::POST,
+                &progress_url,
+                Some(internal::nexus::RepairProgress {
+                    time: Utc::now(),
+                    current_item: i,
+                    total_items: 100,
+                }),
+                StatusCode::NO_CONTENT,
+            )
+            .await
+            .unwrap();
+    }
+}
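+
+// Progress notifications accumulate as rows in the new
+// upstairs_repair_progress table, keyed by (repair_id, time, current_item,
+// total_items); the row with the most recent time reflects the repair's
+// current progress.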
+
+/// Test that an Upstairs can't submit progress unless a repair was started
+#[nexus_test]
+async fn test_upstairs_repair_reject_submit_progress_when_no_repair(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+
+    let progress_url = format!(
+        "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress"
+    );
+
+    int_client
+        .make_request(
+            Method::POST,
+            &progress_url,
+            Some(internal::nexus::RepairProgress {
+                time: Utc::now(),
+                current_item: 10,
+                total_items: 100,
+            }),
+            StatusCode::NOT_FOUND,
+        )
+        .await
+        .unwrap_err();
+}
+
+/// Test that an Upstairs can notify Nexus when a Downstairs client task is
+/// requested to stop
+#[nexus_test]
+async fn test_upstairs_notify_downstairs_client_stop_request(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let downstairs_id: TypedUuid<DownstairsKind> = TypedUuid::new_v4();
+
+    let stop_request_url = format!(
+        "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stop-request"
+    );
+
+    // Make sure an Upstairs can re-send the notification
+
+    let request = internal::nexus::DownstairsClientStopRequest {
+        time: Utc::now(),
+        reason:
+            internal::nexus::DownstairsClientStopRequestReason::TooManyOutstandingJobs,
+    };
+
+    int_client
+        .make_request(
+            Method::POST,
+            &stop_request_url,
+            Some(request.clone()),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &stop_request_url,
+            Some(request),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // The client can be requested to stop for the same reason at a different
+    // time
+
+    let request = internal::nexus::DownstairsClientStopRequest {
+        time: Utc::now(),
+        reason:
+            internal::nexus::DownstairsClientStopRequestReason::TooManyOutstandingJobs,
+    };
+
+    int_client
+        .make_request(
+            Method::POST,
+            &stop_request_url,
+            Some(request),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // The client can also be requested to stop for a different reason
+
+    let request = internal::nexus::DownstairsClientStopRequest {
+        time: Utc::now(),
+        reason: internal::nexus::DownstairsClientStopRequestReason::IOError,
+    };
+
+    int_client
+        .make_request(
+            Method::POST,
+            &stop_request_url,
+            Some(request),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
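+
+// Stop requests are keyed by (time, upstairs ID, downstairs ID, reason) in the
+// new downstairs_client_stop_request_notification table, so the same reason
+// recorded at a later time is a distinct row rather than a conflict.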
+
+/// Test that an Upstairs can notify Nexus when a Downstairs client task stops
+#[nexus_test]
+async fn test_upstairs_notify_downstairs_client_stops(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let downstairs_id: TypedUuid<DownstairsKind> = TypedUuid::new_v4();
+
+    let stopped_url = format!(
+        "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped"
+    );
+
+    // Make sure an Upstairs can re-send the notification
+
+    let request = internal::nexus::DownstairsClientStopped {
+        time: Utc::now(),
+        reason: internal::nexus::DownstairsClientStoppedReason::ReadFailed,
+    };
+
+    int_client
+        .make_request(
+            Method::POST,
+            &stopped_url,
+            Some(request.clone()),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &stopped_url,
+            Some(request),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // The client can stop for the same reason at a different time
+
+    let request = internal::nexus::DownstairsClientStopped {
+        time: Utc::now(),
+        reason: internal::nexus::DownstairsClientStoppedReason::ReadFailed,
+    };
+
+    int_client
+        .make_request(
+            Method::POST,
+            &stopped_url,
+            Some(request),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // The client can also stop for a different reason
+
+    let request = internal::nexus::DownstairsClientStopped {
+        time: Utc::now(),
+        reason: internal::nexus::DownstairsClientStoppedReason::Timeout,
+    };
+
+    int_client
+        .make_request(
+            Method::POST,
+            &stopped_url,
+            Some(request),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json
index 09b5f1e5ab..679597c453 100644
--- a/openapi/nexus-internal.json
+++ b/openapi/nexus-internal.json
@@ -125,6 +125,217 @@
         }
       }
     },
+    "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stop-request": {
+      "post": {
+        "summary": "An Upstairs will update this endpoint if a Downstairs client task is",
+        "description": "requested to stop",
+        "operationId": "cpapi_downstairs_client_stop_request",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "downstairs_id",
+            "required": true,
+            "schema": {
+              "$ref": "#/components/schemas/TypedUuidForDownstairsKind"
+            }
+          },
+          {
+            "in": "path",
+            "name": "upstairs_id",
+            "required": true,
+            "schema": {
+              "$ref": "#/components/schemas/TypedUuidForUpstairsKind"
+            }
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/DownstairsClientStopRequest"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "204": {
+            "description": "resource updated"
+          },
+          "4XX": {
+            "$ref": "#/components/responses/Error"
+          },
+          "5XX": {
+            "$ref": "#/components/responses/Error"
+          }
+        }
+      }
+    },
+    "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped": {
+      "post": {
+        "summary": "An Upstairs will update this endpoint if a Downstairs client task stops for",
+        "description": "any reason (not just after being requested to)",
+        "operationId": "cpapi_downstairs_client_stopped",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "downstairs_id",
+            "required": true,
+            "schema": {
+              "$ref": "#/components/schemas/TypedUuidForDownstairsKind"
+            }
+          },
+          {
+            "in": "path",
+            "name": "upstairs_id",
+            "required": true,
+            "schema": {
+              "$ref": "#/components/schemas/TypedUuidForUpstairsKind"
+            }
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/DownstairsClientStopped"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "204": {
+            "description": "resource updated"
+          },
+ "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress": { + "post": { + "summary": "An Upstairs will update this endpoint with the progress of a repair", + "operationId": "cpapi_upstairs_repair_progress", + "parameters": [ + { + "in": "path", + "name": "repair_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsRepairKind" + } + }, + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RepairProgress" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/crucible/0/upstairs/{upstairs_id}/repair-finish": { + "post": { + "summary": "An Upstairs will notify this endpoint when a repair finishes.", + "operationId": "cpapi_upstairs_repair_finish", + "parameters": [ + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RepairFinishInfo" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/crucible/0/upstairs/{upstairs_id}/repair-start": { + "post": { + "summary": "An Upstairs will notify this endpoint when a repair starts", + "operationId": "cpapi_upstairs_repair_start", + "parameters": [ + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RepairStartInfo" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/deployment/blueprints/all": { "get": { "summary": "Lists blueprints", @@ -3649,6 +3860,81 @@ } ] }, + "DownstairsClientStopRequest": { + "type": "object", + "properties": { + "reason": { + "$ref": "#/components/schemas/DownstairsClientStopRequestReason" + }, + "time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "reason", + "time" + ] + }, + "DownstairsClientStopRequestReason": { + "type": "string", + "enum": [ + "replacing", + "disabled", + "failed_reconcile", + "i_o_error", + "bad_negotiation_order", + "incompatible", + "failed_live_repair", + "too_many_outstanding_jobs", + "deactivated" + ] + }, + "DownstairsClientStopped": { + "type": "object", + "properties": { + "reason": { + "$ref": "#/components/schemas/DownstairsClientStoppedReason" + }, + "time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "reason", + "time" + ] + }, + "DownstairsClientStoppedReason": { + "type": "string", + "enum": [ + "connection_timeout", + "connection_failed", + "timeout", + "write_failed", + "read_failed", + "requested_stop", + "finished", + "queue_closed", 
+ "receive_task_cancelled" + ] + }, + "DownstairsUnderRepair": { + "type": "object", + "properties": { + "region_uuid": { + "$ref": "#/components/schemas/TypedUuidForDownstairsRegionKind" + }, + "target_addr": { + "type": "string" + } + }, + "required": [ + "region_uuid", + "target_addr" + ] + }, "Duration": { "type": "object", "properties": { @@ -6102,6 +6388,94 @@ "user_password_hash" ] }, + "RepairFinishInfo": { + "type": "object", + "properties": { + "aborted": { + "type": "boolean" + }, + "repair_id": { + "$ref": "#/components/schemas/TypedUuidForUpstairsRepairKind" + }, + "repair_type": { + "$ref": "#/components/schemas/UpstairsRepairType" + }, + "repairs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/DownstairsUnderRepair" + } + }, + "session_id": { + "$ref": "#/components/schemas/TypedUuidForUpstairsSessionKind" + }, + "time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "aborted", + "repair_id", + "repair_type", + "repairs", + "session_id", + "time" + ] + }, + "RepairProgress": { + "type": "object", + "properties": { + "current_item": { + "type": "integer", + "format": "int64" + }, + "time": { + "type": "string", + "format": "date-time" + }, + "total_items": { + "type": "integer", + "format": "int64" + } + }, + "required": [ + "current_item", + "time", + "total_items" + ] + }, + "RepairStartInfo": { + "type": "object", + "properties": { + "repair_id": { + "$ref": "#/components/schemas/TypedUuidForUpstairsRepairKind" + }, + "repair_type": { + "$ref": "#/components/schemas/UpstairsRepairType" + }, + "repairs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/DownstairsUnderRepair" + } + }, + "session_id": { + "$ref": "#/components/schemas/TypedUuidForUpstairsSessionKind" + }, + "time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "repair_id", + "repair_type", + "repairs", + "session_id", + "time" + ] + }, "RouteConfig": { "type": "object", "properties": { @@ -6979,6 +7353,18 @@ "SwitchPutResponse": { "type": "object" }, + "TypedUuidForDownstairsRegionKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForUpstairsRepairKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForUpstairsSessionKind": { + "type": "string", + "format": "uuid" + }, "UninitializedSled": { "description": "A sled that has not been added to an initialized rack yet", "type": "object", @@ -7039,6 +7425,13 @@ "items" ] }, + "UpstairsRepairType": { + "type": "string", + "enum": [ + "live", + "reconciliation" + ] + }, "UserId": { "title": "A name unique within the parent collection", "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. 
        "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID though they may contain a UUID.",
@@ -7125,6 +7518,14 @@
       "type": "string",
       "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$"
     },
+    "TypedUuidForDownstairsKind": {
+      "type": "string",
+      "format": "uuid"
+    },
+    "TypedUuidForUpstairsKind": {
+      "type": "string",
+      "format": "uuid"
+    },
     "IdSortMode": {
       "description": "Supported set of sort modes for scanning by id only.\n\nCurrently, we only support scanning in ascending order.",
       "oneOf": [
diff --git a/schema/crdb/43.0.0/up01.sql b/schema/crdb/43.0.0/up01.sql
new file mode 100644
index 0000000000..b94f43eda7
--- /dev/null
+++ b/schema/crdb/43.0.0/up01.sql
@@ -0,0 +1,5 @@
+CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_notification_type AS ENUM (
+    'started',
+    'succeeded',
+    'failed'
+);
diff --git a/schema/crdb/43.0.0/up02.sql b/schema/crdb/43.0.0/up02.sql
new file mode 100644
index 0000000000..47c5f7f03a
--- /dev/null
+++ b/schema/crdb/43.0.0/up02.sql
@@ -0,0 +1,4 @@
+CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_type AS ENUM (
+    'live',
+    'reconciliation'
+);
diff --git a/schema/crdb/43.0.0/up03.sql b/schema/crdb/43.0.0/up03.sql
new file mode 100644
index 0000000000..a33c83c1ab
--- /dev/null
+++ b/schema/crdb/43.0.0/up03.sql
@@ -0,0 +1,21 @@
+CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_notification (
+    time TIMESTAMPTZ NOT NULL,
+
+    repair_id UUID NOT NULL,
+    repair_type omicron.public.upstairs_repair_type NOT NULL,
+
+    upstairs_id UUID NOT NULL,
+    session_id UUID NOT NULL,
+
+    region_id UUID NOT NULL,
+    target_ip INET NOT NULL,
+    target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL,
+
+    notification_type omicron.public.upstairs_repair_notification_type NOT NULL,
+
+    /*
+     * A repair is uniquely identified by the four UUIDs here, and a
+     * notification is uniquely identified by its type.
+     */
+    PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type)
+);
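+
+-- Note that 'succeeded' and 'failed' are distinct notification types, so this
+-- table alone does not prevent both from being recorded for one repair; Nexus
+-- rejects a finish notification that contradicts an earlier one (see the
+-- volume_management.rs tests in this change).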
diff --git a/schema/crdb/43.0.0/up04.sql b/schema/crdb/43.0.0/up04.sql
new file mode 100644
index 0000000000..ed51c15a1e
--- /dev/null
+++ b/schema/crdb/43.0.0/up04.sql
@@ -0,0 +1,8 @@
+CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_progress (
+    repair_id UUID NOT NULL,
+    time TIMESTAMPTZ NOT NULL,
+    current_item INT8 NOT NULL,
+    total_items INT8 NOT NULL,
+
+    PRIMARY KEY (repair_id, time, current_item, total_items)
+);
diff --git a/schema/crdb/43.0.0/up05.sql b/schema/crdb/43.0.0/up05.sql
new file mode 100644
index 0000000000..16fb91d410
--- /dev/null
+++ b/schema/crdb/43.0.0/up05.sql
@@ -0,0 +1,11 @@
+CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_request_reason_type AS ENUM (
+    'replacing',
+    'disabled',
+    'failed_reconcile',
+    'io_error',
+    'bad_negotiation_order',
+    'incompatible',
+    'failed_live_repair',
+    'too_many_outstanding_jobs',
+    'deactivated'
+);
diff --git a/schema/crdb/43.0.0/up06.sql b/schema/crdb/43.0.0/up06.sql
new file mode 100644
index 0000000000..2cf45ce101
--- /dev/null
+++ b/schema/crdb/43.0.0/up06.sql
@@ -0,0 +1,8 @@
+CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stop_request_notification (
+    time TIMESTAMPTZ NOT NULL,
+    upstairs_id UUID NOT NULL,
+    downstairs_id UUID NOT NULL,
+    reason omicron.public.downstairs_client_stop_request_reason_type NOT NULL,
+
+    PRIMARY KEY (time, upstairs_id, downstairs_id, reason)
+);
diff --git a/schema/crdb/43.0.0/up07.sql b/schema/crdb/43.0.0/up07.sql
new file mode 100644
index 0000000000..cb8903b1d3
--- /dev/null
+++ b/schema/crdb/43.0.0/up07.sql
@@ -0,0 +1,11 @@
+CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stopped_reason_type AS ENUM (
+    'connection_timeout',
+    'connection_failed',
+    'timeout',
+    'write_failed',
+    'read_failed',
+    'requested_stop',
+    'finished',
+    'queue_closed',
+    'receive_task_cancelled'
+);
diff --git a/schema/crdb/43.0.0/up08.sql b/schema/crdb/43.0.0/up08.sql
new file mode 100644
index 0000000000..6f898f382c
--- /dev/null
+++ b/schema/crdb/43.0.0/up08.sql
@@ -0,0 +1,8 @@
+CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stopped_notification (
+    time TIMESTAMPTZ NOT NULL,
+    upstairs_id UUID NOT NULL,
+    downstairs_id UUID NOT NULL,
+    reason omicron.public.downstairs_client_stopped_reason_type NOT NULL,
+
+    PRIMARY KEY (time, upstairs_id, downstairs_id, reason)
+);
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index 9895464fb5..255cdc8135 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -3552,6 +3552,90 @@ ALTER TABLE omicron.public.external_ip ADD COLUMN IF NOT EXISTS is_probe BOOL NO
 
 ALTER TYPE omicron.public.network_interface_kind ADD VALUE IF NOT EXISTS 'probe';
 
+CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_notification_type AS ENUM (
+    'started',
+    'succeeded',
+    'failed'
+);
+
+CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_type AS ENUM (
+    'live',
+    'reconciliation'
+);
+
+CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_notification (
+    time TIMESTAMPTZ NOT NULL,
+
+    repair_id UUID NOT NULL,
+    repair_type omicron.public.upstairs_repair_type NOT NULL,
+
+    upstairs_id UUID NOT NULL,
+    session_id UUID NOT NULL,
+
+    region_id UUID NOT NULL,
+    target_ip INET NOT NULL,
+    target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL,
+
+    notification_type omicron.public.upstairs_repair_notification_type NOT NULL,
+
+    /*
+     * A repair is uniquely identified by the four UUIDs here, and a
+     * notification is uniquely identified by its type.
+     */
+    PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type)
+);
+
+CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_progress (
+    repair_id UUID NOT NULL,
+    time TIMESTAMPTZ NOT NULL,
+    current_item INT8 NOT NULL,
+    total_items INT8 NOT NULL,
+
+    PRIMARY KEY (repair_id, time, current_item, total_items)
+);
+
+CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_request_reason_type AS ENUM (
+    'replacing',
+    'disabled',
+    'failed_reconcile',
+    'io_error',
+    'bad_negotiation_order',
+    'incompatible',
+    'failed_live_repair',
+    'too_many_outstanding_jobs',
+    'deactivated'
+);
+
+CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stop_request_notification (
+    time TIMESTAMPTZ NOT NULL,
+    upstairs_id UUID NOT NULL,
+    downstairs_id UUID NOT NULL,
+    reason omicron.public.downstairs_client_stop_request_reason_type NOT NULL,
+
+    PRIMARY KEY (time, upstairs_id, downstairs_id, reason)
+);
+
+CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stopped_reason_type AS ENUM (
+    'connection_timeout',
+    'connection_failed',
+    'timeout',
+    'write_failed',
+    'read_failed',
+    'requested_stop',
+    'finished',
+    'queue_closed',
+    'receive_task_cancelled'
+);
+
+CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stopped_notification (
+    time TIMESTAMPTZ NOT NULL,
+    upstairs_id UUID NOT NULL,
+    downstairs_id UUID NOT NULL,
+    reason omicron.public.downstairs_client_stopped_reason_type NOT NULL,
+
+    PRIMARY KEY (time, upstairs_id, downstairs_id, reason)
+);
+
 /*
  * Metadata for the schema itself. This version number isn't great, as there's
  * nothing to ensure it gets bumped when it should be, but it's a start.
@@ -3586,7 +3670,7 @@ INSERT INTO omicron.public.db_metadata (
     version,
     target_version
 ) VALUES
-    ( TRUE, NOW(), NOW(), '42.0.0', NULL)
+    ( TRUE, NOW(), NOW(), '43.0.0', NULL)
 ON CONFLICT DO NOTHING;
 
 COMMIT;
diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs
index 12bc756d68..7018485b59 100644
--- a/uuid-kinds/src/lib.rs
+++ b/uuid-kinds/src/lib.rs
@@ -45,6 +45,11 @@ macro_rules! impl_typed_uuid_kind {
 
 // Please keep this list in alphabetical order.
 impl_typed_uuid_kind! {
+    DownstairsKind => "downstairs",
+    DownstairsRegionKind => "downstairs_region",
     LoopbackAddressKind => "loopback_address",
     TufRepoKind => "tuf_repo",
+    UpstairsKind => "upstairs",
+    UpstairsRepairKind => "upstairs_repair",
+    UpstairsSessionKind => "upstairs_session",
 }
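
Illustrative sketch (an assumption for this note, not part of the patch): an Upstairs-side caller could drive the endpoints above with plain HTTP as below. The nexus_base URL, the reqwest dependency (with its json feature enabled), and the helper name notify_repair are all hypothetical; a real caller would use the progenitor-generated bindings registered in clients/nexus-client/src/lib.rs, whose method names follow the operationIds above (cpapi_upstairs_repair_start and so on).

    use chrono::Utc;
    use omicron_common::api::internal::nexus::{
        DownstairsUnderRepair, RepairFinishInfo, RepairStartInfo,
        UpstairsRepairType,
    };
    use omicron_uuid_kinds::{
        DownstairsRegionKind, TypedUuid, UpstairsKind, UpstairsRepairKind,
        UpstairsSessionKind,
    };

    // Hypothetical helper: tell Nexus a live repair started, then that it
    // finished successfully.
    async fn notify_repair(
        nexus_base: &str,
        upstairs_id: TypedUuid<UpstairsKind>,
        session_id: TypedUuid<UpstairsSessionKind>,
        region_id: TypedUuid<DownstairsRegionKind>,
    ) -> Result<(), reqwest::Error> {
        let client = reqwest::Client::new();

        // Each repair attempt gets a fresh repair ID; retries of a failed
        // repair reuse the session ID but not the repair ID (see the tests
        // in this change).
        let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();

        let repairs = vec![DownstairsUnderRepair {
            region_uuid: region_id,
            target_addr: "[fd00:1122:3344:101::8]:12345".parse().unwrap(),
        }];

        // POST /crucible/0/upstairs/{upstairs_id}/repair-start
        client
            .post(format!(
                "{nexus_base}/crucible/0/upstairs/{upstairs_id}/repair-start"
            ))
            .json(&RepairStartInfo {
                time: Utc::now(),
                session_id,
                repair_id,
                repair_type: UpstairsRepairType::Live,
                repairs: repairs.clone(),
            })
            .send()
            .await?
            .error_for_status()?;

        // ... perform the repair, optionally POSTing RepairProgress to
        // .../repair/{repair_id}/progress along the way ...

        // POST /crucible/0/upstairs/{upstairs_id}/repair-finish
        client
            .post(format!(
                "{nexus_base}/crucible/0/upstairs/{upstairs_id}/repair-finish"
            ))
            .json(&RepairFinishInfo {
                time: Utc::now(),
                session_id,
                repair_id,
                repair_type: UpstairsRepairType::Live,
                repairs,
                aborted: false,
            })
            .send()
            .await?
            .error_for_status()?;

        Ok(())
    }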