diff --git a/Cargo.lock b/Cargo.lock
index e15afdfbab..5b0806cb4d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5045,6 +5045,7 @@ dependencies = [
  "omicron-rpaths",
  "omicron-sled-agent",
  "omicron-test-utils",
+ "omicron-uuid-kinds",
  "omicron-workspace-hack",
  "once_cell",
  "openapi-lint",
diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs
index 3972e011cf..48140bfafe 100644
--- a/common/src/api/internal/nexus.rs
+++ b/common/src/api/internal/nexus.rs
@@ -9,6 +9,10 @@ use crate::api::external::{
     InstanceState, IpNet, SemverVersion, Vni,
 };
 use chrono::{DateTime, Utc};
+use omicron_uuid_kinds::DownstairsRegionKind;
+use omicron_uuid_kinds::LiveRepairKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsSessionKind;
 use parse_display::{Display, FromStr};
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
@@ -251,3 +255,24 @@ pub enum HostIdentifier {
     Ip(IpNet),
     Vpc(Vni),
 }
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct DownstairsUnderRepair {
+    pub region_uuid: TypedUuid<DownstairsRegionKind>,
+    pub target_addr: std::net::SocketAddrV6,
+}
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct RepairStartInfo {
+    pub session_id: TypedUuid<UpstairsSessionKind>,
+    pub repair_id: TypedUuid<LiveRepairKind>,
+    pub repairs: Vec<DownstairsUnderRepair>,
+}
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct RepairFinishInfo {
+    pub session_id: TypedUuid<UpstairsSessionKind>,
+    pub repair_id: TypedUuid<LiveRepairKind>,
+    pub repairs: Vec<DownstairsUnderRepair>,
+    pub aborted: bool,
+}
diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml
index 6e9f2f135d..957640b07c 100644
--- a/nexus/Cargo.toml
+++ b/nexus/Cargo.toml
@@ -76,7 +76,6 @@ tokio-postgres = { workspace = true, features = ["with-serde_json-1"] }
 tough.workspace = true
 trust-dns-resolver.workspace = true
 uuid.workspace = true
-
 nexus-blueprint-execution.workspace = true
 nexus-defaults.workspace = true
 nexus-db-model.workspace = true
@@ -93,6 +92,7 @@ rustls = { workspace = true }
 rustls-pemfile = { workspace = true }
 update-common.workspace = true
 omicron-workspace-hack.workspace = true
+omicron-uuid-kinds.workspace = true
 
 [dev-dependencies]
 async-bb8-diesel.workspace = true
diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs
index ecbb8365fe..f85815b627 100644
--- a/nexus/db-model/src/lib.rs
+++ b/nexus/db-model/src/lib.rs
@@ -39,6 +39,7 @@ mod ipv4net;
 pub mod ipv6;
 mod ipv6net;
 mod l4_port_range;
+mod live_repair;
 mod macaddr;
 mod name;
 mod network_interface;
@@ -139,6 +140,7 @@ pub use ipv4net::*;
 pub use ipv6::*;
 pub use ipv6net::*;
 pub use l4_port_range::*;
+pub use live_repair::*;
 pub use name::*;
 pub use network_interface::*;
 pub use oximeter_info::*;
diff --git a/nexus/db-model/src/live_repair.rs b/nexus/db-model/src/live_repair.rs
new file mode 100644
index 0000000000..88d79b2057
--- /dev/null
+++ b/nexus/db-model/src/live_repair.rs
@@ -0,0 +1,95 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
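+
+//! Model types for the `live_repair_notification` table, which records the
+//! live repair notifications that Crucible Upstairs instances send to Nexus.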
+
+use super::impl_enum_type;
+use crate::ipv6;
+use crate::schema::live_repair_notification;
+use crate::typed_uuid::DbTypedUuid;
+use crate::SqlU16;
+use chrono::{DateTime, Utc};
+use omicron_uuid_kinds::DownstairsRegionKind;
+use omicron_uuid_kinds::LiveRepairKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
+use omicron_uuid_kinds::UpstairsSessionKind;
+use serde::{Deserialize, Serialize};
+use std::net::SocketAddrV6;
+
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "live_repair_notification_type", schema = "public"))]
+    pub struct LiveRepairNotificationTypeEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
+    #[diesel(sql_type = LiveRepairNotificationTypeEnum)]
+    pub enum LiveRepairNotificationType;
+
+    // Notification types
+    Started => b"started"
+    Succeeded => b"succeeded"
+    Failed => b"failed"
+);
+
+/// A record of Crucible live repair notifications: when a live repair
+/// started, succeeded, failed, etc.
+///
+/// Each live repair attempt is uniquely identified by the repair ID, upstairs
+/// ID, session ID, and region ID. How those IDs change between notifications
+/// tells Nexus what is going on:
+///
+/// - if all IDs are the same for different requests, Nexus knows that the
+///   client is retrying the notification.
+///
+/// - if the upstairs ID, session ID, and region ID are all the same, but the
+///   repair ID is different, then the same Upstairs is trying to repair that
+///   region again. This could be due to a failed first attempt, or because
+///   that downstairs was kicked out again.
+///
+/// - if the upstairs ID and region ID are the same, but the session ID and
+///   repair ID are different, then a different session of the same Upstairs
+///   is trying to repair that Downstairs. Session IDs change each time the
+///   Upstairs is created, so it could have crashed, or it could have been
+///   migrated and the destination Propolis' Upstairs is attempting to repair
+///   the same region.
+#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
+#[diesel(table_name = live_repair_notification)]
+pub struct LiveRepairNotification {
+    pub time: DateTime<Utc>,
+
+    pub repair_id: DbTypedUuid<LiveRepairKind>,
+    pub upstairs_id: DbTypedUuid<UpstairsKind>,
+    pub session_id: DbTypedUuid<UpstairsSessionKind>,
+
+    pub region_id: DbTypedUuid<DownstairsRegionKind>,
+    pub target_ip: ipv6::Ipv6Addr,
+    pub target_port: SqlU16,
+
+    pub notification_type: LiveRepairNotificationType,
+}
+
+impl LiveRepairNotification {
+    pub fn new(
+        repair_id: TypedUuid<LiveRepairKind>,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        session_id: TypedUuid<UpstairsSessionKind>,
+        region_id: TypedUuid<DownstairsRegionKind>,
+        target_addr: SocketAddrV6,
+        notification_type: LiveRepairNotificationType,
+    ) -> Self {
+        Self {
+            time: Utc::now(),
+            repair_id: repair_id.into(),
+            upstairs_id: upstairs_id.into(),
+            session_id: session_id.into(),
+            region_id: region_id.into(),
+            target_ip: target_addr.ip().into(),
+            target_port: target_addr.port().into(),
+            notification_type,
+        }
+    }
+
+    pub fn address(&self) -> SocketAddrV6 {
+        SocketAddrV6::new(*self.target_ip, *self.target_port, 0, 0)
+    }
+}
diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs
index 54755486e5..7cc7216886 100644
--- a/nexus/db-model/src/schema.rs
+++ b/nexus/db-model/src/schema.rs
@@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion;
 ///
 /// This should be updated whenever the schema is changed. For more details,
 /// refer to: schema/crdb/README.adoc
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(36, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(37, 0, 0);
 
 table! {
     disk (id) {
@@ -1517,6 +1517,22 @@
     }
 }
 
+table! {
+    live_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) {
+        time -> Timestamptz,
+
+        repair_id -> Uuid,
+        upstairs_id -> Uuid,
+        session_id -> Uuid,
+
+        region_id -> Uuid,
+        target_ip -> Inet,
+        target_port -> Int4,
+
+        notification_type -> crate::LiveRepairNotificationTypeEnum,
+    }
+}
+
 table! {
     db_metadata (singleton) {
         singleton -> Bool,
diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs
index d0b093ff45..f502e3afad 100644
--- a/nexus/db-queries/src/db/datastore/volume.rs
+++ b/nexus/db-queries/src/db/datastore/volume.rs
@@ -6,10 +6,13 @@
 use super::DataStore;
 use crate::db;
+use crate::db::datastore::OpContext;
 use crate::db::error::public_error_from_diesel;
 use crate::db::error::ErrorHandler;
 use crate::db::identity::Asset;
 use crate::db::model::Dataset;
+use crate::db::model::LiveRepairNotification;
+use crate::db::model::LiveRepairNotificationType;
 use crate::db::model::Region;
 use crate::db::model::RegionSnapshot;
 use crate::db::model::Volume;
@@ -809,6 +812,101 @@ impl DataStore {
             public_error_from_diesel(e, ErrorHandler::Server)
         })
     }
+
+    // An Upstairs is created as part of a Volume hierarchy if the Volume
+    // Construction Request includes a "Region" variant. This may be at any
+    // layer of the Volume, and some notifications will come from an Upstairs
+    // instead of the top level of the Volume. The following functions have an
+    // Upstairs ID instead of a Volume ID for this reason.
+
+    /// Record when an Upstairs notifies us about a live repair. If that record
+    /// (uniquely identified by the four IDs passed in plus the notification
+    /// type) exists already, do nothing.
+    pub async fn live_repair_notification(
+        &self,
+        opctx: &OpContext,
+        record: LiveRepairNotification,
+    ) -> Result<(), Error> {
+        use db::schema::live_repair_notification::dsl;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+        let err = OptionalError::new();
+
+        self.transaction_retry_wrapper("live_repair_notification")
+            .transaction(&conn, |conn| {
+                let record = record.clone();
+                let err = err.clone();
+
+                async move {
+                    match &record.notification_type {
+                        LiveRepairNotificationType::Started => {
+                            // Proceed - the insertion can succeed or fail
+                            // below based on the table's primary key
+                        }
+
+                        LiveRepairNotificationType::Succeeded
+                        | LiveRepairNotificationType::Failed => {
+                            // However, Nexus must accept only one "finished"
+                            // status - an Upstairs cannot change this and must
+                            // instead perform another repair with a new repair
+                            // ID.
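+                            //
+                            // Check for an existing finish record first. The
+                            // check and the insert below run in the same
+                            // retryable transaction, so this does not race
+                            // with a concurrent notification for the same
+                            // repair.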
+                            let maybe_existing_finish_record: Option<
+                                LiveRepairNotification,
+                            > = dsl::live_repair_notification
+                                .filter(dsl::repair_id.eq(record.repair_id))
+                                .filter(dsl::upstairs_id.eq(record.upstairs_id))
+                                .filter(dsl::session_id.eq(record.session_id))
+                                .filter(dsl::region_id.eq(record.region_id))
+                                .filter(dsl::notification_type.eq_any(vec![
+                                    LiveRepairNotificationType::Succeeded,
+                                    LiveRepairNotificationType::Failed,
+                                ]))
+                                .get_result_async(&conn)
+                                .await
+                                .optional()?;
+
+                            if let Some(existing_finish_record) =
+                                maybe_existing_finish_record
+                            {
+                                if existing_finish_record.notification_type
+                                    != record.notification_type
+                                {
+                                    return Err(err.bail(Error::conflict(
+                                        "existing finish record does not match",
+                                    )));
+                                } else {
+                                    // inserting the same record, bypass
+                                    return Ok(());
+                                }
+                            }
+                        }
+                    }
+
+                    diesel::insert_into(dsl::live_repair_notification)
+                        .values(record)
+                        .on_conflict((
+                            dsl::repair_id,
+                            dsl::upstairs_id,
+                            dsl::session_id,
+                            dsl::region_id,
+                            dsl::notification_type,
+                        ))
+                        .do_nothing()
+                        .execute_async(&conn)
+                        .await?;
+
+                    Ok(())
+                }
+            })
+            .await
+            .map_err(|e| {
+                if let Some(err) = err.take() {
+                    err
+                } else {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                }
+            })
+    }
 }
 
 #[derive(Default, Clone, Debug, Serialize, Deserialize)]
diff --git a/nexus/db-queries/src/db/pool_connection.rs b/nexus/db-queries/src/db/pool_connection.rs
index 66fb125a7c..81855206e9 100644
--- a/nexus/db-queries/src/db/pool_connection.rs
+++ b/nexus/db-queries/src/db/pool_connection.rs
@@ -51,6 +51,7 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[
     "ip_attach_state",
     "ip_kind",
     "ip_pool_resource_type",
+    "live_repair_notification_type",
    "network_interface_kind",
     "physical_disk_kind",
     "producer_kind",
diff --git a/nexus/src/app/volume.rs b/nexus/src/app/volume.rs
index c36c4524c1..b9052b1eb1 100644
--- a/nexus/src/app/volume.rs
+++ b/nexus/src/app/volume.rs
@@ -5,9 +5,15 @@
 //! Volumes
 
 use crate::app::sagas;
+use nexus_db_model::LiveRepairNotification;
+use nexus_db_model::LiveRepairNotificationType;
 use nexus_db_queries::authn;
 use nexus_db_queries::context::OpContext;
 use omicron_common::api::external::DeleteResult;
+use omicron_common::api::internal::nexus::RepairFinishInfo;
+use omicron_common::api::internal::nexus::RepairStartInfo;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
 use std::sync::Arc;
 use uuid::Uuid;
 
@@ -30,4 +36,78 @@ impl super::Nexus {
 
         Ok(())
     }
+
+    /// An Upstairs is telling us when a live repair is starting.
+    pub(crate) async fn live_repair_start(
+        self: &Arc<Self>,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        repair_start_info: RepairStartInfo,
+    ) -> DeleteResult {
+        info!(
+            self.log,
+            "received live_repair_start from upstairs {upstairs_id}: {:?}",
+            repair_start_info,
+        );
+
+        for repaired_downstairs in repair_start_info.repairs {
+            self.db_datastore
+                .live_repair_notification(
+                    opctx,
+                    LiveRepairNotification::new(
+                        repair_start_info.repair_id,
+                        upstairs_id,
+                        repair_start_info.session_id,
+                        repaired_downstairs.region_uuid,
+                        repaired_downstairs.target_addr,
+                        LiveRepairNotificationType::Started,
+                    ),
+                )
+                .await?;
+        }
+
+        Ok(())
+    }
+
+    /// An Upstairs is telling us when a live repair is finished, and the
+    /// result.
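+    ///
+    /// Each downstairs under repair is recorded as its own notification row,
+    /// so replaying the same finish message is harmless: the insert conflicts
+    /// on the table's primary key and is ignored.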
+    pub(crate) async fn live_repair_finish(
+        self: &Arc<Self>,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        repair_finish_info: RepairFinishInfo,
+    ) -> DeleteResult {
+        info!(
+            self.log,
+            "received live_repair_finish from upstairs {upstairs_id}: {:?}",
+            repair_finish_info,
+        );
+
+        for repaired_downstairs in repair_finish_info.repairs {
+            self.db_datastore
+                .live_repair_notification(
+                    opctx,
+                    LiveRepairNotification::new(
+                        repair_finish_info.repair_id,
+                        upstairs_id,
+                        repair_finish_info.session_id,
+                        repaired_downstairs.region_uuid,
+                        repaired_downstairs.target_addr,
+                        if repair_finish_info.aborted {
+                            LiveRepairNotificationType::Failed
+                        } else {
+                            LiveRepairNotificationType::Succeeded
+                        },
+                    ),
+                )
+                .await?;
+
+            if !repair_finish_info.aborted {
+                // TODO-followup if there's an active region replacement
+                // occurring, a successfully completed live repair can trigger
+                // a saga to destroy the original region.
+            }
+        }
+
+        Ok(())
+    }
 }
diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs
index eddc834a2a..e1ef86b055 100644
--- a/nexus/src/internal_api/http_entrypoints.rs
+++ b/nexus/src/internal_api/http_entrypoints.rs
@@ -41,8 +41,12 @@ use omicron_common::api::external::http_pagination::ScanParams;
 use omicron_common::api::external::Error;
 use omicron_common::api::internal::nexus::DiskRuntimeState;
 use omicron_common::api::internal::nexus::ProducerEndpoint;
+use omicron_common::api::internal::nexus::RepairFinishInfo;
+use omicron_common::api::internal::nexus::RepairStartInfo;
 use omicron_common::api::internal::nexus::SledInstanceState;
 use omicron_common::update::ArtifactId;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
 use oximeter::types::ProducerResults;
 use oximeter_producer::{collect, ProducerIdPathParams};
 use schemars::JsonSchema;
@@ -71,6 +75,8 @@ pub(crate) fn internal_api() -> NexusApiDescription {
     api.register(cpapi_collectors_post)?;
     api.register(cpapi_metrics_collect)?;
     api.register(cpapi_artifact_download)?;
+    api.register(cpapi_live_repair_start)?;
+    api.register(cpapi_live_repair_finish)?;
 
     api.register(saga_list)?;
     api.register(saga_view)?;
@@ -479,6 +485,68 @@ async fn cpapi_artifact_download(
     Ok(HttpResponseOk(Body::from(body).into()))
 }
 
+/// Path parameters for Upstairs requests (internal API)
+#[derive(Deserialize, JsonSchema)]
+struct UpstairsPathParam {
+    upstairs_id: TypedUuid<UpstairsKind>,
+}
+
+/// An Upstairs will notify this endpoint when a live repair starts
+#[endpoint {
+    method = POST,
+    path = "/upstairs/{upstairs_id}/live-repair-start",
+}]
+async fn cpapi_live_repair_start(
+    rqctx: RequestContext<Arc<ServerContext>>,
+    path_params: Path<UpstairsPathParam>,
+    repair_start_info: TypedBody<RepairStartInfo>,
+) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+    let apictx = rqctx.context();
+    let nexus = &apictx.nexus;
+    let path = path_params.into_inner();
+
+    let handler = async {
+        let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
+        nexus
+            .live_repair_start(
+                &opctx,
+                path.upstairs_id,
+                repair_start_info.into_inner(),
+            )
+            .await?;
+        Ok(HttpResponseUpdatedNoContent())
+    };
+    apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
+}
+
+/// An Upstairs will notify this endpoint when a live repair finishes.
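+//
+// As a sketch (illustrative placeholder values only), a request body for this
+// endpoint looks like:
+//
+//   {
+//     "repair_id": "ffffffff-ffff-ffff-ffff-ffffffffffff",
+//     "session_id": "ffffffff-ffff-ffff-ffff-ffffffffffff",
+//     "repairs": [
+//       {
+//         "region_uuid": "ffffffff-ffff-ffff-ffff-ffffffffffff",
+//         "target_addr": "[fd00:1122:3344:101::8]:12345"
+//       }
+//     ],
+//     "aborted": false
+//   }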
+#[endpoint {
+    method = POST,
+    path = "/upstairs/{upstairs_id}/live-repair-finish",
+}]
+async fn cpapi_live_repair_finish(
+    rqctx: RequestContext<Arc<ServerContext>>,
+    path_params: Path<UpstairsPathParam>,
+    repair_finish_info: TypedBody<RepairFinishInfo>,
+) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+    let apictx = rqctx.context();
+    let nexus = &apictx.nexus;
+    let path = path_params.into_inner();
+
+    let handler = async {
+        let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
+        nexus
+            .live_repair_finish(
+                &opctx,
+                path.upstairs_id,
+                repair_finish_info.into_inner(),
+            )
+            .await?;
+        Ok(HttpResponseUpdatedNoContent())
+    };
+    apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
+}
+
 // Sagas
 
 /// List sagas
diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs
index 34f037ee8c..163267a1d0 100644
--- a/nexus/tests/integration_tests/volume_management.rs
+++ b/nexus/tests/integration_tests/volume_management.rs
@@ -24,6 +24,12 @@ use omicron_common::api::external::ByteCount;
 use omicron_common::api::external::Disk;
 use omicron_common::api::external::IdentityMetadataCreateParams;
 use omicron_common::api::external::Name;
+use omicron_common::api::internal;
+use omicron_uuid_kinds::DownstairsRegionKind;
+use omicron_uuid_kinds::LiveRepairKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
+use omicron_uuid_kinds::UpstairsSessionKind;
 use rand::prelude::SliceRandom;
 use rand::{rngs::StdRng, SeedableRng};
 use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest};
@@ -2552,3 +2558,445 @@ async fn test_volume_hard_delete_idempotent(
     datastore.volume_hard_delete(volume_id).await.unwrap();
     datastore.volume_hard_delete(volume_id).await.unwrap();
 }
+
+// Upstairs-related tests
+
+/// Test that an Upstairs can reissue live repair notifications
+#[nexus_test]
+async fn test_upstairs_live_repair_notify_idempotent(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Notify start
+    let notify_url = format!("/upstairs/{upstairs_id}/live-repair-start");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Notify finish
+    let notify_url = format!("/upstairs/{upstairs_id}/live-repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
+
+/// Test that an Upstairs cannot issue different finish statuses for the same
+/// repair.
+#[nexus_test]
+async fn test_upstairs_live_repair_notify_different_finish_status(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    let notify_url = format!("/upstairs/{upstairs_id}/live-repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false, // live repair was ok
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: true, // live repair failed?
+            }),
+            StatusCode::CONFLICT,
+        )
+        .await
+        .unwrap_err();
+}
+
+/// Test that the same Upstairs can retry a repair.
+#[nexus_test]
+async fn test_upstairs_live_repair_same_upstairs_retry(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Simulate one failed repair
+
+    let notify_start_url =
+        format!("/upstairs/{upstairs_id}/live-repair-start");
+    let notify_finish_url =
+        format!("/upstairs/{upstairs_id}/live-repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: true,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Simulate the same Upstairs restarting the repair, which passes this
+    // time
+
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
+
+/// Test that a different Upstairs session can retry a repair.
+#[nexus_test]
+async fn test_upstairs_live_repair_different_upstairs_retry(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Simulate one failed repair by one Upstairs
+
+    let notify_start_url =
+        format!("/upstairs/{upstairs_id}/live-repair-start");
+    let notify_finish_url =
+        format!("/upstairs/{upstairs_id}/live-repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: true,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Simulate a different Upstairs session restarting the repair, which
+    // passes this time
+
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
+
+/// Test that a different Upstairs session can rerun an interrupted repair
+#[nexus_test]
+async fn test_upstairs_live_repair_different_upstairs_retry_interrupted(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Simulate one failed repair by one Upstairs, which was interrupted
+    // (which leads to no finish message).
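+    //
+    // The interrupted attempt leaves only a "started" notification row
+    // behind, so no conflicting "finished" row exists for the retry below.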
+
+    let notify_start_url =
+        format!("/upstairs/{upstairs_id}/live-repair-start");
+    let notify_finish_url =
+        format!("/upstairs/{upstairs_id}/live-repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Simulate a different Upstairs session restarting the interrupted
+    // repair, which passes this time
+
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json
index 53a53fb219..b92c5f81a3 100644
--- a/openapi/nexus-internal.json
+++ b/openapi/nexus-internal.json
@@ -962,6 +962,80 @@
+    "/upstairs/{upstairs_id}/live-repair-finish": {
+      "post": {
+        "summary": "An Upstairs will notify this endpoint when a live repair finishes.",
+        "operationId": "cpapi_live_repair_finish",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "upstairs_id",
+            "required": true,
+            "schema": {
+              "$ref": "#/components/schemas/TypedUuidForUpstairsKind"
+            }
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/RepairFinishInfo"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "204": {
+            "description": "resource updated"
+          },
+          "4XX": {
+            "$ref": "#/components/responses/Error"
+          },
+          "5XX": {
+            "$ref": "#/components/responses/Error"
+          }
+        }
+      }
+    },
+    "/upstairs/{upstairs_id}/live-repair-start": {
+      "post": {
+        "summary": "An Upstairs will notify this endpoint when a live repair starts",
+        "operationId": "cpapi_live_repair_start",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "upstairs_id",
+            "required": true,
+            "schema": {
+              "$ref": "#/components/schemas/TypedUuidForUpstairsKind"
+            }
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/RepairStartInfo"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "204": {
+            "description": "resource updated"
+          },
+          "4XX": {
+            "$ref": "#/components/responses/Error"
+          },
+          "5XX": {
+            "$ref": "#/components/responses/Error"
+          }
+        }
+      }
+    },
     "/volume/{volume_id}/remove-read-only-parent": {
       "post": {
         "summary": "Request removal of a read_only_parent from a volume",
@@ -3409,6 +3483,21 @@
         }
       ]
     },
+    "DownstairsUnderRepair": {
+      "type": "object",
+      "properties": {
+        "region_uuid": {
+          "$ref": "#/components/schemas/TypedUuidForDownstairsRegionKind"
+        },
+        "target_addr": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "region_uuid",
+        "target_addr"
+      ]
+    },
     "Duration": {
       "type": "object",
       "properties": {
@@ -5759,6 +5848,54 @@
         "user_password_hash"
       ]
     },
+    "RepairFinishInfo": {
+      "type": "object",
+      "properties": {
+        "aborted": {
+          "type": "boolean"
+        },
+        "repair_id": {
+          "$ref": "#/components/schemas/TypedUuidForLiveRepairKind"
+        },
+        "repairs": {
+          "type": "array",
+          "items": {
+            "$ref": "#/components/schemas/DownstairsUnderRepair"
+          }
+        },
+        "session_id": {
+          "$ref": "#/components/schemas/TypedUuidForUpstairsSessionKind"
+        }
+      },
+      "required": [
+        "aborted",
+        "repair_id",
+        "repairs",
+        "session_id"
+      ]
+    },
+    "RepairStartInfo": {
+      "type": "object",
+      "properties": {
+        "repair_id": {
+          "$ref": "#/components/schemas/TypedUuidForLiveRepairKind"
+        },
+        "repairs": {
+          "type": "array",
+          "items": {
+            "$ref": "#/components/schemas/DownstairsUnderRepair"
+          }
+        },
+        "session_id": {
+          "$ref": "#/components/schemas/TypedUuidForUpstairsSessionKind"
+        }
+      },
+      "required": [
+        "repair_id",
+        "repairs",
+        "session_id"
+      ]
+    },
     "RouteConfig": {
       "type": "object",
       "properties": {
@@ -6546,6 +6683,18 @@
     "SwitchPutResponse": {
       "type": "object"
     },
+    "TypedUuidForDownstairsRegionKind": {
+      "type": "string",
+      "format": "uuid"
+    },
+    "TypedUuidForLiveRepairKind": {
+      "type": "string",
+      "format": "uuid"
+    },
+    "TypedUuidForUpstairsSessionKind": {
+      "type": "string",
+      "format": "uuid"
+    },
     "UserId": {
       "title": "A name unique within the parent collection",
       "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID though they may contain a UUID.",
@@ -6643,6 +6792,10 @@
           ]
         }
       ]
+    },
+    "TypedUuidForUpstairsKind": {
+      "type": "string",
+      "format": "uuid"
     }
   },
   "responses": {
diff --git a/schema/crdb/37.0.0/up01.sql b/schema/crdb/37.0.0/up01.sql
new file mode 100644
index 0000000000..91c27bd81a
--- /dev/null
+++ b/schema/crdb/37.0.0/up01.sql
@@ -0,0 +1,5 @@
+CREATE TYPE IF NOT EXISTS omicron.public.live_repair_notification_type AS ENUM (
+  'started',
+  'succeeded',
+  'failed'
+);
diff --git a/schema/crdb/37.0.0/up02.sql b/schema/crdb/37.0.0/up02.sql
new file mode 100644
index 0000000000..58d33e39c8
--- /dev/null
+++ b/schema/crdb/37.0.0/up02.sql
@@ -0,0 +1,19 @@
+CREATE TABLE IF NOT EXISTS omicron.public.live_repair_notification (
+    time TIMESTAMPTZ NOT NULL,
+
+    repair_id UUID NOT NULL,
+    upstairs_id UUID NOT NULL,
+    session_id UUID NOT NULL,
+
+    region_id UUID NOT NULL,
+    target_ip INET NOT NULL,
+    target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL,
+
+    notification_type omicron.public.live_repair_notification_type NOT NULL,
+
+    /*
+     * A live repair is uniquely identified by the four UUIDs here, and a
+     * notification is uniquely identified by its type.
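+     *
+     * Because of this key, re-inserting the same notification is a no-op,
+     * and Nexus rejects a second, different "finished" status for the same
+     * repair before attempting the insert.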
+ */ + PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 87a22d1adc..64be993e64 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3511,6 +3511,32 @@ SELECT deleted FROM interleaved_versions; +CREATE TYPE IF NOT EXISTS omicron.public.live_repair_notification_type AS ENUM ( + 'started', + 'succeeded', + 'failed' +); + +CREATE TABLE IF NOT EXISTS live_repair_notification ( + time TIMESTAMPTZ NOT NULL, + + repair_id UUID NOT NULL, + upstairs_id UUID NOT NULL, + session_id UUID NOT NULL, + + region_id UUID NOT NULL, + target_ip INET NOT NULL, + target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL, + + notification_type omicron.public.live_repair_notification_type NOT NULL, + + /* + * A live repair is uniquely identified by the four UUIDs here, and a + * notification is uniquely identified by its type. + */ + PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type) +); + INSERT INTO omicron.public.db_metadata ( singleton, time_created, @@ -3518,7 +3544,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '36.0.0', NULL) + ( TRUE, NOW(), NOW(), '37.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 12bc756d68..871de4d8ca 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -45,6 +45,10 @@ macro_rules! impl_typed_uuid_kind { // Please keep this list in alphabetical order. impl_typed_uuid_kind! { + DownstairsRegionKind => "downstairs_region", + LiveRepairKind => "live_repair", LoopbackAddressKind => "loopback_address", TufRepoKind => "tuf_repo", + UpstairsKind => "upstairs", + UpstairsSessionKind => "upstairs_session", }