From 128a998058a181b3420ffb240959652c754082be Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Fri, 23 Feb 2024 16:08:01 -0500
Subject: [PATCH] Accept live repair status reports from Crucible

Allow any Upstairs to notify Nexus about the start or completion (plus
status) of live repairs. The immediate motivation is the final stage of
region replacement, where Nexus must learn that the replacement has
finished, but more generally this can be used to track how many times
each region is repaired.

Fixes #5120
---
 Cargo.lock                                    |   1 +
 common/src/api/internal/nexus.rs              |  25 +
 nexus/Cargo.toml                              |   2 +-
 nexus/db-model/src/lib.rs                     |   2 +
 nexus/db-model/src/live_repair.rs             |  95 ++++
 nexus/db-model/src/schema.rs                  |  18 +-
 nexus/db-queries/src/db/datastore/volume.rs   |  98 ++++
 nexus/db-queries/src/db/pool_connection.rs    |   1 +
 nexus/src/app/volume.rs                       |  80 ++++
 nexus/src/internal_api/http_entrypoints.rs    |  68 +++
 .../integration_tests/volume_management.rs    | 448 ++++++++++++++++++
 openapi/nexus-internal.json                   | 153 ++++++
 schema/crdb/37.0.0/up01.sql                   |   5 +
 schema/crdb/37.0.0/up02.sql                   |  19 +
 schema/crdb/dbinit.sql                        |  28 +-
 uuid-kinds/src/lib.rs                         |   4 +
 16 files changed, 1044 insertions(+), 3 deletions(-)
 create mode 100644 nexus/db-model/src/live_repair.rs
 create mode 100644 schema/crdb/37.0.0/up01.sql
 create mode 100644 schema/crdb/37.0.0/up02.sql

diff --git a/Cargo.lock b/Cargo.lock
index e15afdfbab..5b0806cb4d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5045,6 +5045,7 @@ dependencies = [
  "omicron-rpaths",
  "omicron-sled-agent",
  "omicron-test-utils",
+ "omicron-uuid-kinds",
  "omicron-workspace-hack",
  "once_cell",
  "openapi-lint",
diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs
index 3972e011cf..48140bfafe 100644
--- a/common/src/api/internal/nexus.rs
+++ b/common/src/api/internal/nexus.rs
@@ -9,6 +9,10 @@ use crate::api::external::{
     InstanceState, IpNet, SemverVersion, Vni,
 };
 use chrono::{DateTime, Utc};
+use omicron_uuid_kinds::DownstairsRegionKind;
+use omicron_uuid_kinds::LiveRepairKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsSessionKind;
 use parse_display::{Display, FromStr};
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
@@ -251,3 +255,24 @@ pub enum HostIdentifier {
     Ip(IpNet),
     Vpc(Vni),
 }
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct DownstairsUnderRepair {
+    pub region_uuid: TypedUuid<DownstairsRegionKind>,
+    pub target_addr: std::net::SocketAddrV6,
+}
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct RepairStartInfo {
+    pub session_id: TypedUuid<UpstairsSessionKind>,
+    pub repair_id: TypedUuid<LiveRepairKind>,
+    pub repairs: Vec<DownstairsUnderRepair>,
+}
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct RepairFinishInfo {
+    pub session_id: TypedUuid<UpstairsSessionKind>,
+    pub repair_id: TypedUuid<LiveRepairKind>,
+    pub repairs: Vec<DownstairsUnderRepair>,
+    pub aborted: bool,
+}
diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml
index 6e9f2f135d..957640b07c 100644
--- a/nexus/Cargo.toml
+++ b/nexus/Cargo.toml
@@ -76,7 +76,6 @@ tokio-postgres = { workspace = true, features = ["with-serde_json-1"] }
 tough.workspace = true
 trust-dns-resolver.workspace = true
 uuid.workspace = true
-
 nexus-blueprint-execution.workspace = true
 nexus-defaults.workspace = true
 nexus-db-model.workspace = true
@@ -93,6 +92,7 @@ rustls = { workspace = true }
 rustls-pemfile = { workspace = true }
 update-common.workspace = true
 omicron-workspace-hack.workspace = true
+omicron-uuid-kinds.workspace = true

 [dev-dependencies]
 async-bb8-diesel.workspace = true
diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs
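The two request bodies above are plain serde types, so the wire format is ordinary JSON. As a minimal sketch of what an Upstairs would put on the wire (the `example_start_body` function and the direct use of `serde_json` here are illustrative assumptions, not part of this patch):

```rust
use omicron_common::api::internal::nexus::{
    DownstairsUnderRepair, RepairStartInfo,
};
use omicron_uuid_kinds::{
    DownstairsRegionKind, LiveRepairKind, TypedUuid, UpstairsSessionKind,
};

// Hypothetical: build the body an Upstairs would POST to Nexus when a live
// repair starts, and render it as JSON for inspection.
fn example_start_body() -> serde_json::Result<String> {
    let info = RepairStartInfo {
        session_id: TypedUuid::<UpstairsSessionKind>::new_v4(),
        repair_id: TypedUuid::<LiveRepairKind>::new_v4(),
        repairs: vec![DownstairsUnderRepair {
            region_uuid: TypedUuid::<DownstairsRegionKind>::new_v4(),
            target_addr: "[fd00:1122:3344:101::8]:12345".parse().unwrap(),
        }],
    };
    serde_json::to_string_pretty(&info)
}
```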
index ecbb8365fe..f85815b627 100644
--- a/nexus/db-model/src/lib.rs
+++ b/nexus/db-model/src/lib.rs
@@ -39,6 +39,7 @@ mod ipv4net;
 pub mod ipv6;
 mod ipv6net;
 mod l4_port_range;
+mod live_repair;
 mod macaddr;
 mod name;
 mod network_interface;
@@ -139,6 +140,7 @@ pub use ipv4net::*;
 pub use ipv6::*;
 pub use ipv6net::*;
 pub use l4_port_range::*;
+pub use live_repair::*;
 pub use name::*;
 pub use network_interface::*;
 pub use oximeter_info::*;
diff --git a/nexus/db-model/src/live_repair.rs b/nexus/db-model/src/live_repair.rs
new file mode 100644
index 0000000000..88d79b2057
--- /dev/null
+++ b/nexus/db-model/src/live_repair.rs
@@ -0,0 +1,95 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use super::impl_enum_type;
+use crate::ipv6;
+use crate::schema::live_repair_notification;
+use crate::typed_uuid::DbTypedUuid;
+use crate::SqlU16;
+use chrono::{DateTime, Utc};
+use omicron_uuid_kinds::DownstairsRegionKind;
+use omicron_uuid_kinds::LiveRepairKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
+use omicron_uuid_kinds::UpstairsSessionKind;
+use serde::{Deserialize, Serialize};
+use std::net::SocketAddrV6;
+
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "live_repair_notification_type", schema = "public"))]
+    pub struct LiveRepairNotificationTypeEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
+    #[diesel(sql_type = LiveRepairNotificationTypeEnum)]
+    pub enum LiveRepairNotificationType;
+
+    // Notification types
+    Started => b"started"
+    Succeeded => b"succeeded"
+    Failed => b"failed"
+);
+
+/// A record of Crucible live repair notifications: when a live repair
+/// started, succeeded, failed, etc.
+///
+/// Each live repair attempt is uniquely identified by the repair ID, upstairs
+/// ID, session ID, and region ID. How those change tells Nexus about what is
+/// going on:
+///
+/// - if all IDs are the same for different requests, Nexus knows that the
+///   client is retrying the notification.
+///
+/// - if the upstairs ID, session ID, and region ID are all the same, but the
+///   repair ID is different, then the same Upstairs is trying to repair that
+///   region again. This could be due to a failed first attempt, or that
+///   downstairs may have been kicked out again.
+///
+/// - if the upstairs ID and region ID are the same, but the session ID and
+///   repair ID are different, then a different session of the same Upstairs
+///   is trying to repair that Downstairs. Session IDs change each time the
+///   Upstairs is created, so it could have crashed, or it could have been
+///   migrated and the destination Propolis' Upstairs is attempting to repair
+///   the same region.
+#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
+#[diesel(table_name = live_repair_notification)]
+pub struct LiveRepairNotification {
+    pub time: DateTime<Utc>,
+
+    pub repair_id: DbTypedUuid<LiveRepairKind>,
+    pub upstairs_id: DbTypedUuid<UpstairsKind>,
+    pub session_id: DbTypedUuid<UpstairsSessionKind>,
+
+    pub region_id: DbTypedUuid<DownstairsRegionKind>,
+    pub target_ip: ipv6::Ipv6Addr,
+    pub target_port: SqlU16,
+
+    pub notification_type: LiveRepairNotificationType,
+}
+
+impl LiveRepairNotification {
+    pub fn new(
+        repair_id: TypedUuid<LiveRepairKind>,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        session_id: TypedUuid<UpstairsSessionKind>,
+        region_id: TypedUuid<DownstairsRegionKind>,
+        target_addr: SocketAddrV6,
+        notification_type: LiveRepairNotificationType,
+    ) -> Self {
+        Self {
+            time: Utc::now(),
+            repair_id: repair_id.into(),
+            upstairs_id: upstairs_id.into(),
+            session_id: session_id.into(),
+            region_id: region_id.into(),
+            target_ip: target_addr.ip().into(),
+            target_port: target_addr.port().into(),
+            notification_type,
+        }
+    }
+
+    pub fn address(&self) -> SocketAddrV6 {
+        SocketAddrV6::new(*self.target_ip, *self.target_port, 0, 0)
+    }
+}
diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs
index 54755486e5..7cc7216886 100644
--- a/nexus/db-model/src/schema.rs
+++ b/nexus/db-model/src/schema.rs
@@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion;
 ///
 /// This should be updated whenever the schema is changed. For more details,
 /// refer to: schema/crdb/README.adoc
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(36, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(37, 0, 0);

 table! {
     disk (id) {
@@ -1517,6 +1517,22 @@ table! {
     }
 }

+table! {
+    live_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) {
+        time -> Timestamptz,
+
+        repair_id -> Uuid,
+        upstairs_id -> Uuid,
+        session_id -> Uuid,
+
+        region_id -> Uuid,
+        target_ip -> Inet,
+        target_port -> Int4,
+
+        notification_type -> crate::LiveRepairNotificationTypeEnum,
+    }
+}
+
 table! {
     db_metadata (singleton) {
         singleton -> Bool,
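For reference, constructing a record and reading the target address back out uses only `new` and `address` from the model above; the standalone `sketch` function is illustrative:

```rust
use nexus_db_model::{LiveRepairNotification, LiveRepairNotificationType};
use omicron_uuid_kinds::TypedUuid;
use std::net::SocketAddrV6;

fn sketch() {
    let target: SocketAddrV6 =
        "[fd00:1122:3344:101::8]:12345".parse().unwrap();

    // The typed-UUID kinds are inferred from `new`'s parameter types.
    let record = LiveRepairNotification::new(
        TypedUuid::new_v4(), // repair_id
        TypedUuid::new_v4(), // upstairs_id
        TypedUuid::new_v4(), // session_id
        TypedUuid::new_v4(), // region_id
        target,
        LiveRepairNotificationType::Started,
    );

    // The IP and port are stored as separate columns but round-trip back
    // into a SocketAddrV6.
    assert_eq!(record.address(), target);
}
```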
diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs
index d0b093ff45..f502e3afad 100644
--- a/nexus/db-queries/src/db/datastore/volume.rs
+++ b/nexus/db-queries/src/db/datastore/volume.rs
@@ -6,10 +6,13 @@
 use super::DataStore;
 use crate::db;
+use crate::db::datastore::OpContext;
 use crate::db::error::public_error_from_diesel;
 use crate::db::error::ErrorHandler;
 use crate::db::identity::Asset;
 use crate::db::model::Dataset;
+use crate::db::model::LiveRepairNotification;
+use crate::db::model::LiveRepairNotificationType;
 use crate::db::model::Region;
 use crate::db::model::RegionSnapshot;
 use crate::db::model::Volume;
@@ -809,6 +812,101 @@ impl DataStore {
             public_error_from_diesel(e, ErrorHandler::Server)
         })
     }
+
+    // An Upstairs is created as part of a Volume hierarchy if the Volume
+    // Construction Request includes a "Region" variant. This may be at any
+    // layer of the Volume, and some notifications will come from an Upstairs
+    // instead of the top level of the Volume. The following functions have an
+    // Upstairs ID instead of a Volume ID for this reason.
+
+    /// Record when an Upstairs notifies us about a live repair. If that
+    /// record (uniquely identified by the four IDs passed in plus the
+    /// notification type) exists already, do nothing.
+    pub async fn live_repair_notification(
+        &self,
+        opctx: &OpContext,
+        record: LiveRepairNotification,
+    ) -> Result<(), Error> {
+        use db::schema::live_repair_notification::dsl;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+        let err = OptionalError::new();
+
+        self.transaction_retry_wrapper("live_repair_notification")
+            .transaction(&conn, |conn| {
+                let record = record.clone();
+                let err = err.clone();
+
+                async move {
+                    match &record.notification_type {
+                        LiveRepairNotificationType::Started => {
+                            // Proceed - the insertion can succeed or fail
+                            // below based on the table's primary key
+                        }
+
+                        LiveRepairNotificationType::Succeeded
+                        | LiveRepairNotificationType::Failed => {
+                            // However, Nexus must accept only one "finished"
+                            // status - an Upstairs cannot change this and
+                            // must instead perform another repair with a new
+                            // repair ID.
+                            let maybe_existing_finish_record: Option<
+                                LiveRepairNotification,
+                            > = dsl::live_repair_notification
+                                .filter(dsl::repair_id.eq(record.repair_id))
+                                .filter(dsl::upstairs_id.eq(record.upstairs_id))
+                                .filter(dsl::session_id.eq(record.session_id))
+                                .filter(dsl::region_id.eq(record.region_id))
+                                .filter(dsl::notification_type.eq_any(vec![
+                                    LiveRepairNotificationType::Succeeded,
+                                    LiveRepairNotificationType::Failed,
+                                ]))
+                                .get_result_async(&conn)
+                                .await
+                                .optional()?;
+
+                            if let Some(existing_finish_record) =
+                                maybe_existing_finish_record
+                            {
+                                if existing_finish_record.notification_type
+                                    != record.notification_type
+                                {
+                                    return Err(err.bail(Error::conflict(
+                                        "existing finish record does not match",
+                                    )));
+                                } else {
+                                    // inserting the same record, bypass
+                                    return Ok(());
+                                }
+                            }
+                        }
+                    }
+
+                    diesel::insert_into(dsl::live_repair_notification)
+                        .values(record)
+                        .on_conflict((
+                            dsl::repair_id,
+                            dsl::upstairs_id,
+                            dsl::session_id,
+                            dsl::region_id,
+                            dsl::notification_type,
+                        ))
+                        .do_nothing()
+                        .execute_async(&conn)
+                        .await?;
+
+                    Ok(())
+                }
+            })
+            .await
+            .map_err(|e| {
+                if let Some(err) = err.take() {
+                    err
+                } else {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                }
+            })
+    }
 }

 #[derive(Default, Clone, Debug, Serialize, Deserialize)]
diff --git a/nexus/db-queries/src/db/pool_connection.rs b/nexus/db-queries/src/db/pool_connection.rs
index 66fb125a7c..81855206e9 100644
--- a/nexus/db-queries/src/db/pool_connection.rs
+++ b/nexus/db-queries/src/db/pool_connection.rs
@@ -51,6 +51,7 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[
     "ip_attach_state",
     "ip_kind",
     "ip_pool_resource_type",
+    "live_repair_notification_type",
     "network_interface_kind",
     "physical_disk_kind",
     "producer_kind",
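The branch for terminal notifications enforces one rule: a repair can have at most one finish status; resending the same status is a no-op, and sending a different one is a conflict. Distilled into a hypothetical standalone function (not in the patch), the decision is:

```rust
use nexus_db_model::LiveRepairNotificationType;

// Hypothetical distillation of the datastore's finish-record check.
// Ok(true) = insert the record, Ok(false) = exact duplicate, skip the insert.
fn finish_record_action(
    existing: Option<LiveRepairNotificationType>,
    incoming: LiveRepairNotificationType,
) -> Result<bool, &'static str> {
    match existing {
        None => Ok(true),
        Some(t) if t == incoming => Ok(false),
        Some(_) => Err("existing finish record does not match"),
    }
}
```

The `Started` case needs no such check: the composite primary key plus `ON CONFLICT ... DO NOTHING` already makes retried start notifications harmless.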
diff --git a/nexus/src/app/volume.rs b/nexus/src/app/volume.rs
index c36c4524c1..b9052b1eb1 100644
--- a/nexus/src/app/volume.rs
+++ b/nexus/src/app/volume.rs
@@ -5,9 +5,15 @@
 //! Volumes

 use crate::app::sagas;
+use nexus_db_model::LiveRepairNotification;
+use nexus_db_model::LiveRepairNotificationType;
 use nexus_db_queries::authn;
 use nexus_db_queries::context::OpContext;
 use omicron_common::api::external::DeleteResult;
+use omicron_common::api::internal::nexus::RepairFinishInfo;
+use omicron_common::api::internal::nexus::RepairStartInfo;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
 use std::sync::Arc;
 use uuid::Uuid;
@@ -30,4 +36,78 @@ impl super::Nexus {

         Ok(())
     }
+
+    /// An Upstairs is telling us when a live repair is starting.
+    pub(crate) async fn live_repair_start(
+        self: &Arc<Self>,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        repair_start_info: RepairStartInfo,
+    ) -> DeleteResult {
+        info!(
+            self.log,
+            "received live_repair_start from upstairs {upstairs_id}: {:?}",
+            repair_start_info,
+        );
+
+        for repaired_downstairs in repair_start_info.repairs {
+            self.db_datastore
+                .live_repair_notification(
+                    opctx,
+                    LiveRepairNotification::new(
+                        repair_start_info.repair_id,
+                        upstairs_id,
+                        repair_start_info.session_id,
+                        repaired_downstairs.region_uuid,
+                        repaired_downstairs.target_addr,
+                        LiveRepairNotificationType::Started,
+                    ),
+                )
+                .await?;
+        }
+
+        Ok(())
+    }
+
+    /// An Upstairs is telling us when a live repair is finished, and the
+    /// result.
+    pub(crate) async fn live_repair_finish(
+        self: &Arc<Self>,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        repair_finish_info: RepairFinishInfo,
+    ) -> DeleteResult {
+        info!(
+            self.log,
+            "received live_repair_finish from upstairs {upstairs_id}: {:?}",
+            repair_finish_info,
+        );
+
+        for repaired_downstairs in repair_finish_info.repairs {
+            self.db_datastore
+                .live_repair_notification(
+                    opctx,
+                    LiveRepairNotification::new(
+                        repair_finish_info.repair_id,
+                        upstairs_id,
+                        repair_finish_info.session_id,
+                        repaired_downstairs.region_uuid,
+                        repaired_downstairs.target_addr,
+                        if repair_finish_info.aborted {
+                            LiveRepairNotificationType::Failed
+                        } else {
+                            LiveRepairNotificationType::Succeeded
+                        },
+                    ),
+                )
+                .await?;
+
+            if !repair_finish_info.aborted {
+                // TODO-followup if there's an active region replacement
+                // occurring, a successfully completed live repair can
+                // trigger a saga to destroy the original region.
+            }
+        }
+
+        Ok(())
+    }
 }
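On the other side of this API, a caller (ultimately Crucible) needs nothing more than an HTTP POST to the endpoints registered below. A hedged sketch, assuming a plain `reqwest` client with its `json` feature rather than whatever generated client Crucible actually uses; `nexus_url` is an assumed parameter pointing at the internal API:

```rust
use omicron_common::api::internal::nexus::RepairStartInfo;
use omicron_uuid_kinds::{TypedUuid, UpstairsKind};

// Hypothetical caller-side helper: deliver a start notification to Nexus.
async fn notify_repair_start(
    client: &reqwest::Client,
    nexus_url: &str,
    upstairs_id: TypedUuid<UpstairsKind>,
    info: &RepairStartInfo,
) -> reqwest::Result<()> {
    let url =
        format!("{nexus_url}/upstairs/{upstairs_id}/live-repair-start");
    // A 204 comes back on success; anything else surfaces as an error.
    client.post(&url).json(info).send().await?.error_for_status()?;
    Ok(())
}
```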
diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs
index eddc834a2a..e1ef86b055 100644
--- a/nexus/src/internal_api/http_entrypoints.rs
+++ b/nexus/src/internal_api/http_entrypoints.rs
@@ -41,8 +41,12 @@ use omicron_common::api::external::http_pagination::ScanParams;
 use omicron_common::api::external::Error;
 use omicron_common::api::internal::nexus::DiskRuntimeState;
 use omicron_common::api::internal::nexus::ProducerEndpoint;
+use omicron_common::api::internal::nexus::RepairFinishInfo;
+use omicron_common::api::internal::nexus::RepairStartInfo;
 use omicron_common::api::internal::nexus::SledInstanceState;
 use omicron_common::update::ArtifactId;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
 use oximeter::types::ProducerResults;
 use oximeter_producer::{collect, ProducerIdPathParams};
 use schemars::JsonSchema;
@@ -71,6 +75,8 @@ pub(crate) fn internal_api() -> NexusApiDescription {
     api.register(cpapi_collectors_post)?;
     api.register(cpapi_metrics_collect)?;
     api.register(cpapi_artifact_download)?;
+    api.register(cpapi_live_repair_start)?;
+    api.register(cpapi_live_repair_finish)?;

     api.register(saga_list)?;
     api.register(saga_view)?;
@@ -479,6 +485,68 @@ async fn cpapi_artifact_download(
     Ok(HttpResponseOk(Body::from(body).into()))
 }

+/// Path parameters for Upstairs requests (internal API)
+#[derive(Deserialize, JsonSchema)]
+struct UpstairsPathParam {
+    upstairs_id: TypedUuid<UpstairsKind>,
+}
+
+/// An Upstairs will notify this endpoint when a live repair starts
+#[endpoint {
+    method = POST,
+    path = "/upstairs/{upstairs_id}/live-repair-start",
+}]
+async fn cpapi_live_repair_start(
+    rqctx: RequestContext<Arc<ServerContext>>,
+    path_params: Path<UpstairsPathParam>,
+    repair_start_info: TypedBody<RepairStartInfo>,
+) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+    let apictx = rqctx.context();
+    let nexus = &apictx.nexus;
+    let path = path_params.into_inner();
+
+    let handler = async {
+        let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
+        nexus
+            .live_repair_start(
+                &opctx,
+                path.upstairs_id,
+                repair_start_info.into_inner(),
+            )
+            .await?;
+        Ok(HttpResponseUpdatedNoContent())
+    };
+    apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
+}
+
+/// An Upstairs will notify this endpoint when a live repair finishes.
+#[endpoint {
+    method = POST,
+    path = "/upstairs/{upstairs_id}/live-repair-finish",
+}]
+async fn cpapi_live_repair_finish(
+    rqctx: RequestContext<Arc<ServerContext>>,
+    path_params: Path<UpstairsPathParam>,
+    repair_finish_info: TypedBody<RepairFinishInfo>,
+) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+    let apictx = rqctx.context();
+    let nexus = &apictx.nexus;
+    let path = path_params.into_inner();
+
+    let handler = async {
+        let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
+        nexus
+            .live_repair_finish(
+                &opctx,
+                path.upstairs_id,
+                repair_finish_info.into_inner(),
+            )
+            .await?;
+        Ok(HttpResponseUpdatedNoContent())
+    };
+    apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
+}
+
 // Sagas

 /// List sagas
diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs
index 34f037ee8c..163267a1d0 100644
--- a/nexus/tests/integration_tests/volume_management.rs
+++ b/nexus/tests/integration_tests/volume_management.rs
@@ -24,6 +24,12 @@ use omicron_common::api::external::ByteCount;
 use omicron_common::api::external::Disk;
 use omicron_common::api::external::IdentityMetadataCreateParams;
 use omicron_common::api::external::Name;
+use omicron_common::api::internal;
+use omicron_uuid_kinds::DownstairsRegionKind;
+use omicron_uuid_kinds::LiveRepairKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
+use omicron_uuid_kinds::UpstairsSessionKind;
 use rand::prelude::SliceRandom;
 use rand::{rngs::StdRng, SeedableRng};
 use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest};
@@ -2552,3 +2558,445 @@ async fn test_volume_hard_delete_idempotent(
     datastore.volume_hard_delete(volume_id).await.unwrap();
     datastore.volume_hard_delete(volume_id).await.unwrap();
 }
+
+// upstairs related tests
+
+/// Test that an Upstairs can reissue live repair notifications
+#[nexus_test]
+async fn test_upstairs_live_repair_notify_idempotent(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Notify start
+    let notify_url = format!("/upstairs/{upstairs_id}/live-repair-start");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Notify finish
+    let notify_url = format!("/upstairs/{upstairs_id}/live-repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
+
+/// Test that an Upstairs cannot issue different finish statuses for the same
+/// repair.
+#[nexus_test]
+async fn test_upstairs_live_repair_notify_different_finish_status(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    let notify_url = format!("/upstairs/{upstairs_id}/live-repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false, // live repair was ok
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: true, // live repair failed?
+            }),
+            StatusCode::CONFLICT,
+        )
+        .await
+        .unwrap_err();
+}
+
+/// Test that the same Upstairs can rerun a repair again.
+#[nexus_test]
+async fn test_upstairs_live_repair_same_upstairs_retry(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Simulate one failed repair
+
+    let notify_start_url =
+        format!("/upstairs/{upstairs_id}/live-repair-start");
+    let notify_finish_url =
+        format!("/upstairs/{upstairs_id}/live-repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: true,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Simulate the same Upstairs restarting the repair, which passes this
+    // time
+
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
+
+/// Test that a different Upstairs session can rerun a repair again.
+#[nexus_test]
+async fn test_upstairs_live_repair_different_upstairs_retry(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Simulate one failed repair by one Upstairs
+
+    let notify_start_url =
+        format!("/upstairs/{upstairs_id}/live-repair-start");
+    let notify_finish_url =
+        format!("/upstairs/{upstairs_id}/live-repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: true,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Simulate a different Upstairs session restarting the repair, which
+    // passes this time
+
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
+
+/// Test that a different Upstairs session can rerun an interrupted repair
+#[nexus_test]
+async fn test_upstairs_live_repair_different_upstairs_retry_interrupted(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Simulate one failed repair by one Upstairs, which was interrupted
+    // (which leads to no finish message).
+
+    let notify_start_url =
+        format!("/upstairs/{upstairs_id}/live-repair-start");
+    let notify_finish_url =
+        format!("/upstairs/{upstairs_id}/live-repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Simulate a different Upstairs session restarting the interrupted
+    // repair, which passes this time
+
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<LiveRepairKind> = TypedUuid::new_v4();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                session_id,
+                repair_id,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
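Taken together, these tests exercise the ID semantics described in the model's doc comment. A hypothetical classifier (the `RepairKey` and `Interpretation` types below are assumptions for illustration, not part of the patch) makes the cases explicit:

```rust
use omicron_uuid_kinds::{
    DownstairsRegionKind, LiveRepairKind, TypedUuid, UpstairsKind,
    UpstairsSessionKind,
};

// Hypothetical key type mirroring the table's primary key, minus the
// notification type.
#[derive(PartialEq)]
struct RepairKey {
    upstairs_id: TypedUuid<UpstairsKind>,
    session_id: TypedUuid<UpstairsSessionKind>,
    repair_id: TypedUuid<LiveRepairKind>,
    region_id: TypedUuid<DownstairsRegionKind>,
}

enum Interpretation {
    RetriedNotification,     // all four IDs match
    SameUpstairsNewAttempt,  // only the repair ID changed
    NewSessionNewAttempt,    // session (and repair) ID changed
    Unrelated,               // different Upstairs or region
}

fn classify(prev: &RepairKey, next: &RepairKey) -> Interpretation {
    if prev.upstairs_id != next.upstairs_id
        || prev.region_id != next.region_id
    {
        Interpretation::Unrelated
    } else if prev.session_id == next.session_id {
        if prev.repair_id == next.repair_id {
            Interpretation::RetriedNotification
        } else {
            Interpretation::SameUpstairsNewAttempt
        }
    } else {
        Interpretation::NewSessionNewAttempt
    }
}
```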
diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json
index 53a53fb219..b92c5f81a3 100644
--- a/openapi/nexus-internal.json
+++ b/openapi/nexus-internal.json
@@ -962,6 +962,80 @@
         }
       }
     },
+    "/upstairs/{upstairs_id}/live-repair-finish": {
+      "post": {
+        "summary": "An Upstairs will notify this endpoint when a live repair finishes.",
+        "operationId": "cpapi_live_repair_finish",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "upstairs_id",
+            "required": true,
+            "schema": {
+              "$ref": "#/components/schemas/TypedUuidForUpstairsKind"
+            }
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/RepairFinishInfo"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "204": {
+            "description": "resource updated"
+          },
+          "4XX": {
+            "$ref": "#/components/responses/Error"
+          },
+          "5XX": {
+            "$ref": "#/components/responses/Error"
+          }
+        }
+      }
+    },
+    "/upstairs/{upstairs_id}/live-repair-start": {
+      "post": {
+        "summary": "An Upstairs will notify this endpoint when a live repair starts",
+        "operationId": "cpapi_live_repair_start",
+        "parameters": [
+          {
+            "in": "path",
+            "name": "upstairs_id",
+            "required": true,
+            "schema": {
+              "$ref": "#/components/schemas/TypedUuidForUpstairsKind"
+            }
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/RepairStartInfo"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "204": {
+            "description": "resource updated"
+          },
+          "4XX": {
+            "$ref": "#/components/responses/Error"
+          },
+          "5XX": {
+            "$ref": "#/components/responses/Error"
+          }
+        }
+      }
+    },
     "/volume/{volume_id}/remove-read-only-parent": {
       "post": {
         "summary": "Request removal of a read_only_parent from a volume",
@@ -3409,6 +3483,21 @@
           }
         ]
       },
+      "DownstairsUnderRepair": {
+        "type": "object",
+        "properties": {
+          "region_uuid": {
+            "$ref": "#/components/schemas/TypedUuidForDownstairsRegionKind"
+          },
+          "target_addr": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "region_uuid",
+          "target_addr"
+        ]
+      },
       "Duration": {
         "type": "object",
         "properties": {
@@ -5759,6 +5848,54 @@
           "user_password_hash"
         ]
       },
+      "RepairFinishInfo": {
+        "type": "object",
+        "properties": {
+          "aborted": {
+            "type": "boolean"
+          },
+          "repair_id": {
+            "$ref": "#/components/schemas/TypedUuidForLiveRepairKind"
+          },
+          "repairs": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/DownstairsUnderRepair"
+            }
+          },
+          "session_id": {
+            "$ref": "#/components/schemas/TypedUuidForUpstairsSessionKind"
+          }
+        },
+        "required": [
+          "aborted",
+          "repair_id",
+          "repairs",
+          "session_id"
+        ]
+      },
+      "RepairStartInfo": {
+        "type": "object",
+        "properties": {
+          "repair_id": {
+            "$ref": "#/components/schemas/TypedUuidForLiveRepairKind"
+          },
+          "repairs": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/DownstairsUnderRepair"
+            }
+          },
+          "session_id": {
+            "$ref": "#/components/schemas/TypedUuidForUpstairsSessionKind"
+          }
+        },
+        "required": [
+          "repair_id",
+          "repairs",
+          "session_id"
+        ]
+      },
       "RouteConfig": {
         "type": "object",
         "properties": {
@@ -6546,6 +6683,18 @@
       "SwitchPutResponse": {
         "type": "object"
       },
+      "TypedUuidForDownstairsRegionKind": {
+        "type": "string",
+        "format": "uuid"
+      },
+      "TypedUuidForLiveRepairKind": {
+        "type": "string",
+        "format": "uuid"
+      },
+      "TypedUuidForUpstairsSessionKind": {
+        "type": "string",
+        "format": "uuid"
+      },
       "UserId": {
         "title": "A name unique within the parent collection",
         "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID though they may contain a UUID.",
@@ -6643,6 +6792,10 @@
             ]
           }
         ]
+      },
+      "TypedUuidForUpstairsKind": {
+        "type": "string",
+        "format": "uuid"
       }
     },
     "responses": {
diff --git a/schema/crdb/37.0.0/up01.sql b/schema/crdb/37.0.0/up01.sql
new file mode 100644
index 0000000000..91c27bd81a
--- /dev/null
+++ b/schema/crdb/37.0.0/up01.sql
@@ -0,0 +1,5 @@
+CREATE TYPE IF NOT EXISTS omicron.public.live_repair_notification_type AS ENUM (
+    'started',
+    'succeeded',
+    'failed'
+);
diff --git a/schema/crdb/37.0.0/up02.sql b/schema/crdb/37.0.0/up02.sql
new file mode 100644
index 0000000000..58d33e39c8
--- /dev/null
+++ b/schema/crdb/37.0.0/up02.sql
@@ -0,0 +1,19 @@
+CREATE TABLE IF NOT EXISTS live_repair_notification (
+    time TIMESTAMPTZ NOT NULL,
+
+    repair_id UUID NOT NULL,
+    upstairs_id UUID NOT NULL,
+    session_id UUID NOT NULL,
+
+    region_id UUID NOT NULL,
+    target_ip INET NOT NULL,
+    target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL,
+
+    notification_type omicron.public.live_repair_notification_type NOT NULL,
+
+    /*
+     * A live repair is uniquely identified by the four UUIDs here, and a
+     * notification is uniquely identified by its type.
+     */
+    PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type)
+);
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index 87a22d1adc..64be993e64 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -3511,6 +3511,32 @@ SELECT
     deleted
 FROM
     interleaved_versions;

+CREATE TYPE IF NOT EXISTS omicron.public.live_repair_notification_type AS ENUM (
+    'started',
+    'succeeded',
+    'failed'
+);
+
+CREATE TABLE IF NOT EXISTS live_repair_notification (
+    time TIMESTAMPTZ NOT NULL,
+
+    repair_id UUID NOT NULL,
+    upstairs_id UUID NOT NULL,
+    session_id UUID NOT NULL,
+
+    region_id UUID NOT NULL,
+    target_ip INET NOT NULL,
+    target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL,
+
+    notification_type omicron.public.live_repair_notification_type NOT NULL,
+
+    /*
+     * A live repair is uniquely identified by the four UUIDs here, and a
+     * notification is uniquely identified by its type.
+     */
+    PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type)
+);
+
 INSERT INTO omicron.public.db_metadata (
     singleton,
     time_created,
@@ -3518,7 +3544,7 @@ INSERT INTO omicron.public.db_metadata (
     version,
     target_version
 ) VALUES
-    ( TRUE, NOW(), NOW(), '36.0.0', NULL)
+    ( TRUE, NOW(), NOW(), '37.0.0', NULL)
 ON CONFLICT DO NOTHING;

 COMMIT;
diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs
index 12bc756d68..871de4d8ca 100644
--- a/uuid-kinds/src/lib.rs
+++ b/uuid-kinds/src/lib.rs
@@ -45,6 +45,10 @@ macro_rules! impl_typed_uuid_kind {

 // Please keep this list in alphabetical order.
 impl_typed_uuid_kind! {
+    DownstairsRegionKind => "downstairs_region",
+    LiveRepairKind => "live_repair",
     LoopbackAddressKind => "loopback_address",
     TufRepoKind => "tuf_repo",
+    UpstairsKind => "upstairs",
+    UpstairsSessionKind => "upstairs_session",
 }
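The commit message mentions using this table to track how many times repair occurs for each region; nothing in the patch does that yet, but a later query could. A sketch of such a follow-up, assuming Diesel's `group_by` support and reusing the items already imported in the datastore's volume.rs (`DataStore`, `OpContext`, `Error`, `public_error_from_diesel`, `ErrorHandler`); the method itself is hypothetical:

```rust
use crate::db;
use async_bb8_diesel::AsyncRunQueryDsl;
use diesel::prelude::*;

impl DataStore {
    // Hypothetical follow-up: count "started" notifications per region.
    pub async fn live_repairs_started_per_region(
        &self,
        opctx: &OpContext,
    ) -> Result<Vec<(uuid::Uuid, i64)>, Error> {
        use db::schema::live_repair_notification::dsl;

        let conn = self.pool_connection_authorized(opctx).await?;

        dsl::live_repair_notification
            .filter(
                dsl::notification_type
                    .eq(LiveRepairNotificationType::Started),
            )
            .group_by(dsl::region_id)
            .select((dsl::region_id, diesel::dsl::count_star()))
            .load_async(&*conn)
            .await
            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
    }
}
```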