From 128a998058a181b3420ffb240959652c754082be Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 23 Feb 2024 16:08:01 -0500 Subject: [PATCH 01/24] Accept live repair status reports from Crucible Allow any Upstairs to notify Nexus about the start or completion (plus status) of live repairs. The motivation for this was to be used in the final stage of region replacement to notify Nexus that the replacement has finished, but more generally this can be used to keep track of how many times repair occurs for each region. Fixes #5120 --- Cargo.lock | 1 + common/src/api/internal/nexus.rs | 25 + nexus/Cargo.toml | 2 +- nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/live_repair.rs | 95 ++++ nexus/db-model/src/schema.rs | 18 +- nexus/db-queries/src/db/datastore/volume.rs | 98 ++++ nexus/db-queries/src/db/pool_connection.rs | 1 + nexus/src/app/volume.rs | 80 ++++ nexus/src/internal_api/http_entrypoints.rs | 68 +++ .../integration_tests/volume_management.rs | 448 ++++++++++++++++++ openapi/nexus-internal.json | 153 ++++++ schema/crdb/37.0.0/up01.sql | 5 + schema/crdb/37.0.0/up02.sql | 19 + schema/crdb/dbinit.sql | 28 +- uuid-kinds/src/lib.rs | 4 + 16 files changed, 1044 insertions(+), 3 deletions(-) create mode 100644 nexus/db-model/src/live_repair.rs create mode 100644 schema/crdb/37.0.0/up01.sql create mode 100644 schema/crdb/37.0.0/up02.sql diff --git a/Cargo.lock b/Cargo.lock index e15afdfbab..5b0806cb4d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5045,6 +5045,7 @@ dependencies = [ "omicron-rpaths", "omicron-sled-agent", "omicron-test-utils", + "omicron-uuid-kinds", "omicron-workspace-hack", "once_cell", "openapi-lint", diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 3972e011cf..48140bfafe 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -9,6 +9,10 @@ use crate::api::external::{ InstanceState, IpNet, SemverVersion, Vni, }; use chrono::{DateTime, Utc}; +use 
omicron_uuid_kinds::DownstairsRegionKind; +use omicron_uuid_kinds::LiveRepairKind; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsSessionKind; use parse_display::{Display, FromStr}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -251,3 +255,24 @@ pub enum HostIdentifier { Ip(IpNet), Vpc(Vni), } + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct DownstairsUnderRepair { + pub region_uuid: TypedUuid, + pub target_addr: std::net::SocketAddrV6, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct RepairStartInfo { + pub session_id: TypedUuid, + pub repair_id: TypedUuid, + pub repairs: Vec, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct RepairFinishInfo { + pub session_id: TypedUuid, + pub repair_id: TypedUuid, + pub repairs: Vec, + pub aborted: bool, +} diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 6e9f2f135d..957640b07c 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -76,7 +76,6 @@ tokio-postgres = { workspace = true, features = ["with-serde_json-1"] } tough.workspace = true trust-dns-resolver.workspace = true uuid.workspace = true - nexus-blueprint-execution.workspace = true nexus-defaults.workspace = true nexus-db-model.workspace = true @@ -93,6 +92,7 @@ rustls = { workspace = true } rustls-pemfile = { workspace = true } update-common.workspace = true omicron-workspace-hack.workspace = true +omicron-uuid-kinds.workspace = true [dev-dependencies] async-bb8-diesel.workspace = true diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index ecbb8365fe..f85815b627 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -39,6 +39,7 @@ mod ipv4net; pub mod ipv6; mod ipv6net; mod l4_port_range; +mod live_repair; mod macaddr; mod name; mod network_interface; @@ -139,6 +140,7 @@ pub use ipv4net::*; pub use ipv6::*; pub use ipv6net::*; pub use l4_port_range::*; +pub use live_repair::*; pub use name::*; pub use 
network_interface::*; pub use oximeter_info::*; diff --git a/nexus/db-model/src/live_repair.rs b/nexus/db-model/src/live_repair.rs new file mode 100644 index 0000000000..88d79b2057 --- /dev/null +++ b/nexus/db-model/src/live_repair.rs @@ -0,0 +1,95 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use crate::ipv6; +use crate::schema::live_repair_notification; +use crate::typed_uuid::DbTypedUuid; +use crate::SqlU16; +use chrono::{DateTime, Utc}; +use omicron_uuid_kinds::DownstairsRegionKind; +use omicron_uuid_kinds::LiveRepairKind; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsSessionKind; +use serde::{Deserialize, Serialize}; +use std::net::SocketAddrV6; + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "live_repair_notification_type", schema = "public"))] + pub struct LiveRepairNotificationTypeEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = LiveRepairNotificationTypeEnum)] + pub enum LiveRepairNotificationType; + + // Notification types + Started => b"started" + Succeeded => b"succeeded" + Failed => b"failed" +); + +/// A record of Crucible live repair notifications: when a live repair started, +/// succeeded, failed, etc. +/// +/// Each live repair attempt is uniquely identified by the repair ID, upstairs +/// ID, session ID, and region ID. How those change tells Nexus about what is +/// going on: +/// +/// - if all IDs are the same for different requests, Nexus knows that the +/// client is retrying the notification. +/// +/// - if the upstairs ID, session ID, and region ID are all the same, but the +/// repair ID is different, then the same Upstairs is trying to repair that +/// region again. 
This could be due to a failed first attempt, or that +/// downstairs may have been kicked out again. +/// +/// - if the upstairs ID and region ID are the same, but the session ID and +/// repair ID are different, then a different session of the same Upstairs is +/// trying to repair that Downstairs. Session IDs change each time the +/// Upstairs is created, so it could have crashed, or it could have been +/// migrated and the destination Propolis' Upstairs is attempting to repair +/// the same region. +#[derive(Queryable, Insertable, Debug, Clone, Selectable)] +#[diesel(table_name = live_repair_notification)] +pub struct LiveRepairNotification { + pub time: DateTime, + + pub repair_id: DbTypedUuid, + pub upstairs_id: DbTypedUuid, + pub session_id: DbTypedUuid, + + pub region_id: DbTypedUuid, + pub target_ip: ipv6::Ipv6Addr, + pub target_port: SqlU16, + + pub notification_type: LiveRepairNotificationType, +} + +impl LiveRepairNotification { + pub fn new( + repair_id: TypedUuid, + upstairs_id: TypedUuid, + session_id: TypedUuid, + region_id: TypedUuid, + target_addr: SocketAddrV6, + notification_type: LiveRepairNotificationType, + ) -> Self { + Self { + time: Utc::now(), + repair_id: repair_id.into(), + upstairs_id: upstairs_id.into(), + session_id: session_id.into(), + region_id: region_id.into(), + target_ip: target_addr.ip().into(), + target_port: target_addr.port().into(), + notification_type, + } + } + + pub fn address(&self) -> SocketAddrV6 { + SocketAddrV6::new(*self.target_ip, *self.target_port, 0, 0) + } +} diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 54755486e5..7cc7216886 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion; /// /// This should be updated whenever the schema is changed. 
For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(36, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(37, 0, 0); table! { disk (id) { @@ -1517,6 +1517,22 @@ table! { } } +table! { + live_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) { + time -> Timestamptz, + + repair_id -> Uuid, + upstairs_id -> Uuid, + session_id -> Uuid, + + region_id -> Uuid, + target_ip -> Inet, + target_port -> Int4, + + notification_type -> crate::LiveRepairNotificationTypeEnum, + } +} + table! { db_metadata (singleton) { singleton -> Bool, diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index d0b093ff45..f502e3afad 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -6,10 +6,13 @@ use super::DataStore; use crate::db; +use crate::db::datastore::OpContext; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::Dataset; +use crate::db::model::LiveRepairNotification; +use crate::db::model::LiveRepairNotificationType; use crate::db::model::Region; use crate::db::model::RegionSnapshot; use crate::db::model::Volume; @@ -809,6 +812,101 @@ impl DataStore { public_error_from_diesel(e, ErrorHandler::Server) }) } + + // An Upstairs is created as part of a Volume hierarchy if the Volume + // Construction Request includes a "Region" variant. This may be at any + // layer of the Volume, and some notifications will come from an Upstairs + // instead of the top level of the Volume. The following functions have an + // Upstairs ID instead of a Volume ID for this reason. + + /// Record when an Upstairs notifies us about a live repair. If that record + /// (uniquely identified by the four IDs passed in plus the notification + /// type) exists already, do nothing. 
+ pub async fn live_repair_notification( + &self, + opctx: &OpContext, + record: LiveRepairNotification, + ) -> Result<(), Error> { + use db::schema::live_repair_notification::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + let err = OptionalError::new(); + + self.transaction_retry_wrapper("live_repair_notification") + .transaction(&conn, |conn| { + let record = record.clone(); + let err = err.clone(); + + async move { + match &record.notification_type { + LiveRepairNotificationType::Started => { + // Proceed - the insertion can succeed or fail below + // based on the table's primary key + } + + LiveRepairNotificationType::Succeeded + | LiveRepairNotificationType::Failed => { + // However, Nexus must accept only one "finished" + // status - an Upstairs cannot change this and must + // instead perform another repair with a new repair + // ID. + let maybe_existing_finish_record: Option< + LiveRepairNotification, + > = dsl::live_repair_notification + .filter(dsl::repair_id.eq(record.repair_id)) + .filter(dsl::upstairs_id.eq(record.upstairs_id)) + .filter(dsl::session_id.eq(record.session_id)) + .filter(dsl::region_id.eq(record.region_id)) + .filter(dsl::notification_type.eq_any(vec![ + LiveRepairNotificationType::Succeeded, + LiveRepairNotificationType::Failed, + ])) + .get_result_async(&conn) + .await + .optional()?; + + if let Some(existing_finish_record) = + maybe_existing_finish_record + { + if existing_finish_record.notification_type + != record.notification_type + { + return Err(err.bail(Error::conflict( + "existing finish record does not match", + ))); + } else { + // inserting the same record, bypass + return Ok(()); + } + } + } + } + + diesel::insert_into(dsl::live_repair_notification) + .values(record) + .on_conflict(( + dsl::repair_id, + dsl::upstairs_id, + dsl::session_id, + dsl::region_id, + dsl::notification_type, + )) + .do_nothing() + .execute_async(&conn) + .await?; + + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) 
= err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } } #[derive(Default, Clone, Debug, Serialize, Deserialize)] diff --git a/nexus/db-queries/src/db/pool_connection.rs b/nexus/db-queries/src/db/pool_connection.rs index 66fb125a7c..81855206e9 100644 --- a/nexus/db-queries/src/db/pool_connection.rs +++ b/nexus/db-queries/src/db/pool_connection.rs @@ -51,6 +51,7 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[ "ip_attach_state", "ip_kind", "ip_pool_resource_type", + "live_repair_notification_type", "network_interface_kind", "physical_disk_kind", "producer_kind", diff --git a/nexus/src/app/volume.rs b/nexus/src/app/volume.rs index c36c4524c1..b9052b1eb1 100644 --- a/nexus/src/app/volume.rs +++ b/nexus/src/app/volume.rs @@ -5,9 +5,15 @@ //! Volumes use crate::app::sagas; +use nexus_db_model::LiveRepairNotification; +use nexus_db_model::LiveRepairNotificationType; use nexus_db_queries::authn; use nexus_db_queries::context::OpContext; use omicron_common::api::external::DeleteResult; +use omicron_common::api::internal::nexus::RepairFinishInfo; +use omicron_common::api::internal::nexus::RepairStartInfo; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsKind; use std::sync::Arc; use uuid::Uuid; @@ -30,4 +36,78 @@ impl super::Nexus { Ok(()) } + + /// An Upstairs is telling us when a live repair is starting. 
+ pub(crate) async fn live_repair_start( + self: &Arc, + opctx: &OpContext, + upstairs_id: TypedUuid, + repair_start_info: RepairStartInfo, + ) -> DeleteResult { + info!( + self.log, + "received live_repair_start from upstairs {upstairs_id}: {:?}", + repair_start_info, + ); + + for repaired_downstairs in repair_start_info.repairs { + self.db_datastore + .live_repair_notification( + opctx, + LiveRepairNotification::new( + repair_start_info.repair_id, + upstairs_id, + repair_start_info.session_id, + repaired_downstairs.region_uuid, + repaired_downstairs.target_addr, + LiveRepairNotificationType::Started, + ), + ) + .await?; + } + + Ok(()) + } + + /// An Upstairs is telling us when a live repair is finished, and the result. + pub(crate) async fn live_repair_finish( + self: &Arc, + opctx: &OpContext, + upstairs_id: TypedUuid, + repair_finish_info: RepairFinishInfo, + ) -> DeleteResult { + info!( + self.log, + "received live_repair_finish from upstairs {upstairs_id}: {:?}", + repair_finish_info, + ); + + for repaired_downstairs in repair_finish_info.repairs { + self.db_datastore + .live_repair_notification( + opctx, + LiveRepairNotification::new( + repair_finish_info.repair_id, + upstairs_id, + repair_finish_info.session_id, + repaired_downstairs.region_uuid, + repaired_downstairs.target_addr, + if repair_finish_info.aborted { + LiveRepairNotificationType::Failed + } else { + LiveRepairNotificationType::Succeeded + }, + ), + ) + .await?; + + if !repair_finish_info.aborted { + // TODO-followup if there's an active region replacement + // occurring, a successfully completed live repair can trigger a + // saga to destroy the original region. 
+ } + } + + Ok(()) + } } diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index eddc834a2a..e1ef86b055 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -41,8 +41,12 @@ use omicron_common::api::external::http_pagination::ScanParams; use omicron_common::api::external::Error; use omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::RepairFinishInfo; +use omicron_common::api::internal::nexus::RepairStartInfo; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_common::update::ArtifactId; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsKind; use oximeter::types::ProducerResults; use oximeter_producer::{collect, ProducerIdPathParams}; use schemars::JsonSchema; @@ -71,6 +75,8 @@ pub(crate) fn internal_api() -> NexusApiDescription { api.register(cpapi_collectors_post)?; api.register(cpapi_metrics_collect)?; api.register(cpapi_artifact_download)?; + api.register(cpapi_live_repair_start)?; + api.register(cpapi_live_repair_finish)?; api.register(saga_list)?; api.register(saga_view)?; @@ -479,6 +485,68 @@ async fn cpapi_artifact_download( Ok(HttpResponseOk(Body::from(body).into())) } +/// Path parameters for Upstairs requests (internal API) +#[derive(Deserialize, JsonSchema)] +struct UpstairsPathParam { + upstairs_id: TypedUuid, +} + +/// An Upstairs will notify this endpoint when a live repair starts +#[endpoint { + method = POST, + path = "/upstairs/{upstairs_id}/live-repair-start", + }] +async fn cpapi_live_repair_start( + rqctx: RequestContext>, + path_params: Path, + repair_start_info: TypedBody, +) -> Result { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + nexus + 
.live_repair_start( + &opctx, + path.upstairs_id, + repair_start_info.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// An Upstairs will notify this endpoint when a live repair finishes. +#[endpoint { + method = POST, + path = "/upstairs/{upstairs_id}/live-repair-finish", + }] +async fn cpapi_live_repair_finish( + rqctx: RequestContext>, + path_params: Path, + repair_finish_info: TypedBody, +) -> Result { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .live_repair_finish( + &opctx, + path.upstairs_id, + repair_finish_info.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + // Sagas /// List sagas diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 34f037ee8c..163267a1d0 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -24,6 +24,12 @@ use omicron_common::api::external::ByteCount; use omicron_common::api::external::Disk; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Name; +use omicron_common::api::internal; +use omicron_uuid_kinds::DownstairsRegionKind; +use omicron_uuid_kinds::LiveRepairKind; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsSessionKind; use rand::prelude::SliceRandom; use rand::{rngs::StdRng, SeedableRng}; use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; @@ -2552,3 +2558,445 @@ async fn test_volume_hard_delete_idempotent( datastore.volume_hard_delete(volume_id).await.unwrap(); 
datastore.volume_hard_delete(volume_id).await.unwrap(); } + +// upstairs related tests + +/// Test that an Upstairs can reissue live repair notifications +#[nexus_test] +async fn test_upstairs_live_repair_notify_idempotent( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let session_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + let region_id: TypedUuid = TypedUuid::new_v4(); + + // Notify start + let notify_url = format!("/upstairs/{upstairs_id}/live_repair_start"); + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(internal::nexus::RepairStartInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(internal::nexus::RepairStartInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // Notify finish + let notify_url = format!("/upstairs/{upstairs_id}/live_repair_finish"); + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(internal::nexus::RepairFinishInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: false, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(internal::nexus::RepairFinishInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: 
"[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: false, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} + +/// Test that an Upstairs cannot issue different finish statuses for the same +/// repair. +#[nexus_test] +async fn test_upstairs_live_repair_notify_different_finish_status( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let session_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + let region_id: TypedUuid = TypedUuid::new_v4(); + + let notify_url = format!("/upstairs/{upstairs_id}/live_repair_finish"); + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(internal::nexus::RepairFinishInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: false, // live repair was ok + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_url, + Some(internal::nexus::RepairFinishInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: true, // live repair failed? + }), + StatusCode::CONFLICT, + ) + .await + .unwrap_err(); +} + +/// Test that the same Upstairs can rerun a repair again. 
+#[nexus_test] +async fn test_upstairs_live_repair_same_upstairs_retry( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let session_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + let region_id: TypedUuid = TypedUuid::new_v4(); + + // Simulate one failed repair + + let notify_start_url = format!("/upstairs/{upstairs_id}/live_repair_start"); + let notify_finish_url = + format!("/upstairs/{upstairs_id}/live_repair_finish"); + + int_client + .make_request( + Method::POST, + ¬ify_start_url, + Some(internal::nexus::RepairStartInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_finish_url, + Some(internal::nexus::RepairFinishInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: true, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // Simulate the same Upstairs restarting the repair, which passes this time + + let repair_id: TypedUuid = TypedUuid::new_v4(); + + int_client + .make_request( + Method::POST, + ¬ify_start_url, + Some(internal::nexus::RepairStartInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_finish_url, + Some(internal::nexus::RepairFinishInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + 
target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: false, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} + +/// Test that a different Upstairs session can rerun a repair again. +#[nexus_test] +async fn test_upstairs_live_repair_different_upstairs_retry( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let session_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + let region_id: TypedUuid = TypedUuid::new_v4(); + + // Simulate one failed repair by one Upstairs + + let notify_start_url = format!("/upstairs/{upstairs_id}/live_repair_start"); + let notify_finish_url = + format!("/upstairs/{upstairs_id}/live_repair_finish"); + + int_client + .make_request( + Method::POST, + ¬ify_start_url, + Some(internal::nexus::RepairStartInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_finish_url, + Some(internal::nexus::RepairFinishInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: true, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // Simulate a different Upstairs session restarting the repair, which passes this time + + let session_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + + int_client + .make_request( + Method::POST, + ¬ify_start_url, + Some(internal::nexus::RepairStartInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + 
.parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_finish_url, + Some(internal::nexus::RepairFinishInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: false, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} + +/// Test that a different Upstairs session can rerun an interrupted repair +#[nexus_test] +async fn test_upstairs_live_repair_different_upstairs_retry_interrupted( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let session_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + let region_id: TypedUuid = TypedUuid::new_v4(); + + // Simulate one failed repair by one Upstairs, which was interrupted (which + // leads to no finish message). 
+ + let notify_start_url = format!("/upstairs/{upstairs_id}/live_repair_start"); + let notify_finish_url = + format!("/upstairs/{upstairs_id}/live_repair_finish"); + + int_client + .make_request( + Method::POST, + ¬ify_start_url, + Some(internal::nexus::RepairStartInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // Simulate a different Upstairs session restarting the interrupted repair, + // which passes this time + + let session_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + + int_client + .make_request( + Method::POST, + ¬ify_start_url, + Some(internal::nexus::RepairStartInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + ¬ify_finish_url, + Some(internal::nexus::RepairFinishInfo { + session_id, + repair_id, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: false, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 53a53fb219..b92c5f81a3 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -962,6 +962,80 @@ } } }, + "/upstairs/{upstairs_id}/live-repair-finish": { + "post": { + "summary": "An Upstairs will notify this endpoint when a live repair finishes.", + "operationId": "cpapi_live_repair_finish", + "parameters": [ + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": 
"#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RepairFinishInfo" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/upstairs/{upstairs_id}/live-repair-start": { + "post": { + "summary": "An Upstairs will notify this endpoint when a live repair starts", + "operationId": "cpapi_live_repair_start", + "parameters": [ + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RepairStartInfo" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/volume/{volume_id}/remove-read-only-parent": { "post": { "summary": "Request removal of a read_only_parent from a volume", @@ -3409,6 +3483,21 @@ } ] }, + "DownstairsUnderRepair": { + "type": "object", + "properties": { + "region_uuid": { + "$ref": "#/components/schemas/TypedUuidForDownstairsRegionKind" + }, + "target_addr": { + "type": "string" + } + }, + "required": [ + "region_uuid", + "target_addr" + ] + }, "Duration": { "type": "object", "properties": { @@ -5759,6 +5848,54 @@ "user_password_hash" ] }, + "RepairFinishInfo": { + "type": "object", + "properties": { + "aborted": { + "type": "boolean" + }, + "repair_id": { + "$ref": "#/components/schemas/TypedUuidForLiveRepairKind" + }, + "repairs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/DownstairsUnderRepair" + } + }, + "session_id": { + "$ref": 
"#/components/schemas/TypedUuidForUpstairsSessionKind" + } + }, + "required": [ + "aborted", + "repair_id", + "repairs", + "session_id" + ] + }, + "RepairStartInfo": { + "type": "object", + "properties": { + "repair_id": { + "$ref": "#/components/schemas/TypedUuidForLiveRepairKind" + }, + "repairs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/DownstairsUnderRepair" + } + }, + "session_id": { + "$ref": "#/components/schemas/TypedUuidForUpstairsSessionKind" + } + }, + "required": [ + "repair_id", + "repairs", + "session_id" + ] + }, "RouteConfig": { "type": "object", "properties": { @@ -6546,6 +6683,18 @@ "SwitchPutResponse": { "type": "object" }, + "TypedUuidForDownstairsRegionKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForLiveRepairKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForUpstairsSessionKind": { + "type": "string", + "format": "uuid" + }, "UserId": { "title": "A name unique within the parent collection", "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. 
Names cannot be a UUID though they may contain a UUID.", @@ -6643,6 +6792,10 @@ ] } ] + }, + "TypedUuidForUpstairsKind": { + "type": "string", + "format": "uuid" } }, "responses": { diff --git a/schema/crdb/37.0.0/up01.sql b/schema/crdb/37.0.0/up01.sql new file mode 100644 index 0000000000..91c27bd81a --- /dev/null +++ b/schema/crdb/37.0.0/up01.sql @@ -0,0 +1,5 @@ +CREATE TYPE IF NOT EXISTS omicron.public.live_repair_notification_type AS ENUM ( + 'started', + 'succeeded', + 'failed' +); diff --git a/schema/crdb/37.0.0/up02.sql b/schema/crdb/37.0.0/up02.sql new file mode 100644 index 0000000000..58d33e39c8 --- /dev/null +++ b/schema/crdb/37.0.0/up02.sql @@ -0,0 +1,19 @@ +CREATE TABLE IF NOT EXISTS live_repair_notification ( + time TIMESTAMPTZ NOT NULL, + + repair_id UUID NOT NULL, + upstairs_id UUID NOT NULL, + session_id UUID NOT NULL, + + region_id UUID NOT NULL, + target_ip INET NOT NULL, + target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL, + + notification_type omicron.public.live_repair_notification_type NOT NULL, + + /* + * A live repair is uniquely identified by the four UUIDs here, and a + * notification is uniquely identified by its type. 
+ */ + PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 87a22d1adc..64be993e64 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3511,6 +3511,32 @@ SELECT deleted FROM interleaved_versions; +CREATE TYPE IF NOT EXISTS omicron.public.live_repair_notification_type AS ENUM ( + 'started', + 'succeeded', + 'failed' +); + +CREATE TABLE IF NOT EXISTS live_repair_notification ( + time TIMESTAMPTZ NOT NULL, + + repair_id UUID NOT NULL, + upstairs_id UUID NOT NULL, + session_id UUID NOT NULL, + + region_id UUID NOT NULL, + target_ip INET NOT NULL, + target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL, + + notification_type omicron.public.live_repair_notification_type NOT NULL, + + /* + * A live repair is uniquely identified by the four UUIDs here, and a + * notification is uniquely identified by its type. + */ + PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type) +); + INSERT INTO omicron.public.db_metadata ( singleton, time_created, @@ -3518,7 +3544,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '36.0.0', NULL) + ( TRUE, NOW(), NOW(), '37.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 12bc756d68..871de4d8ca 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -45,6 +45,10 @@ macro_rules! impl_typed_uuid_kind { // Please keep this list in alphabetical order. impl_typed_uuid_kind! 
{ + DownstairsRegionKind => "downstairs_region", + LiveRepairKind => "live_repair", LoopbackAddressKind => "loopback_address", TufRepoKind => "tuf_repo", + UpstairsKind => "upstairs", + UpstairsSessionKind => "upstairs_session", } From 169b478aad06e88814a95abbf501c565ef6598d5 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 23 Feb 2024 17:55:46 -0500 Subject: [PATCH 02/24] use replace directive for TypedUuidFor* --- Cargo.lock | 1 + clients/nexus-client/Cargo.toml | 1 + clients/nexus-client/src/lib.rs | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 5b0806cb4d..f6a80dd117 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4304,6 +4304,7 @@ dependencies = [ "nexus-types", "omicron-common", "omicron-passwords", + "omicron-uuid-kinds", "omicron-workspace-hack", "progenitor 0.5.0 (git+https://github.com/oxidecomputer/progenitor?branch=main)", "regress", diff --git a/clients/nexus-client/Cargo.toml b/clients/nexus-client/Cargo.toml index 965e2a7dfb..fd6df6919f 100644 --- a/clients/nexus-client/Cargo.toml +++ b/clients/nexus-client/Cargo.toml @@ -20,3 +20,4 @@ serde_json.workspace = true slog.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true +omicron-uuid-kinds.workspace = true diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 17fb5aa367..dbf13dc610 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -33,6 +33,10 @@ progenitor::generate_api!( MacAddr = omicron_common::api::external::MacAddr, Name = omicron_common::api::external::Name, NewPasswordHash = omicron_passwords::NewPasswordHash, + + TypedUuidForUpstairsKind = omicron_uuid_kinds::TypedUuid, + TypedUuidForUpstairsSessionKind = omicron_uuid_kinds::TypedUuid, + TypedUuidForLiveRepairKind = omicron_uuid_kinds::TypedUuid, } ); From 7e7934e455293bc7da3045b736365cf47ea839d4 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 26 Feb 2024 15:30:35 -0500 Subject: [PATCH 03/24] no more 
underscores --- .../integration_tests/volume_management.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 163267a1d0..c4e2c3b998 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -2574,7 +2574,7 @@ async fn test_upstairs_live_repair_notify_idempotent( let region_id: TypedUuid = TypedUuid::new_v4(); // Notify start - let notify_url = format!("/upstairs/{upstairs_id}/live_repair_start"); + let notify_url = format!("/upstairs/{upstairs_id}/live-repair-start"); int_client .make_request( @@ -2615,7 +2615,7 @@ async fn test_upstairs_live_repair_notify_idempotent( .unwrap(); // Notify finish - let notify_url = format!("/upstairs/{upstairs_id}/live_repair_finish"); + let notify_url = format!("/upstairs/{upstairs_id}/live-repair-finish"); int_client .make_request( @@ -2671,7 +2671,7 @@ async fn test_upstairs_live_repair_notify_different_finish_status( let repair_id: TypedUuid = TypedUuid::new_v4(); let region_id: TypedUuid = TypedUuid::new_v4(); - let notify_url = format!("/upstairs/{upstairs_id}/live_repair_finish"); + let notify_url = format!("/upstairs/{upstairs_id}/live-repair-finish"); int_client .make_request( @@ -2728,9 +2728,9 @@ async fn test_upstairs_live_repair_same_upstairs_retry( // Simulate one failed repair - let notify_start_url = format!("/upstairs/{upstairs_id}/live_repair_start"); + let notify_start_url = format!("/upstairs/{upstairs_id}/live-repair-start"); let notify_finish_url = - format!("/upstairs/{upstairs_id}/live_repair_finish"); + format!("/upstairs/{upstairs_id}/live-repair-finish"); int_client .make_request( @@ -2829,9 +2829,9 @@ async fn test_upstairs_live_repair_different_upstairs_retry( // Simulate one failed repair by one Upstairs - let notify_start_url = format!("/upstairs/{upstairs_id}/live_repair_start"); + let 
notify_start_url = format!("/upstairs/{upstairs_id}/live-repair-start"); let notify_finish_url = - format!("/upstairs/{upstairs_id}/live_repair_finish"); + format!("/upstairs/{upstairs_id}/live-repair-finish"); int_client .make_request( @@ -2932,9 +2932,9 @@ async fn test_upstairs_live_repair_different_upstairs_retry_interrupted( // Simulate one failed repair by one Upstairs, which was interrupted (which // leads to no finish message). - let notify_start_url = format!("/upstairs/{upstairs_id}/live_repair_start"); + let notify_start_url = format!("/upstairs/{upstairs_id}/live-repair-start"); let notify_finish_url = - format!("/upstairs/{upstairs_id}/live_repair_finish"); + format!("/upstairs/{upstairs_id}/live-repair-finish"); int_client .make_request( From ff08b000180963a05e95b65bae880d3cf8b36655 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 27 Feb 2024 14:37:05 -0500 Subject: [PATCH 04/24] support status for live repair and reconciliation --- clients/nexus-client/src/lib.rs | 2 +- common/src/api/internal/nexus.rs | 14 +- nexus/db-model/src/lib.rs | 4 +- nexus/db-model/src/live_repair.rs | 95 ------------ nexus/db-model/src/schema.rs | 5 +- nexus/db-model/src/upstairs_repair.rs | 135 ++++++++++++++++++ nexus/db-queries/src/db/datastore/volume.rs | 51 +++++-- nexus/src/app/volume.rs | 32 +++-- nexus/src/internal_api/http_entrypoints.rs | 20 +-- .../integration_tests/volume_management.rs | 118 ++++++++++++--- schema/crdb/37.0.0/up01.sql | 2 +- schema/crdb/37.0.0/up02.sql | 21 +-- schema/crdb/37.0.0/up03.sql | 21 +++ schema/crdb/dbinit.sql | 15 +- uuid-kinds/src/lib.rs | 2 +- 15 files changed, 347 insertions(+), 190 deletions(-) delete mode 100644 nexus/db-model/src/live_repair.rs create mode 100644 nexus/db-model/src/upstairs_repair.rs create mode 100644 schema/crdb/37.0.0/up03.sql diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index dbf13dc610..adc6d78a64 100644 --- a/clients/nexus-client/src/lib.rs +++ 
b/clients/nexus-client/src/lib.rs @@ -35,8 +35,8 @@ progenitor::generate_api!( NewPasswordHash = omicron_passwords::NewPasswordHash, TypedUuidForUpstairsKind = omicron_uuid_kinds::TypedUuid, + TypedUuidForUpstairsRepairKind = omicron_uuid_kinds::TypedUuid, TypedUuidForUpstairsSessionKind = omicron_uuid_kinds::TypedUuid, - TypedUuidForLiveRepairKind = omicron_uuid_kinds::TypedUuid, } ); diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 48140bfafe..9b81201aa8 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -10,7 +10,7 @@ use crate::api::external::{ }; use chrono::{DateTime, Utc}; use omicron_uuid_kinds::DownstairsRegionKind; -use omicron_uuid_kinds::LiveRepairKind; +use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsSessionKind; use parse_display::{Display, FromStr}; @@ -256,6 +256,12 @@ pub enum HostIdentifier { Vpc(Vni), } +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone, Copy)] +pub enum UpstairsRepairType { + Live, + Reconciliation, +} + #[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] pub struct DownstairsUnderRepair { pub region_uuid: TypedUuid, @@ -265,14 +271,16 @@ pub struct DownstairsUnderRepair { #[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] pub struct RepairStartInfo { pub session_id: TypedUuid, - pub repair_id: TypedUuid, + pub repair_id: TypedUuid, + pub repair_type: UpstairsRepairType, pub repairs: Vec, } #[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] pub struct RepairFinishInfo { pub session_id: TypedUuid, - pub repair_id: TypedUuid, + pub repair_id: TypedUuid, + pub repair_type: UpstairsRepairType, pub repairs: Vec, pub aborted: bool, } diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index f85815b627..95de134c27 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -39,7 +39,6 @@ mod ipv4net; pub mod ipv6; mod ipv6net; mod 
l4_port_range; -mod live_repair; mod macaddr; mod name; mod network_interface; @@ -84,6 +83,7 @@ mod switch; mod tuf_repo; mod typed_uuid; mod unsigned; +mod upstairs_repair; mod user_builtin; mod utilization; mod virtual_provisioning_collection; @@ -140,7 +140,6 @@ pub use ipv4net::*; pub use ipv6::*; pub use ipv6net::*; pub use l4_port_range::*; -pub use live_repair::*; pub use name::*; pub use network_interface::*; pub use oximeter_info::*; @@ -174,6 +173,7 @@ pub use switch_interface::*; pub use switch_port::*; pub use tuf_repo::*; pub use typed_uuid::to_db_typed_uuid; +pub use upstairs_repair::*; pub use user_builtin::*; pub use utilization::*; pub use virtual_provisioning_collection::*; diff --git a/nexus/db-model/src/live_repair.rs b/nexus/db-model/src/live_repair.rs deleted file mode 100644 index 88d79b2057..0000000000 --- a/nexus/db-model/src/live_repair.rs +++ /dev/null @@ -1,95 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
- -use super::impl_enum_type; -use crate::ipv6; -use crate::schema::live_repair_notification; -use crate::typed_uuid::DbTypedUuid; -use crate::SqlU16; -use chrono::{DateTime, Utc}; -use omicron_uuid_kinds::DownstairsRegionKind; -use omicron_uuid_kinds::LiveRepairKind; -use omicron_uuid_kinds::TypedUuid; -use omicron_uuid_kinds::UpstairsKind; -use omicron_uuid_kinds::UpstairsSessionKind; -use serde::{Deserialize, Serialize}; -use std::net::SocketAddrV6; - -impl_enum_type!( - #[derive(SqlType, Debug, QueryId)] - #[diesel(postgres_type(name = "live_repair_notification_type", schema = "public"))] - pub struct LiveRepairNotificationTypeEnum; - - #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] - #[diesel(sql_type = LiveRepairNotificationTypeEnum)] - pub enum LiveRepairNotificationType; - - // Notification types - Started => b"started" - Succeeded => b"succeeded" - Failed => b"failed" -); - -/// A record of Crucible live repair notifications: when a live repair started, -/// succeeded, failed, etc. -/// -/// Each live repair attempt is uniquely identified by the repair ID, upstairs -/// ID, session ID, and region ID. How those change tells Nexus about what is -/// going on: -/// -/// - if all IDs are the same for different requests, Nexus knows that the -/// client is retrying the notification. -/// -/// - if the upstairs ID, session ID, and region ID are all the same, but the -/// repair ID is different, then the same Upstairs is trying to repair that -/// region again. This could be due to a failed first attempt, or that -/// downstairs may have been kicked out again. -/// -/// - if the upstairs ID and region ID are the same, but the session ID and -/// repair ID are different, then a different session of the same Upstairs is -/// trying to repair that Downstairs. 
Session IDs change each time the -/// Upstairs is created, so it could have crashed, or it could have been -/// migrated and the destination Propolis' Upstairs is attempting to repair -/// the same region. -#[derive(Queryable, Insertable, Debug, Clone, Selectable)] -#[diesel(table_name = live_repair_notification)] -pub struct LiveRepairNotification { - pub time: DateTime, - - pub repair_id: DbTypedUuid, - pub upstairs_id: DbTypedUuid, - pub session_id: DbTypedUuid, - - pub region_id: DbTypedUuid, - pub target_ip: ipv6::Ipv6Addr, - pub target_port: SqlU16, - - pub notification_type: LiveRepairNotificationType, -} - -impl LiveRepairNotification { - pub fn new( - repair_id: TypedUuid, - upstairs_id: TypedUuid, - session_id: TypedUuid, - region_id: TypedUuid, - target_addr: SocketAddrV6, - notification_type: LiveRepairNotificationType, - ) -> Self { - Self { - time: Utc::now(), - repair_id: repair_id.into(), - upstairs_id: upstairs_id.into(), - session_id: session_id.into(), - region_id: region_id.into(), - target_ip: target_addr.ip().into(), - target_port: target_addr.port().into(), - notification_type, - } - } - - pub fn address(&self) -> SocketAddrV6 { - SocketAddrV6::new(*self.target_ip, *self.target_port, 0, 0) - } -} diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 7cc7216886..8bb6182a3e 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1518,10 +1518,11 @@ table! { } table! { - live_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) { + upstairs_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) { time -> Timestamptz, repair_id -> Uuid, + repair_type -> crate::UpstairsRepairTypeEnum, upstairs_id -> Uuid, session_id -> Uuid, @@ -1529,7 +1530,7 @@ table! 
{ target_ip -> Inet, target_port -> Int4, - notification_type -> crate::LiveRepairNotificationTypeEnum, + notification_type -> crate::UpstairsRepairNotificationTypeEnum, } } diff --git a/nexus/db-model/src/upstairs_repair.rs b/nexus/db-model/src/upstairs_repair.rs new file mode 100644 index 0000000000..fb8849849a --- /dev/null +++ b/nexus/db-model/src/upstairs_repair.rs @@ -0,0 +1,135 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use crate::ipv6; +use crate::schema::upstairs_repair_notification; +use crate::typed_uuid::DbTypedUuid; +use crate::SqlU16; +use chrono::{DateTime, Utc}; +use omicron_uuid_kinds::DownstairsRegionKind; +use omicron_uuid_kinds::UpstairsRepairKind; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsSessionKind; +use serde::{Deserialize, Serialize}; +use std::net::SocketAddrV6; +use omicron_common::api::internal; // internal::nexus::UpstairsRepairType; + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "upstairs_repair_notification_type", schema = "public"))] + pub struct UpstairsRepairNotificationTypeEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = UpstairsRepairNotificationTypeEnum)] + pub enum UpstairsRepairNotificationType; + + // Notification types + Started => b"started" + Succeeded => b"succeeded" + Failed => b"failed" +); + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "upstairs_repair_type", schema = "public"))] + pub struct UpstairsRepairTypeEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq, Eq, Hash)] + #[diesel(sql_type = UpstairsRepairTypeEnum)] + pub enum UpstairsRepairType; + + // 
Types of repair a Crucible Upstairs can do + Live => b"live" + Reconciliation => b"reconciliation" +); + +impl From for UpstairsRepairType { + fn from(v: internal::nexus::UpstairsRepairType) -> UpstairsRepairType { + match v { + internal::nexus::UpstairsRepairType::Live => UpstairsRepairType::Live, + internal::nexus::UpstairsRepairType::Reconciliation => UpstairsRepairType::Reconciliation, + } + } +} + +/// A record of Crucible Upstairs repair notifications: when a repair started, +/// succeeded, failed, etc. +/// +/// Each repair attempt is uniquely identified by the repair ID, upstairs ID, +/// session ID, and region ID. How those change tells Nexus about what is going +/// on: +/// +/// - if all IDs are the same for different requests, Nexus knows that the +/// client is retrying the notification. +/// +/// - if the upstairs ID, session ID, and region ID are all the same, but the +/// repair ID is different, then the same Upstairs is trying to repair that +/// region again. This could be due to a failed first attempt, or that +/// downstairs may have been kicked out again. +/// +/// - if the upstairs ID and region ID are the same, but the session ID and +/// repair ID are different, then a different session of the same Upstairs is +/// trying to repair that Downstairs. Session IDs change each time the +/// Upstairs is created, so it could have crashed, or it could have been +/// migrated and the destination Propolis' Upstairs is attempting to repair +/// the same region. +#[derive(Queryable, Insertable, Debug, Clone, Selectable)] +#[diesel(table_name = upstairs_repair_notification)] +pub struct UpstairsRepairNotification { + pub time: DateTime, + + pub repair_id: DbTypedUuid, + + // There's a difference between the live repairs and reconciliation: the + // Upstairs can go through reconciliation without there being any error from + // a downstairs, or any region replacement request from Nexus. 
One example + // is if the rack power is pulled: if everything is powered back up again + // reconciliation could be required but this isn't the fault of any problem + // with a physical disk, or any error that was returned. + // + // Alternatively any record of a live repair means that there was a problem: + // Currently, either an Upstairs kicked out a Downstairs (or two) due to + // some error or because it lagged behind the others, or Nexus has + // instructed an Upstairs to perform a region replacement. + pub repair_type: UpstairsRepairType, + + pub upstairs_id: DbTypedUuid, + pub session_id: DbTypedUuid, + + pub region_id: DbTypedUuid, + pub target_ip: ipv6::Ipv6Addr, + pub target_port: SqlU16, + + pub notification_type: UpstairsRepairNotificationType, +} + +impl UpstairsRepairNotification { + pub fn new( + repair_id: TypedUuid, + repair_type: UpstairsRepairType, + upstairs_id: TypedUuid, + session_id: TypedUuid, + region_id: TypedUuid, + target_addr: SocketAddrV6, + notification_type: UpstairsRepairNotificationType, + ) -> Self { + Self { + time: Utc::now(), + repair_id: repair_id.into(), + repair_type, + upstairs_id: upstairs_id.into(), + session_id: session_id.into(), + region_id: region_id.into(), + target_ip: target_addr.ip().into(), + target_port: target_addr.port().into(), + notification_type, + } + } + + pub fn address(&self) -> SocketAddrV6 { + SocketAddrV6::new(*self.target_ip, *self.target_port, 0, 0) + } +} diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index f502e3afad..dfd16b6599 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -11,8 +11,8 @@ use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::Dataset; -use crate::db::model::LiveRepairNotification; -use crate::db::model::LiveRepairNotificationType; +use 
crate::db::model::UpstairsRepairNotification; +use crate::db::model::UpstairsRepairNotificationType; use crate::db::model::Region; use crate::db::model::RegionSnapshot; use crate::db::model::Volume; @@ -819,47 +819,68 @@ impl DataStore { // instead of the top level of the Volume. The following functions have an // Upstairs ID instead of a Volume ID for this reason. - /// Record when an Upstairs notifies us about a live repair. If that record + /// Record when an Upstairs notifies us about a repair. If that record /// (uniquely identified by the four IDs passed in plus the notification /// type) exists already, do nothing. - pub async fn live_repair_notification( + pub async fn upstairs_repair_notification( &self, opctx: &OpContext, - record: LiveRepairNotification, + record: UpstairsRepairNotification, ) -> Result<(), Error> { - use db::schema::live_repair_notification::dsl; + use db::schema::upstairs_repair_notification::dsl; let conn = self.pool_connection_authorized(opctx).await?; let err = OptionalError::new(); - self.transaction_retry_wrapper("live_repair_notification") + self.transaction_retry_wrapper("upstairs_repair_notification") .transaction(&conn, |conn| { let record = record.clone(); let err = err.clone(); async move { + // Return 409 if a repair ID does not match types + let mismatched_record_type_count: usize = + dsl::upstairs_repair_notification + .filter(dsl::repair_id.eq(record.repair_id)) + .filter(dsl::repair_type.ne(record.repair_type)) + .execute_async(&conn) + .await?; + + if mismatched_record_type_count == 0 { + // ok, no existing records or the existing records match + // the type + } else if mismatched_record_type_count == 1 { + // XXX is it possible that the match count is larger + // than 1? 
+ return Err(err.bail(Error::conflict(&format!( + "existing repair type for id {} does not match {:?}!", + record.repair_id, + record.repair_type, + )))); + } + match &record.notification_type { - LiveRepairNotificationType::Started => { + UpstairsRepairNotificationType::Started => { // Proceed - the insertion can succeed or fail below // based on the table's primary key } - LiveRepairNotificationType::Succeeded - | LiveRepairNotificationType::Failed => { + UpstairsRepairNotificationType::Succeeded + | UpstairsRepairNotificationType::Failed => { // However, Nexus must accept only one "finished" // status - an Upstairs cannot change this and must // instead perform another repair with a new repair // ID. let maybe_existing_finish_record: Option< - LiveRepairNotification, - > = dsl::live_repair_notification + UpstairsRepairNotification, + > = dsl::upstairs_repair_notification .filter(dsl::repair_id.eq(record.repair_id)) .filter(dsl::upstairs_id.eq(record.upstairs_id)) .filter(dsl::session_id.eq(record.session_id)) .filter(dsl::region_id.eq(record.region_id)) .filter(dsl::notification_type.eq_any(vec![ - LiveRepairNotificationType::Succeeded, - LiveRepairNotificationType::Failed, + UpstairsRepairNotificationType::Succeeded, + UpstairsRepairNotificationType::Failed, ])) .get_result_async(&conn) .await @@ -882,7 +903,7 @@ impl DataStore { } } - diesel::insert_into(dsl::live_repair_notification) + diesel::insert_into(dsl::upstairs_repair_notification) .values(record) .on_conflict(( dsl::repair_id, diff --git a/nexus/src/app/volume.rs b/nexus/src/app/volume.rs index b9052b1eb1..e0e1ac90e0 100644 --- a/nexus/src/app/volume.rs +++ b/nexus/src/app/volume.rs @@ -5,8 +5,8 @@ //! 
Volumes use crate::app::sagas; -use nexus_db_model::LiveRepairNotification; -use nexus_db_model::LiveRepairNotificationType; +use nexus_db_model::UpstairsRepairNotification; +use nexus_db_model::UpstairsRepairNotificationType; use nexus_db_queries::authn; use nexus_db_queries::context::OpContext; use omicron_common::api::external::DeleteResult; @@ -37,8 +37,8 @@ impl super::Nexus { Ok(()) } - /// An Upstairs is telling us when a live repair is starting. - pub(crate) async fn live_repair_start( + /// An Upstairs is telling us when a repair is starting. + pub(crate) async fn upstairs_repair_start( self: &Arc, opctx: &OpContext, upstairs_id: TypedUuid, @@ -46,21 +46,22 @@ impl super::Nexus { ) -> DeleteResult { info!( self.log, - "received live_repair_start from upstairs {upstairs_id}: {:?}", + "received upstairs_repair_start from upstairs {upstairs_id}: {:?}", repair_start_info, ); for repaired_downstairs in repair_start_info.repairs { self.db_datastore - .live_repair_notification( + .upstairs_repair_notification( opctx, - LiveRepairNotification::new( + UpstairsRepairNotification::new( repair_start_info.repair_id, + repair_start_info.repair_type.into(), upstairs_id, repair_start_info.session_id, repaired_downstairs.region_uuid, repaired_downstairs.target_addr, - LiveRepairNotificationType::Started, + UpstairsRepairNotificationType::Started, ), ) .await?; @@ -69,8 +70,8 @@ impl super::Nexus { Ok(()) } - /// An Upstairs is telling us when a live repair is finished, and the result. - pub(crate) async fn live_repair_finish( + /// An Upstairs is telling us when a repair is finished, and the result. 
+ pub(crate) async fn upstairs_repair_finish( self: &Arc, opctx: &OpContext, upstairs_id: TypedUuid, @@ -78,24 +79,25 @@ impl super::Nexus { ) -> DeleteResult { info!( self.log, - "received live_repair_finish from upstairs {upstairs_id}: {:?}", + "received upstairs_repair_finish from upstairs {upstairs_id}: {:?}", repair_finish_info, ); for repaired_downstairs in repair_finish_info.repairs { self.db_datastore - .live_repair_notification( + .upstairs_repair_notification( opctx, - LiveRepairNotification::new( + UpstairsRepairNotification::new( repair_finish_info.repair_id, + repair_finish_info.repair_type.into(), upstairs_id, repair_finish_info.session_id, repaired_downstairs.region_uuid, repaired_downstairs.target_addr, if repair_finish_info.aborted { - LiveRepairNotificationType::Failed + UpstairsRepairNotificationType::Failed } else { - LiveRepairNotificationType::Succeeded + UpstairsRepairNotificationType::Succeeded }, ), ) diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index e1ef86b055..70871f0188 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -75,8 +75,8 @@ pub(crate) fn internal_api() -> NexusApiDescription { api.register(cpapi_collectors_post)?; api.register(cpapi_metrics_collect)?; api.register(cpapi_artifact_download)?; - api.register(cpapi_live_repair_start)?; - api.register(cpapi_live_repair_finish)?; + api.register(cpapi_upstairs_repair_start)?; + api.register(cpapi_upstairs_repair_finish)?; api.register(saga_list)?; api.register(saga_view)?; @@ -491,12 +491,12 @@ struct UpstairsPathParam { upstairs_id: TypedUuid, } -/// An Upstairs will notify this endpoint when a live repair starts +/// An Upstairs will notify this endpoint when a repair starts #[endpoint { method = POST, - path = "/upstairs/{upstairs_id}/live-repair-start", + path = "/upstairs/{upstairs_id}/repair-start", }] -async fn cpapi_live_repair_start( +async fn 
cpapi_upstairs_repair_start( rqctx: RequestContext>, path_params: Path, repair_start_info: TypedBody, @@ -508,7 +508,7 @@ async fn cpapi_live_repair_start( let handler = async { let opctx = crate::context::op_context_for_internal_api(&rqctx).await; nexus - .live_repair_start( + .upstairs_repair_start( &opctx, path.upstairs_id, repair_start_info.into_inner(), @@ -519,12 +519,12 @@ async fn cpapi_live_repair_start( apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// An Upstairs will notify this endpoint when a live repair finishes. +/// An Upstairs will notify this endpoint when a repair finishes. #[endpoint { method = POST, - path = "/upstairs/{upstairs_id}/live-repair-finish", + path = "/upstairs/{upstairs_id}/repair-finish", }] -async fn cpapi_live_repair_finish( +async fn cpapi_upstairs_repair_finish( rqctx: RequestContext>, path_params: Path, repair_finish_info: TypedBody, @@ -536,7 +536,7 @@ async fn cpapi_live_repair_finish( let handler = async { let opctx = crate::context::op_context_for_internal_api(&rqctx).await; nexus - .live_repair_finish( + .upstairs_repair_finish( &opctx, path.upstairs_id, repair_finish_info.into_inner(), diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index c4e2c3b998..f9db6b8a2b 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -26,9 +26,9 @@ use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Name; use omicron_common::api::internal; use omicron_uuid_kinds::DownstairsRegionKind; -use omicron_uuid_kinds::LiveRepairKind; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::UpstairsSessionKind; use rand::prelude::SliceRandom; use rand::{rngs::StdRng, SeedableRng}; @@ -2563,18 +2563,18 @@ async fn test_volume_hard_delete_idempotent( 
/// Test that an Upstairs can reissue live repair notifications #[nexus_test] -async fn test_upstairs_live_repair_notify_idempotent( +async fn test_upstairs_repair_notify_idempotent( cptestctx: &ControlPlaneTestContext, ) { let int_client = &cptestctx.internal_client; let upstairs_id: TypedUuid = TypedUuid::new_v4(); let session_id: TypedUuid = TypedUuid::new_v4(); - let repair_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); let region_id: TypedUuid = TypedUuid::new_v4(); // Notify start - let notify_url = format!("/upstairs/{upstairs_id}/live-repair-start"); + let notify_url = format!("/upstairs/{upstairs_id}/repair-start"); int_client .make_request( @@ -2583,6 +2583,7 @@ async fn test_upstairs_live_repair_notify_idempotent( Some(internal::nexus::RepairStartInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2602,6 +2603,7 @@ async fn test_upstairs_live_repair_notify_idempotent( Some(internal::nexus::RepairStartInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2615,7 +2617,7 @@ async fn test_upstairs_live_repair_notify_idempotent( .unwrap(); // Notify finish - let notify_url = format!("/upstairs/{upstairs_id}/live-repair-finish"); + let notify_url = format!("/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2624,6 +2626,7 @@ async fn test_upstairs_live_repair_notify_idempotent( Some(internal::nexus::RepairFinishInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2644,6 +2647,7 @@ async fn 
test_upstairs_live_repair_notify_idempotent( Some(internal::nexus::RepairFinishInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2661,17 +2665,17 @@ async fn test_upstairs_live_repair_notify_idempotent( /// Test that an Upstairs cannot issue different finish statuses for the same /// repair. #[nexus_test] -async fn test_upstairs_live_repair_notify_different_finish_status( +async fn test_upstairs_repair_notify_different_finish_status( cptestctx: &ControlPlaneTestContext, ) { let int_client = &cptestctx.internal_client; let upstairs_id: TypedUuid = TypedUuid::new_v4(); let session_id: TypedUuid = TypedUuid::new_v4(); - let repair_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); let region_id: TypedUuid = TypedUuid::new_v4(); - let notify_url = format!("/upstairs/{upstairs_id}/live-repair-finish"); + let notify_url = format!("/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2680,6 +2684,7 @@ async fn test_upstairs_live_repair_notify_different_finish_status( Some(internal::nexus::RepairFinishInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2700,6 +2705,7 @@ async fn test_upstairs_live_repair_notify_different_finish_status( Some(internal::nexus::RepairFinishInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2716,21 +2722,21 @@ async fn test_upstairs_live_repair_notify_different_finish_status( /// Test that the same Upstairs can rerun a repair again. 
#[nexus_test] -async fn test_upstairs_live_repair_same_upstairs_retry( +async fn test_upstairs_repair_same_upstairs_retry( cptestctx: &ControlPlaneTestContext, ) { let int_client = &cptestctx.internal_client; let upstairs_id: TypedUuid = TypedUuid::new_v4(); let session_id: TypedUuid = TypedUuid::new_v4(); - let repair_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); let region_id: TypedUuid = TypedUuid::new_v4(); // Simulate one failed repair - let notify_start_url = format!("/upstairs/{upstairs_id}/live-repair-start"); + let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); let notify_finish_url = - format!("/upstairs/{upstairs_id}/live-repair-finish"); + format!("/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2739,6 +2745,7 @@ async fn test_upstairs_live_repair_same_upstairs_retry( Some(internal::nexus::RepairStartInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2758,6 +2765,7 @@ async fn test_upstairs_live_repair_same_upstairs_retry( Some(internal::nexus::RepairFinishInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2773,7 +2781,7 @@ async fn test_upstairs_live_repair_same_upstairs_retry( // Simulate the same Upstairs restarting the repair, which passes this time - let repair_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); int_client .make_request( @@ -2782,6 +2790,7 @@ async fn test_upstairs_live_repair_same_upstairs_retry( Some(internal::nexus::RepairStartInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: 
region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2801,6 +2810,7 @@ async fn test_upstairs_live_repair_same_upstairs_retry( Some(internal::nexus::RepairFinishInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2817,21 +2827,21 @@ async fn test_upstairs_live_repair_same_upstairs_retry( /// Test that a different Upstairs session can rerun a repair again. #[nexus_test] -async fn test_upstairs_live_repair_different_upstairs_retry( +async fn test_upstairs_repair_different_upstairs_retry( cptestctx: &ControlPlaneTestContext, ) { let int_client = &cptestctx.internal_client; let upstairs_id: TypedUuid = TypedUuid::new_v4(); let session_id: TypedUuid = TypedUuid::new_v4(); - let repair_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); let region_id: TypedUuid = TypedUuid::new_v4(); // Simulate one failed repair by one Upstairs - let notify_start_url = format!("/upstairs/{upstairs_id}/live-repair-start"); + let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); let notify_finish_url = - format!("/upstairs/{upstairs_id}/live-repair-finish"); + format!("/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2840,6 +2850,7 @@ async fn test_upstairs_live_repair_different_upstairs_retry( Some(internal::nexus::RepairStartInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2859,6 +2870,7 @@ async fn test_upstairs_live_repair_different_upstairs_retry( Some(internal::nexus::RepairFinishInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: 
"[fd00:1122:3344:101::8]:12345" @@ -2875,7 +2887,7 @@ async fn test_upstairs_live_repair_different_upstairs_retry( // Simulate a different Upstairs session restarting the repair, which passes this time let session_id: TypedUuid = TypedUuid::new_v4(); - let repair_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); int_client .make_request( @@ -2884,6 +2896,7 @@ async fn test_upstairs_live_repair_different_upstairs_retry( Some(internal::nexus::RepairStartInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2903,6 +2916,7 @@ async fn test_upstairs_live_repair_different_upstairs_retry( Some(internal::nexus::RepairFinishInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2919,22 +2933,22 @@ async fn test_upstairs_live_repair_different_upstairs_retry( /// Test that a different Upstairs session can rerun an interrupted repair #[nexus_test] -async fn test_upstairs_live_repair_different_upstairs_retry_interrupted( +async fn test_upstairs_repair_different_upstairs_retry_interrupted( cptestctx: &ControlPlaneTestContext, ) { let int_client = &cptestctx.internal_client; let upstairs_id: TypedUuid = TypedUuid::new_v4(); let session_id: TypedUuid = TypedUuid::new_v4(); - let repair_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); let region_id: TypedUuid = TypedUuid::new_v4(); // Simulate one failed repair by one Upstairs, which was interrupted (which // leads to no finish message). 
- let notify_start_url = format!("/upstairs/{upstairs_id}/live-repair-start"); + let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); let notify_finish_url = - format!("/upstairs/{upstairs_id}/live-repair-finish"); + format!("/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2943,6 +2957,7 @@ async fn test_upstairs_live_repair_different_upstairs_retry_interrupted( Some(internal::nexus::RepairStartInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2959,7 +2974,7 @@ async fn test_upstairs_live_repair_different_upstairs_retry_interrupted( // which passes this time let session_id: TypedUuid = TypedUuid::new_v4(); - let repair_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); int_client .make_request( @@ -2968,6 +2983,7 @@ async fn test_upstairs_live_repair_different_upstairs_retry_interrupted( Some(internal::nexus::RepairStartInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -2987,6 +3003,7 @@ async fn test_upstairs_live_repair_different_upstairs_retry_interrupted( Some(internal::nexus::RepairFinishInfo { session_id, repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -3000,3 +3017,58 @@ async fn test_upstairs_live_repair_different_upstairs_retry_interrupted( .await .unwrap(); } + +/// Test that the same repair ID cannot be used for different repair types +#[nexus_test] +async fn test_upstairs_repair_repair_id_and_type_conflict( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let 
upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4(); + let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4(); + let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4(); + let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4(); + + let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); + + int_client + .make_request( + Method::POST, + &notify_start_url, + Some(internal::nexus::RepairStartInfo { + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + &notify_start_url, + Some(internal::nexus::RepairStartInfo { + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Reconciliation, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::CONFLICT, + ) + .await + .unwrap_err(); +} diff --git a/schema/crdb/37.0.0/up01.sql b/schema/crdb/37.0.0/up01.sql index 91c27bd81a..b94f43eda7 100644 --- a/schema/crdb/37.0.0/up01.sql +++ b/schema/crdb/37.0.0/up01.sql @@ -1,4 +1,4 @@ -CREATE TYPE IF NOT EXISTS omicron.public.live_repair_notification_type AS ENUM ( +CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_notification_type AS ENUM ( 'started', 'succeeded', 'failed' ); diff --git a/schema/crdb/37.0.0/up02.sql b/schema/crdb/37.0.0/up02.sql index 58d33e39c8..47c5f7f03a 100644 --- a/schema/crdb/37.0.0/up02.sql +++ b/schema/crdb/37.0.0/up02.sql @@ -1,19 +1,4 @@ -CREATE TABLE IF NOT EXISTS live_repair_notification ( - time TIMESTAMPTZ NOT NULL, - - repair_id UUID NOT NULL, - upstairs_id UUID NOT NULL, - session_id UUID NOT NULL, - - region_id UUID NOT NULL, - target_ip INET NOT NULL, - target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL, - 
notification_type omicron.public.live_repair_notification_type NOT NULL, - - /* - * A live repair is uniquely identified by the four UUIDs here, and a - * notification is uniquely identified by its type. - */ - PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type) +CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_type AS ENUM ( + 'live', + 'reconciliation' ); diff --git a/schema/crdb/37.0.0/up03.sql b/schema/crdb/37.0.0/up03.sql new file mode 100644 index 0000000000..3affc24900 --- /dev/null +++ b/schema/crdb/37.0.0/up03.sql @@ -0,0 +1,21 @@ +CREATE TABLE IF NOT EXISTS upstairs_repair_notification ( + time TIMESTAMPTZ NOT NULL, + + repair_id UUID NOT NULL, + repair_type omicron.public.upstairs_repair_type NOT NULL, + + upstairs_id UUID NOT NULL, + session_id UUID NOT NULL, + + region_id UUID NOT NULL, + target_ip INET NOT NULL, + target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL, + + notification_type omicron.public.upstairs_repair_notification_type NOT NULL, + + /* + * A repair is uniquely identified by the four UUIDs here, and a + * notification is uniquely identified by its type. 
+ */ + PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 64be993e64..2f32dab562 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3511,16 +3511,23 @@ SELECT deleted FROM interleaved_versions; -CREATE TYPE IF NOT EXISTS omicron.public.live_repair_notification_type AS ENUM ( +CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_notification_type AS ENUM ( 'started', 'succeeded', 'failed' ); -CREATE TABLE IF NOT EXISTS live_repair_notification ( +CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_type AS ENUM ( + 'live', + 'reconciliation' +); + +CREATE TABLE IF NOT EXISTS upstairs_repair_notification ( time TIMESTAMPTZ NOT NULL, repair_id UUID NOT NULL, + repair_type omicron.public.upstairs_repair_type NOT NULL, + upstairs_id UUID NOT NULL, session_id UUID NOT NULL, @@ -3528,10 +3535,10 @@ CREATE TABLE IF NOT EXISTS live_repair_notification ( target_ip INET NOT NULL, target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL, - notification_type omicron.public.live_repair_notification_type NOT NULL, + notification_type omicron.public.upstairs_repair_notification_type NOT NULL, /* - * A live repair is uniquely identified by the four UUIDs here, and a + * A repair is uniquely identified by the four UUIDs here, and a * notification is uniquely identified by its type. */ PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type) diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 871de4d8ca..10551683fb 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -46,9 +46,9 @@ macro_rules! impl_typed_uuid_kind { impl_typed_uuid_kind! 
{ DownstairsRegionKind => "downstairs_region", - LiveRepairKind => "live_repair", LoopbackAddressKind => "loopback_address", TufRepoKind => "tuf_repo", UpstairsKind => "upstairs", + UpstairsRepairKind => "upstairs_repair", UpstairsSessionKind => "upstairs_session", } From b1940f0a039d29d8c4b8987f8632d385fbd173ff Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 27 Feb 2024 14:37:42 -0500 Subject: [PATCH 05/24] schema 37 -> 38 --- schema/crdb/{37.0.0 => 38.0.0}/up01.sql | 0 schema/crdb/{37.0.0 => 38.0.0}/up02.sql | 0 schema/crdb/{37.0.0 => 38.0.0}/up03.sql | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename schema/crdb/{37.0.0 => 38.0.0}/up01.sql (100%) rename schema/crdb/{37.0.0 => 38.0.0}/up02.sql (100%) rename schema/crdb/{37.0.0 => 38.0.0}/up03.sql (100%) diff --git a/schema/crdb/37.0.0/up01.sql b/schema/crdb/38.0.0/up01.sql similarity index 100% rename from schema/crdb/37.0.0/up01.sql rename to schema/crdb/38.0.0/up01.sql diff --git a/schema/crdb/37.0.0/up02.sql b/schema/crdb/38.0.0/up02.sql similarity index 100% rename from schema/crdb/37.0.0/up02.sql rename to schema/crdb/38.0.0/up02.sql diff --git a/schema/crdb/37.0.0/up03.sql b/schema/crdb/38.0.0/up03.sql similarity index 100% rename from schema/crdb/37.0.0/up03.sql rename to schema/crdb/38.0.0/up03.sql From 1b03223553930df0d6250a9cbe1528c591a09b14 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 27 Feb 2024 14:47:24 -0500 Subject: [PATCH 06/24] more schema update --- nexus/db-model/src/schema.rs | 2 +- schema/crdb/dbinit.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 8bb6182a3e..f02597e54a 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion; /// /// This should be updated whenever the schema is changed. 
For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(37, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(38, 0, 0); table! { disk (id) { diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 2f32dab562..5718d19411 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3551,7 +3551,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '37.0.0', NULL) + ( TRUE, NOW(), NOW(), '38.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From 0cd8601af4c76fa4b817973911b9f6cc57e2f954 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 27 Feb 2024 16:47:19 -0500 Subject: [PATCH 07/24] accept upstairs repair progress --- common/src/api/internal/nexus.rs | 11 +- nexus/db-model/src/schema.rs | 9 ++ nexus/db-model/src/upstairs_repair.rs | 31 ++++- nexus/db-queries/src/db/datastore/volume.rs | 69 ++++++++++- nexus/src/app/volume.rs | 28 +++++ nexus/src/internal_api/http_entrypoints.rs | 40 ++++++ .../integration_tests/volume_management.rs | 117 ++++++++++++++++-- schema/crdb/38.0.0/up04.sql | 8 ++ schema/crdb/dbinit.sql | 9 ++ 9 files changed, 306 insertions(+), 16 deletions(-) create mode 100644 schema/crdb/38.0.0/up04.sql diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 9b81201aa8..b347a5ddb7 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -10,8 +10,8 @@ use crate::api::external::{ }; use chrono::{DateTime, Utc}; use omicron_uuid_kinds::DownstairsRegionKind; -use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::UpstairsSessionKind; use parse_display::{Display, FromStr}; use schemars::JsonSchema; @@ -270,6 +270,7 @@ pub struct DownstairsUnderRepair { #[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] pub struct RepairStartInfo { + pub time: 
DateTime, pub session_id: TypedUuid, pub repair_id: TypedUuid, pub repair_type: UpstairsRepairType, @@ -278,9 +279,17 @@ pub struct RepairStartInfo { #[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] pub struct RepairFinishInfo { + pub time: DateTime, pub session_id: TypedUuid, pub repair_id: TypedUuid, pub repair_type: UpstairsRepairType, pub repairs: Vec, pub aborted: bool, } + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct RepairProgress { + pub time: DateTime, + pub current_item: i64, + pub total_items: i64, +} diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index f02597e54a..775f67f330 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1534,6 +1534,15 @@ table! { } } +table! { + upstairs_repair_progress (repair_id, time, current_item, total_items) { + repair_id -> Uuid, + time -> Timestamptz, + current_item -> Int8, + total_items -> Int8, + } +} + table! { db_metadata (singleton) { singleton -> Bool, diff --git a/nexus/db-model/src/upstairs_repair.rs b/nexus/db-model/src/upstairs_repair.rs index fb8849849a..311592f8e4 100644 --- a/nexus/db-model/src/upstairs_repair.rs +++ b/nexus/db-model/src/upstairs_repair.rs @@ -5,17 +5,18 @@ use super::impl_enum_type; use crate::ipv6; use crate::schema::upstairs_repair_notification; +use crate::schema::upstairs_repair_progress; use crate::typed_uuid::DbTypedUuid; use crate::SqlU16; use chrono::{DateTime, Utc}; +use omicron_common::api::internal; use omicron_uuid_kinds::DownstairsRegionKind; -use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::UpstairsSessionKind; use serde::{Deserialize, Serialize}; -use std::net::SocketAddrV6; -use omicron_common::api::internal; // internal::nexus::UpstairsRepairType; +use std::net::SocketAddrV6; // internal::nexus::UpstairsRepairType; impl_enum_type!( 
#[derive(SqlType, Debug, QueryId)] @@ -49,8 +50,12 @@ impl_enum_type!( impl From for UpstairsRepairType { fn from(v: internal::nexus::UpstairsRepairType) -> UpstairsRepairType { match v { - internal::nexus::UpstairsRepairType::Live => UpstairsRepairType::Live, - internal::nexus::UpstairsRepairType::Reconciliation => UpstairsRepairType::Reconciliation, + internal::nexus::UpstairsRepairType::Live => { + UpstairsRepairType::Live + } + internal::nexus::UpstairsRepairType::Reconciliation => { + UpstairsRepairType::Reconciliation + } } } } @@ -79,6 +84,8 @@ impl From for UpstairsRepairType { #[derive(Queryable, Insertable, Debug, Clone, Selectable)] #[diesel(table_name = upstairs_repair_notification)] pub struct UpstairsRepairNotification { + // Importantly, this is client time, not Nexus' time that it received the + // notification. pub time: DateTime, pub repair_id: DbTypedUuid, @@ -107,7 +114,9 @@ pub struct UpstairsRepairNotification { } impl UpstairsRepairNotification { + #[allow(clippy::too_many_arguments)] pub fn new( + time: DateTime, repair_id: TypedUuid, repair_type: UpstairsRepairType, upstairs_id: TypedUuid, @@ -117,7 +126,7 @@ impl UpstairsRepairNotification { notification_type: UpstairsRepairNotificationType, ) -> Self { Self { - time: Utc::now(), + time, repair_id: repair_id.into(), repair_type, upstairs_id: upstairs_id.into(), @@ -133,3 +142,13 @@ impl UpstairsRepairNotification { SocketAddrV6::new(*self.target_ip, *self.target_port, 0, 0) } } + +/// A record of Crucible Upstairs repair progress. 
+#[derive(Queryable, Insertable, Debug, Clone, Selectable)] +#[diesel(table_name = upstairs_repair_progress)] +pub struct UpstairsRepairProgress { + pub repair_id: DbTypedUuid, + pub time: DateTime, + pub current_item: i64, + pub total_items: i64, +} diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index dfd16b6599..9bfe550fe7 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -11,10 +11,11 @@ use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::Dataset; -use crate::db::model::UpstairsRepairNotification; -use crate::db::model::UpstairsRepairNotificationType; use crate::db::model::Region; use crate::db::model::RegionSnapshot; +use crate::db::model::UpstairsRepairNotification; +use crate::db::model::UpstairsRepairNotificationType; +use crate::db::model::UpstairsRepairProgress; use crate::db::model::Volume; use crate::db::queries::volume::DecreaseCrucibleResourceCountAndSoftDeleteVolume; use crate::transaction_retry::OptionalError; @@ -28,6 +29,10 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::ResourceType; +use omicron_common::api::internal::nexus::RepairProgress; +use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsRepairKind; use serde::Deserialize; use serde::Deserializer; use serde::Serialize; @@ -928,6 +933,66 @@ impl DataStore { } }) } + + /// Record Upstairs repair progress + pub async fn upstairs_repair_progress( + &self, + opctx: &OpContext, + upstairs_id: TypedUuid, + repair_id: TypedUuid, + repair_progress: RepairProgress, + ) -> Result<(), Error> { + use db::schema::upstairs_repair_notification::dsl as notification_dsl; + use db::schema::upstairs_repair_progress::dsl; + + let 
conn = self.pool_connection_authorized(opctx).await?; + let err = OptionalError::new(); + + self.transaction_retry_wrapper("upstairs_repair_progress") + .transaction(&conn, |conn| { + let repair_progress = repair_progress.clone(); + let err = err.clone(); + + async move { + // Check that there is a repair id for the upstairs id + let matching_repair: Option = + notification_dsl::upstairs_repair_notification + .filter(notification_dsl::repair_id.eq(nexus_db_model::to_db_typed_uuid(repair_id))) + .filter(notification_dsl::upstairs_id.eq(nexus_db_model::to_db_typed_uuid(upstairs_id))) + .filter(notification_dsl::notification_type.eq(UpstairsRepairNotificationType::Started)) + .get_result_async(&conn) + .await + .optional()?; + + if matching_repair.is_none() { + // XXX should be 404 + return Err(err.bail(Error::invalid_request(&format!( + "upstairs {upstairs_id} repair {repair_id} not found" + )))); + } + + diesel::insert_into(dsl::upstairs_repair_progress) + .values(UpstairsRepairProgress { + repair_id: repair_id.into(), + time: repair_progress.time, + current_item: repair_progress.current_item, + total_items: repair_progress.total_items, + }) + .execute_async(&conn) + .await?; + + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } } #[derive(Default, Clone, Debug, Serialize, Deserialize)] diff --git a/nexus/src/app/volume.rs b/nexus/src/app/volume.rs index e0e1ac90e0..60ba36091f 100644 --- a/nexus/src/app/volume.rs +++ b/nexus/src/app/volume.rs @@ -11,9 +11,11 @@ use nexus_db_queries::authn; use nexus_db_queries::context::OpContext; use omicron_common::api::external::DeleteResult; use omicron_common::api::internal::nexus::RepairFinishInfo; +use omicron_common::api::internal::nexus::RepairProgress; use omicron_common::api::internal::nexus::RepairStartInfo; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; +use 
omicron_uuid_kinds::UpstairsRepairKind; use std::sync::Arc; use uuid::Uuid; @@ -55,6 +57,7 @@ impl super::Nexus { .upstairs_repair_notification( opctx, UpstairsRepairNotification::new( + repair_start_info.time, repair_start_info.repair_id, repair_start_info.repair_type.into(), upstairs_id, @@ -88,6 +91,7 @@ impl super::Nexus { .upstairs_repair_notification( opctx, UpstairsRepairNotification::new( + repair_finish_info.time, repair_finish_info.repair_id, repair_finish_info.repair_type.into(), upstairs_id, @@ -112,4 +116,28 @@ impl super::Nexus { Ok(()) } + + /// An Upstairs is updating us with repair progress + pub(crate) async fn upstairs_repair_progress( + self: &Arc, + opctx: &OpContext, + upstairs_id: TypedUuid, + repair_id: TypedUuid, + repair_progress: RepairProgress, + ) -> DeleteResult { + info!( + self.log, + "received upstairs_repair_progress from upstairs {upstairs_id} for repair {repair_id}: {:?}", + repair_progress, + ); + + self.db_datastore + .upstairs_repair_progress( + opctx, + upstairs_id, + repair_id, + repair_progress, + ) + .await + } } diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 70871f0188..95a12f6f97 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -42,11 +42,13 @@ use omicron_common::api::external::Error; use omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::ProducerEndpoint; use omicron_common::api::internal::nexus::RepairFinishInfo; +use omicron_common::api::internal::nexus::RepairProgress; use omicron_common::api::internal::nexus::RepairStartInfo; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_common::update::ArtifactId; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::UpstairsRepairKind; use oximeter::types::ProducerResults; use oximeter_producer::{collect, ProducerIdPathParams}; use 
schemars::JsonSchema; @@ -75,8 +77,10 @@ pub(crate) fn internal_api() -> NexusApiDescription { api.register(cpapi_collectors_post)?; api.register(cpapi_metrics_collect)?; api.register(cpapi_artifact_download)?; + api.register(cpapi_upstairs_repair_start)?; api.register(cpapi_upstairs_repair_finish)?; + api.register(cpapi_upstairs_repair_progress)?; api.register(saga_list)?; api.register(saga_view)?; @@ -547,6 +551,42 @@ async fn cpapi_upstairs_repair_finish( apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Path parameters for Upstairs requests (internal API) +#[derive(Deserialize, JsonSchema)] +struct UpstairsRepairPathParam { + upstairs_id: TypedUuid, + repair_id: TypedUuid, +} + +/// An Upstairs will update this endpoint with the progress of a repair +#[endpoint { + method = POST, + path = "/upstairs/{upstairs_id}/repair/{repair_id}/progress", + }] +async fn cpapi_upstairs_repair_progress( + rqctx: RequestContext>, + path_params: Path, + repair_progress: TypedBody, +) -> Result { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .upstairs_repair_progress( + &opctx, + path.upstairs_id, + path.repair_id, + repair_progress.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + // Sagas /// List sagas diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index f9db6b8a2b..3405a950dc 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -5,6 +5,7 @@ //! Tests that Nexus properly manages and cleans up Crucible resources //! 
associated with Volumes +use chrono::Utc; use dropshot::test_util::ClientTestContext; use http::method::Method; use http::StatusCode; @@ -2581,6 +2582,7 @@ async fn test_upstairs_repair_notify_idempotent( Method::POST, ¬ify_url, Some(internal::nexus::RepairStartInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2601,6 +2603,7 @@ async fn test_upstairs_repair_notify_idempotent( Method::POST, ¬ify_url, Some(internal::nexus::RepairStartInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2624,6 +2627,7 @@ async fn test_upstairs_repair_notify_idempotent( Method::POST, ¬ify_url, Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2645,6 +2649,7 @@ async fn test_upstairs_repair_notify_idempotent( Method::POST, ¬ify_url, Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2682,6 +2687,7 @@ async fn test_upstairs_repair_notify_different_finish_status( Method::POST, ¬ify_url, Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2703,6 +2709,7 @@ async fn test_upstairs_repair_notify_different_finish_status( Method::POST, ¬ify_url, Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2735,14 +2742,14 @@ async fn test_upstairs_repair_same_upstairs_retry( // Simulate one failed repair let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); - let notify_finish_url = - format!("/upstairs/{upstairs_id}/repair-finish"); + let notify_finish_url = format!("/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( Method::POST, ¬ify_start_url, Some(internal::nexus::RepairStartInfo { + 
time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2763,6 +2770,7 @@ async fn test_upstairs_repair_same_upstairs_retry( Method::POST, ¬ify_finish_url, Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2788,6 +2796,7 @@ async fn test_upstairs_repair_same_upstairs_retry( Method::POST, ¬ify_start_url, Some(internal::nexus::RepairStartInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2808,6 +2817,7 @@ async fn test_upstairs_repair_same_upstairs_retry( Method::POST, ¬ify_finish_url, Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2840,14 +2850,14 @@ async fn test_upstairs_repair_different_upstairs_retry( // Simulate one failed repair by one Upstairs let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); - let notify_finish_url = - format!("/upstairs/{upstairs_id}/repair-finish"); + let notify_finish_url = format!("/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( Method::POST, ¬ify_start_url, Some(internal::nexus::RepairStartInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2868,6 +2878,7 @@ async fn test_upstairs_repair_different_upstairs_retry( Method::POST, ¬ify_finish_url, Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2894,6 +2905,7 @@ async fn test_upstairs_repair_different_upstairs_retry( Method::POST, ¬ify_start_url, Some(internal::nexus::RepairStartInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2914,6 +2926,7 @@ async fn test_upstairs_repair_different_upstairs_retry( Method::POST, ¬ify_finish_url, 
Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2947,14 +2960,14 @@ async fn test_upstairs_repair_different_upstairs_retry_interrupted( // leads to no finish message). let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); - let notify_finish_url = - format!("/upstairs/{upstairs_id}/repair-finish"); + let notify_finish_url = format!("/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( Method::POST, ¬ify_start_url, Some(internal::nexus::RepairStartInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -2981,6 +2994,7 @@ async fn test_upstairs_repair_different_upstairs_retry_interrupted( Method::POST, ¬ify_start_url, Some(internal::nexus::RepairStartInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -3001,6 +3015,7 @@ async fn test_upstairs_repair_different_upstairs_retry_interrupted( Method::POST, ¬ify_finish_url, Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -3037,6 +3052,7 @@ async fn test_upstairs_repair_repair_id_and_type_conflict( Method::POST, ¬ify_start_url, Some(internal::nexus::RepairStartInfo { + time: Utc::now(), session_id, repair_id, repair_type: internal::nexus::UpstairsRepairType::Live, @@ -3057,9 +3073,11 @@ async fn test_upstairs_repair_repair_id_and_type_conflict( Method::POST, ¬ify_start_url, Some(internal::nexus::RepairStartInfo { + time: Utc::now(), session_id, repair_id, - repair_type: internal::nexus::UpstairsRepairType::Reconciliation, + repair_type: + internal::nexus::UpstairsRepairType::Reconciliation, repairs: vec![internal::nexus::DownstairsUnderRepair { region_uuid: region_id, target_addr: "[fd00:1122:3344:101::8]:12345" @@ -3072,3 +3090,88 @@ async fn test_upstairs_repair_repair_id_and_type_conflict( 
.await .unwrap_err(); } + +/// Test that an Upstairs can submit progress for a repair +#[nexus_test] +async fn test_upstairs_repair_submit_progress( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let session_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + let region_id: TypedUuid = TypedUuid::new_v4(); + + // A repair must be started before progress can be submitted + + let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); + + int_client + .make_request( + Method::POST, + ¬ify_start_url, + Some(internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + let progress_url = + format!("/upstairs/{upstairs_id}/repair/{repair_id}/progress"); + + for i in 0..100 { + int_client + .make_request( + Method::POST, + &progress_url, + Some(internal::nexus::RepairProgress { + time: Utc::now(), + current_item: i, + total_items: 100, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + } +} + +/// Test that an Upstairs can't submit progress unless a repair was started +#[nexus_test] +async fn test_upstairs_repair_reject_submit_progress_when_no_repair( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let repair_id: TypedUuid = TypedUuid::new_v4(); + + let progress_url = + format!("/upstairs/{upstairs_id}/repair/{repair_id}/progress"); + + int_client + .make_request( + Method::POST, + &progress_url, + Some(internal::nexus::RepairProgress { + time: Utc::now(), + current_item: 10, + total_items: 100, + }), + 
StatusCode::BAD_REQUEST, // XXX 404 instead? + ) + .await + .unwrap_err(); +} diff --git a/schema/crdb/38.0.0/up04.sql b/schema/crdb/38.0.0/up04.sql new file mode 100644 index 0000000000..c01ee8152d --- /dev/null +++ b/schema/crdb/38.0.0/up04.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS upstairs_repair_progress ( + repair_id UUID NOT NULL, + time TIMESTAMPTZ NOT NULL, + current_item INT8 NOT NULL, + total_items INT8 NOT NULL, + + PRIMARY KEY (repair_id, time, current_item, total_items) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 5718d19411..32339c286a 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3544,6 +3544,15 @@ CREATE TABLE IF NOT EXISTS upstairs_repair_notification ( PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type) ); +CREATE TABLE IF NOT EXISTS upstairs_repair_progress ( + repair_id UUID NOT NULL, + time TIMESTAMPTZ NOT NULL, + current_item INT8 NOT NULL, + total_items INT8 NOT NULL, + + PRIMARY KEY (repair_id, time, current_item, total_items) +); + INSERT INTO omicron.public.db_metadata ( singleton, time_created, From d2bf5f566f0baf983f0e15577f8e7dd105756bce Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 27 Feb 2024 17:27:33 -0500 Subject: [PATCH 08/24] tests pass --- common/src/api/internal/nexus.rs | 1 + nexus/db-queries/src/db/pool_connection.rs | 3 +- openapi/nexus-internal.json | 114 +++++++++++++++++++-- 3 files changed, 106 insertions(+), 12 deletions(-) diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index b347a5ddb7..a36fea96a6 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -257,6 +257,7 @@ pub enum HostIdentifier { } #[derive(Debug, Deserialize, Serialize, JsonSchema, Clone, Copy)] +#[serde(rename_all = "snake_case")] pub enum UpstairsRepairType { Live, Reconciliation, diff --git a/nexus/db-queries/src/db/pool_connection.rs b/nexus/db-queries/src/db/pool_connection.rs index 
81855206e9..39b3533993 100644 --- a/nexus/db-queries/src/db/pool_connection.rs +++ b/nexus/db-queries/src/db/pool_connection.rs @@ -51,7 +51,6 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[ "ip_attach_state", "ip_kind", "ip_pool_resource_type", - "live_repair_notification_type", "network_interface_kind", "physical_disk_kind", "producer_kind", @@ -69,6 +68,8 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[ "switch_link_fec", "switch_link_speed", "switch_port_geometry", + "upstairs_repair_notification_type", + "upstairs_repair_type", "user_provision_type", "vpc_firewall_rule_action", "vpc_firewall_rule_direction", diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index b92c5f81a3..c1137e3914 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -962,10 +962,55 @@ } } }, - "/upstairs/{upstairs_id}/live-repair-finish": { + "/upstairs/{upstairs_id}/repair/{repair_id}/progress": { "post": { - "summary": "An Upstairs will notify this endpoint when a live repair finishes.", - "operationId": "cpapi_live_repair_finish", + "summary": "An Upstairs will update this endpoint with the progress of a repair", + "operationId": "cpapi_upstairs_repair_progress", + "parameters": [ + { + "in": "path", + "name": "repair_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsRepairKind" + } + }, + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RepairProgress" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/upstairs/{upstairs_id}/repair-finish": { + "post": { + "summary": "An Upstairs will notify this 
endpoint when a repair finishes.", + "operationId": "cpapi_upstairs_repair_finish", "parameters": [ { "in": "path", @@ -999,10 +1044,10 @@ } } }, - "/upstairs/{upstairs_id}/live-repair-start": { + "/upstairs/{upstairs_id}/repair-start": { "post": { - "summary": "An Upstairs will notify this endpoint when a live repair starts", - "operationId": "cpapi_live_repair_start", + "summary": "An Upstairs will notify this endpoint when a repair starts", + "operationId": "cpapi_upstairs_repair_start", "parameters": [ { "in": "path", @@ -5855,7 +5900,10 @@ "type": "boolean" }, "repair_id": { - "$ref": "#/components/schemas/TypedUuidForLiveRepairKind" + "$ref": "#/components/schemas/TypedUuidForUpstairsRepairKind" + }, + "repair_type": { + "$ref": "#/components/schemas/UpstairsRepairType" }, "repairs": { "type": "array", @@ -5865,20 +5913,51 @@ }, "session_id": { "$ref": "#/components/schemas/TypedUuidForUpstairsSessionKind" + }, + "time": { + "type": "string", + "format": "date-time" } }, "required": [ "aborted", "repair_id", + "repair_type", "repairs", - "session_id" + "session_id", + "time" + ] + }, + "RepairProgress": { + "type": "object", + "properties": { + "current_item": { + "type": "integer", + "format": "int64" + }, + "time": { + "type": "string", + "format": "date-time" + }, + "total_items": { + "type": "integer", + "format": "int64" + } + }, + "required": [ + "current_item", + "time", + "total_items" ] }, "RepairStartInfo": { "type": "object", "properties": { "repair_id": { - "$ref": "#/components/schemas/TypedUuidForLiveRepairKind" + "$ref": "#/components/schemas/TypedUuidForUpstairsRepairKind" + }, + "repair_type": { + "$ref": "#/components/schemas/UpstairsRepairType" }, "repairs": { "type": "array", @@ -5888,12 +5967,18 @@ }, "session_id": { "$ref": "#/components/schemas/TypedUuidForUpstairsSessionKind" + }, + "time": { + "type": "string", + "format": "date-time" } }, "required": [ "repair_id", + "repair_type", "repairs", - "session_id" + "session_id", + "time" ] 
}, "RouteConfig": { @@ -6687,7 +6772,7 @@ "type": "string", "format": "uuid" }, - "TypedUuidForLiveRepairKind": { + "TypedUuidForUpstairsRepairKind": { "type": "string", "format": "uuid" }, @@ -6695,6 +6780,13 @@ "type": "string", "format": "uuid" }, + "UpstairsRepairType": { + "type": "string", + "enum": [ + "live", + "reconciliation" + ] + }, "UserId": { "title": "A name unique within the parent collection", "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID though they may contain a UUID.", From 832e649f9514b48c7146222d31c9effb483d7398 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 28 Feb 2024 13:32:48 -0500 Subject: [PATCH 09/24] simple mismatched record type check --- nexus/db-queries/src/db/datastore/volume.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 1acf2b0e79..c297b976c8 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -851,12 +851,7 @@ impl DataStore { .execute_async(&conn) .await?; - if mismatched_record_type_count == 0 { - // ok, no existing records or the existing records match - // the type - } else if mismatched_record_type_count == 1 { - // XXX is it possible that the match count is larger - // than 1? 
+ if mismatched_record_type_count > 0 { return Err(err.bail(Error::conflict(&format!( "existing repair type for id {} does not match {:?}!", record.repair_id, From 53947b384aea248a8a78c49cc27f162d45c81044 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 1 Mar 2024 16:57:10 -0500 Subject: [PATCH 10/24] bad merge --- nexus/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index e234d5a412..c428c436bd 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -76,7 +76,7 @@ tokio-postgres = { workspace = true, features = ["with-serde_json-1"] } tough.workspace = true trust-dns-resolver.workspace = true uuid.workspace = true -nexus-blueprint-execution.workspace = true + nexus-defaults.workspace = true nexus-db-model.workspace = true nexus-db-queries.workspace = true From 1c81d75ee67258746b8abc7d129af74ec9477541 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 1 Mar 2024 17:02:38 -0500 Subject: [PATCH 11/24] move retry_until_known_result into common --- Cargo.lock | 1 + common/Cargo.toml | 1 + common/src/lib.rs | 81 ++++++++++++++++++ nexus/src/app/instance_network.rs | 2 +- nexus/src/app/sagas/common_storage.rs | 2 +- .../src/app/sagas/loopback_address_create.rs | 2 +- .../src/app/sagas/loopback_address_delete.rs | 2 +- nexus/src/app/sagas/mod.rs | 82 ------------------- nexus/src/app/sagas/snapshot_create.rs | 2 +- .../app/sagas/switch_port_settings_apply.rs | 2 +- .../app/sagas/switch_port_settings_clear.rs | 2 +- 11 files changed, 90 insertions(+), 89 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0ab06b5fdd..1bbeadd391 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4891,6 +4891,7 @@ dependencies = [ "once_cell", "parse-display", "progenitor", + "progenitor-client", "proptest", "rand 0.8.5", "regress", diff --git a/common/Cargo.toml b/common/Cargo.toml index 5628a93397..39f81393b4 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -37,6 +37,7 @@ toml.workspace = true 
uuid.workspace = true parse-display.workspace = true progenitor.workspace = true +progenitor-client.workspace = true omicron-workspace-hack.workspace = true once_cell.workspace = true regress.workspace = true diff --git a/common/src/lib.rs b/common/src/lib.rs index 0d63de90fb..f743824f7a 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -79,3 +79,84 @@ impl slog::KV for FileKv { } pub const OMICRON_DPD_TAG: &str = "omicron"; + +use futures::Future; +use slog::warn; + +/// Retry a progenitor client operation until a known result is returned. +/// +/// Saga execution relies on the outcome of an external call being known: since +/// they are idempotent, reissue the external call until a known result comes +/// back. Retry if a communication error is seen, or if another retryable error +/// is seen. +/// +/// Note that retrying is only valid if the call itself is idempotent. +pub async fn retry_until_known_result( + log: &slog::Logger, + mut f: F, +) -> Result> +where + F: FnMut() -> Fut, + Fut: Future>>, + E: std::fmt::Debug, +{ + backoff::retry_notify( + backoff::retry_policy_internal_service(), + move || { + let fut = f(); + async move { + match fut.await { + Err(progenitor_client::Error::CommunicationError(e)) => { + warn!( + log, + "saw transient communication error {}, retrying...", + e, + ); + + Err(backoff::BackoffError::transient( + progenitor_client::Error::CommunicationError(e), + )) + } + + Err(progenitor_client::Error::ErrorResponse( + response_value, + )) => { + match response_value.status() { + // Retry on 503 or 429 + http::StatusCode::SERVICE_UNAVAILABLE + | http::StatusCode::TOO_MANY_REQUESTS => { + Err(backoff::BackoffError::transient( + progenitor_client::Error::ErrorResponse( + response_value, + ), + )) + } + + // Anything else is a permanent error + _ => Err(backoff::BackoffError::Permanent( + progenitor_client::Error::ErrorResponse( + response_value, + ), + )), + } + } + + Err(e) => { + warn!(log, "saw permanent error {}, aborting", e,); 
+ + Err(backoff::BackoffError::Permanent(e)) + } + + Ok(v) => Ok(v), + } + } + }, + |error: progenitor_client::Error<_>, delay| { + warn!( + log, + "failed external call ({:?}), will retry in {:?}", error, delay, + ); + }, + ) + .await +} diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index c0bc5d237b..1a1a393657 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -4,7 +4,6 @@ //! Routines that manage instance-related networking state. -use crate::app::sagas::retry_until_known_result; use ipnetwork::IpNetwork; use ipnetwork::Ipv6Network; use nexus_db_model::ExternalIp; @@ -23,6 +22,7 @@ use omicron_common::api::external::Ipv4Net; use omicron_common::api::external::Ipv6Net; use omicron_common::api::internal::nexus; use omicron_common::api::internal::shared::SwitchLocation; +use omicron_common::retry_until_known_result; use sled_agent_client::types::DeleteVirtualNetworkInterfaceHost; use sled_agent_client::types::SetVirtualNetworkInterfaceHost; use std::collections::HashSet; diff --git a/nexus/src/app/sagas/common_storage.rs b/nexus/src/app/sagas/common_storage.rs index a7350d91fd..3b590f6205 100644 --- a/nexus/src/app/sagas/common_storage.rs +++ b/nexus/src/app/sagas/common_storage.rs @@ -6,7 +6,6 @@ use super::*; -use crate::app::sagas::retry_until_known_result; use crate::Nexus; use anyhow::anyhow; use crucible_agent_client::{ @@ -22,6 +21,7 @@ use nexus_db_queries::db::identity::Asset; use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external::Error; use omicron_common::backoff::{self, BackoffError}; +use omicron_common::retry_until_known_result; use slog::Logger; use std::net::SocketAddrV6; diff --git a/nexus/src/app/sagas/loopback_address_create.rs b/nexus/src/app/sagas/loopback_address_create.rs index a5d89f202c..c32a5f387d 100644 --- a/nexus/src/app/sagas/loopback_address_create.rs +++ b/nexus/src/app/sagas/loopback_address_create.rs @@ -3,7 +3,6 @@ // file, You 
can obtain one at https://mozilla.org/MPL/2.0/. use super::{NexusActionContext, NEXUS_DPD_TAG}; -use crate::app::sagas::retry_until_known_result; use crate::app::sagas::{ declare_saga_actions, ActionRegistry, NexusSaga, SagaInitError, }; @@ -13,6 +12,7 @@ use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::db::model::LoopbackAddress; use omicron_common::api::internal::shared::SwitchLocation; +use omicron_common::retry_until_known_result; use serde::{Deserialize, Serialize}; use std::sync::Arc; use steno::ActionError; diff --git a/nexus/src/app/sagas/loopback_address_delete.rs b/nexus/src/app/sagas/loopback_address_delete.rs index a030178d27..822a360acf 100644 --- a/nexus/src/app/sagas/loopback_address_delete.rs +++ b/nexus/src/app/sagas/loopback_address_delete.rs @@ -3,7 +3,6 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::NexusActionContext; -use crate::app::sagas::retry_until_known_result; use crate::app::sagas::{ declare_saga_actions, ActionRegistry, NexusSaga, SagaInitError, }; @@ -14,6 +13,7 @@ use nexus_db_queries::authz; use nexus_db_queries::db::model::{LoopbackAddress, Name}; use nexus_types::identity::Asset; use omicron_common::api::external::{IpNet, NameOrId}; +use omicron_common::retry_until_known_result; use serde::{Deserialize, Serialize}; use std::sync::Arc; use steno::ActionError; diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index e9f800c61b..056f6bf277 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -328,88 +328,6 @@ pub(crate) use __emit_action; pub(crate) use __stringify_ident; pub(crate) use declare_saga_actions; -use futures::Future; - -/// Retry a progenitor client operation until a known result is returned. -/// -/// Saga execution relies on the outcome of an external call being known: since -/// they are idempotent, reissue the external call until a known result comes -/// back. 
Retry if a communication error is seen, or if another retryable error -/// is seen. -/// -/// Note that retrying is only valid if the call itself is idempotent. -pub(crate) async fn retry_until_known_result( - log: &slog::Logger, - mut f: F, -) -> Result> -where - F: FnMut() -> Fut, - Fut: Future>>, - E: std::fmt::Debug, -{ - use omicron_common::backoff; - - backoff::retry_notify( - backoff::retry_policy_internal_service(), - move || { - let fut = f(); - async move { - match fut.await { - Err(progenitor_client::Error::CommunicationError(e)) => { - warn!( - log, - "saw transient communication error {}, retrying...", - e, - ); - - Err(backoff::BackoffError::transient( - progenitor_client::Error::CommunicationError(e), - )) - } - - Err(progenitor_client::Error::ErrorResponse( - response_value, - )) => { - match response_value.status() { - // Retry on 503 or 429 - http::StatusCode::SERVICE_UNAVAILABLE - | http::StatusCode::TOO_MANY_REQUESTS => { - Err(backoff::BackoffError::transient( - progenitor_client::Error::ErrorResponse( - response_value, - ), - )) - } - - // Anything else is a permanent error - _ => Err(backoff::BackoffError::Permanent( - progenitor_client::Error::ErrorResponse( - response_value, - ), - )), - } - } - - Err(e) => { - warn!(log, "saw permanent error {}, aborting", e,); - - Err(backoff::BackoffError::Permanent(e)) - } - - Ok(v) => Ok(v), - } - } - }, - |error: progenitor_client::Error<_>, delay| { - warn!( - log, - "failed external call ({:?}), will retry in {:?}", error, delay, - ); - }, - ) - .await -} - /// Reliable persistent workflows can request that sagas be run as part of their /// activation by sending a SagaRequest through a supplied channel to Nexus. 
pub enum SagaRequest { diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index e017ab377b..f1d1a2bd02 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -99,7 +99,6 @@ use super::{ ACTION_GENERATE_ID, }; use crate::app::sagas::declare_saga_actions; -use crate::app::sagas::retry_until_known_result; use crate::app::{authn, authz, db}; use crate::external_api::params; use anyhow::anyhow; @@ -109,6 +108,7 @@ use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external; use omicron_common::api::external::Error; +use omicron_common::retry_until_known_result; use rand::{rngs::StdRng, RngCore, SeedableRng}; use serde::Deserialize; use serde::Serialize; diff --git a/nexus/src/app/sagas/switch_port_settings_apply.rs b/nexus/src/app/sagas/switch_port_settings_apply.rs index 9e2331f416..44f2f77ea1 100644 --- a/nexus/src/app/sagas/switch_port_settings_apply.rs +++ b/nexus/src/app/sagas/switch_port_settings_apply.rs @@ -3,7 +3,6 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
use super::{NexusActionContext, NEXUS_DPD_TAG}; -use crate::app::sagas::retry_until_known_result; use crate::app::sagas::switch_port_settings_common::{ api_to_dpd_port_settings, ensure_switch_port_bgp_settings, ensure_switch_port_uplink, select_dendrite_client, select_mg_client, @@ -24,6 +23,7 @@ use nexus_db_queries::db::datastore::UpdatePrecondition; use nexus_db_queries::{authn, db}; use omicron_common::api::external::{self, NameOrId}; use omicron_common::api::internal::shared::SwitchLocation; +use omicron_common::retry_until_known_result; use serde::{Deserialize, Serialize}; use std::net::IpAddr; use std::str::FromStr; diff --git a/nexus/src/app/sagas/switch_port_settings_clear.rs b/nexus/src/app/sagas/switch_port_settings_clear.rs index 15290dd75b..2e35530ef1 100644 --- a/nexus/src/app/sagas/switch_port_settings_clear.rs +++ b/nexus/src/app/sagas/switch_port_settings_clear.rs @@ -3,7 +3,6 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::{NexusActionContext, NEXUS_DPD_TAG}; -use crate::app::sagas::retry_until_known_result; use crate::app::sagas::switch_port_settings_common::{ api_to_dpd_port_settings, apply_bootstore_update, bootstore_update, ensure_switch_port_bgp_settings, ensure_switch_port_uplink, @@ -23,6 +22,7 @@ use nexus_db_model::NETWORK_KEY; use nexus_db_queries::authn; use nexus_db_queries::db::datastore::UpdatePrecondition; use omicron_common::api::external::{self, NameOrId, SwitchLocation}; +use omicron_common::retry_until_known_result; use serde::{Deserialize, Serialize}; use std::net::IpAddr; use std::str::FromStr; From c39094e8c79c4d2b067ba4ea9396a9f0dca1daaf Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 8 Mar 2024 16:43:22 -0500 Subject: [PATCH 12/24] prepend /crucible/0/ --- nexus/src/internal_api/http_entrypoints.rs | 6 +- openapi/nexus-internal.json | 246 ++++++++++----------- 2 files changed, 126 insertions(+), 126 deletions(-) diff --git a/nexus/src/internal_api/http_entrypoints.rs 
b/nexus/src/internal_api/http_entrypoints.rs index 95a12f6f97..6e5fa2b633 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -498,7 +498,7 @@ struct UpstairsPathParam { /// An Upstairs will notify this endpoint when a repair starts #[endpoint { method = POST, - path = "/upstairs/{upstairs_id}/repair-start", + path = "/crucible/0/upstairs/{upstairs_id}/repair-start", }] async fn cpapi_upstairs_repair_start( rqctx: RequestContext>, @@ -526,7 +526,7 @@ async fn cpapi_upstairs_repair_start( /// An Upstairs will notify this endpoint when a repair finishes. #[endpoint { method = POST, - path = "/upstairs/{upstairs_id}/repair-finish", + path = "/crucible/0/upstairs/{upstairs_id}/repair-finish", }] async fn cpapi_upstairs_repair_finish( rqctx: RequestContext>, @@ -561,7 +561,7 @@ struct UpstairsRepairPathParam { /// An Upstairs will update this endpoint with the progress of a repair #[endpoint { method = POST, - path = "/upstairs/{upstairs_id}/repair/{repair_id}/progress", + path = "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress", }] async fn cpapi_upstairs_repair_progress( rqctx: RequestContext>, diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index c1137e3914..ac54aa2bce 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -125,6 +125,125 @@ } } }, + "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress": { + "post": { + "summary": "An Upstairs will update this endpoint with the progress of a repair", + "operationId": "cpapi_upstairs_repair_progress", + "parameters": [ + { + "in": "path", + "name": "repair_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsRepairKind" + } + }, + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + 
"$ref": "#/components/schemas/RepairProgress" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/crucible/0/upstairs/{upstairs_id}/repair-finish": { + "post": { + "summary": "An Upstairs will notify this endpoint when a repair finishes.", + "operationId": "cpapi_upstairs_repair_finish", + "parameters": [ + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RepairFinishInfo" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/crucible/0/upstairs/{upstairs_id}/repair-start": { + "post": { + "summary": "An Upstairs will notify this endpoint when a repair starts", + "operationId": "cpapi_upstairs_repair_start", + "parameters": [ + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RepairStartInfo" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/deployment/blueprints/all": { "get": { "summary": "Lists blueprints", @@ -962,125 +1081,6 @@ } } }, - "/upstairs/{upstairs_id}/repair/{repair_id}/progress": { - "post": { - "summary": "An Upstairs will update this endpoint with the progress of a repair", - "operationId": 
"cpapi_upstairs_repair_progress", - "parameters": [ - { - "in": "path", - "name": "repair_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForUpstairsRepairKind" - } - }, - { - "in": "path", - "name": "upstairs_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForUpstairsKind" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RepairProgress" - } - } - }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, - "/upstairs/{upstairs_id}/repair-finish": { - "post": { - "summary": "An Upstairs will notify this endpoint when a repair finishes.", - "operationId": "cpapi_upstairs_repair_finish", - "parameters": [ - { - "in": "path", - "name": "upstairs_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForUpstairsKind" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RepairFinishInfo" - } - } - }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, - "/upstairs/{upstairs_id}/repair-start": { - "post": { - "summary": "An Upstairs will notify this endpoint when a repair starts", - "operationId": "cpapi_upstairs_repair_start", - "parameters": [ - { - "in": "path", - "name": "upstairs_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForUpstairsKind" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RepairStartInfo" - } - } - }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, - "4XX": { - 
"$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/volume/{volume_id}/remove-read-only-parent": { "post": { "summary": "Request removal of a read_only_parent from a volume", @@ -6873,6 +6873,10 @@ "type": "string", "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" }, + "TypedUuidForUpstairsKind": { + "type": "string", + "format": "uuid" + }, "IdSortMode": { "description": "Supported set of sort modes for scanning by id only.\n\nCurrently, we only support scanning in ascending order.", "oneOf": [ @@ -6884,10 +6888,6 @@ ] } ] - }, - "TypedUuidForUpstairsKind": { - "type": "string", - "format": "uuid" } }, "responses": { From 63d05edfe5067420cd48aa0552ec273119c1f281 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 11 Mar 2024 13:11:10 -0400 Subject: [PATCH 13/24] add downstairs client task stopped notification --- common/src/api/internal/nexus.rs | 19 ++++++ nexus/db-model/src/downstairs.rs | 66 +++++++++++++++++++ nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/schema.rs | 9 +++ nexus/db-queries/src/db/datastore/volume.rs | 29 +++++++++ nexus/src/app/volume.rs | 26 ++++++++ nexus/src/internal_api/http_entrypoints.rs | 39 +++++++++++ openapi/nexus-internal.json | 71 +++++++++++++++++++++ uuid-kinds/src/lib.rs | 1 + 9 files changed, 262 insertions(+) create mode 100644 nexus/db-model/src/downstairs.rs diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index a36fea96a6..caaf039881 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -294,3 +294,22 @@ pub struct RepairProgress { pub current_item: i64, pub total_items: i64, } + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub enum DownstairsClientStopReason { + Replacing, + Disabled, + FailedReconcile, + IOError, + 
BadNegotiationOrder, + Incompatible, + FailedLiveRepair, + TooManyOutstandingJobs, + Deactivated, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct DownstairsClientStopped { + pub time: DateTime, + pub reason: DownstairsClientStopReason, +} diff --git a/nexus/db-model/src/downstairs.rs b/nexus/db-model/src/downstairs.rs new file mode 100644 index 0000000000..744dae9e45 --- /dev/null +++ b/nexus/db-model/src/downstairs.rs @@ -0,0 +1,66 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use crate::schema::downstairs_client_stopped_notification; +use crate::typed_uuid::DbTypedUuid; +use chrono::{DateTime, Utc}; +use omicron_common::api::internal; +use omicron_uuid_kinds::DownstairsKind; +use omicron_uuid_kinds::UpstairsKind; +use serde::{Deserialize, Serialize}; + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "downstairs_client_stop_reason_type", schema = "public"))] + pub struct DownstairsClientStopReasonEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = DownstairsClientStopReasonEnum)] + pub enum DownstairsClientStopReason; + + // Reason types + Replacing => b"replacing" + Disabled => b"disabled" + FailedReconcile => b"failed_reconcile" + IOError => b"io_error" + BadNegotiationOrder => b"bad_negotiation_order" + Incompatible => b"incompatible" + FailedLiveRepair => b"failed_live_repair" + TooManyOutstandingJobs => b"too_many_outstanding_jobs" + Deactivated => b"deactivated" +); + +impl From for DownstairsClientStopReason { + fn from(v: internal::nexus::DownstairsClientStopReason) -> DownstairsClientStopReason { + match v { + internal::nexus::DownstairsClientStopReason::Replacing => DownstairsClientStopReason::Replacing, + 
internal::nexus::DownstairsClientStopReason::Disabled => DownstairsClientStopReason::Disabled, + internal::nexus::DownstairsClientStopReason::FailedReconcile => DownstairsClientStopReason::FailedReconcile, + internal::nexus::DownstairsClientStopReason::IOError => DownstairsClientStopReason::IOError, + internal::nexus::DownstairsClientStopReason::BadNegotiationOrder => DownstairsClientStopReason::BadNegotiationOrder, + internal::nexus::DownstairsClientStopReason::Incompatible => DownstairsClientStopReason::Incompatible, + internal::nexus::DownstairsClientStopReason::FailedLiveRepair => DownstairsClientStopReason::FailedLiveRepair, + internal::nexus::DownstairsClientStopReason::TooManyOutstandingJobs => DownstairsClientStopReason::TooManyOutstandingJobs, + internal::nexus::DownstairsClientStopReason::Deactivated => DownstairsClientStopReason::Deactivated, + } + } +} + +/// A Record of when an Upstairs stopped a Downstairs client task +#[derive(Queryable, Insertable, Debug, Clone, Selectable)] +#[diesel(table_name = downstairs_client_stopped_notification)] +pub struct DownstairsClientStoppedNotification { + // Importantly, this is client time, not Nexus' time that it received the + // notification. + pub time: DateTime, + + // Which Upstairs sent this notification? + pub upstairs_id: DbTypedUuid, + + // Which Downstairs client was stopped? 
+ pub downstairs_id: DbTypedUuid, + + pub reason: DownstairsClientStopReason, +} diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 7cbddc1a00..db097aae40 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -26,6 +26,7 @@ mod digest; mod disk; mod disk_state; mod dns; +mod downstairs; mod external_ip; mod generation; mod identity_provider; @@ -127,6 +128,7 @@ pub use digest::*; pub use disk::*; pub use disk_state::*; pub use dns::*; +pub use downstairs::*; pub use external_ip::*; pub use generation::*; pub use identity_provider::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 598794bc93..85ecd53a86 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1545,6 +1545,15 @@ table! { } } +table! { + downstairs_client_stopped_notification (downstairs_id, time, reason) { + time -> Timestamptz, + upstairs_id -> Uuid, + downstairs_id -> Uuid, + reason -> crate::DownstairsClientStopReasonEnum, + } +} + table! 
{ db_metadata (singleton) { singleton -> Bool, diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index c297b976c8..98a40f24d8 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -13,6 +13,7 @@ use crate::db::identity::Asset; use crate::db::model::Dataset; use crate::db::model::Region; use crate::db::model::RegionSnapshot; +use crate::db::model::DownstairsClientStoppedNotification; use crate::db::model::UpstairsRepairNotification; use crate::db::model::UpstairsRepairNotificationType; use crate::db::model::UpstairsRepairProgress; @@ -30,8 +31,10 @@ use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::ResourceType; use omicron_common::api::internal::nexus::RepairProgress; +use omicron_common::api::internal::nexus::DownstairsClientStopped; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; use serde::Deserialize; use serde::Deserializer; @@ -988,6 +991,32 @@ impl DataStore { } }) } + + /// Record when a Downstairs client is stopped, and why + pub async fn downstairs_stopped_notification( + &self, + opctx: &OpContext, + upstairs_id: TypedUuid, + downstairs_id: TypedUuid, + downstairs_client_stopped: DownstairsClientStopped, + ) -> Result<(), Error> { + use db::schema::downstairs_client_stopped_notification::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + + diesel::insert_into(dsl::downstairs_client_stopped_notification) + .values(DownstairsClientStoppedNotification { + time: downstairs_client_stopped.time, + upstairs_id: upstairs_id.into(), + downstairs_id: downstairs_id.into(), + reason: downstairs_client_stopped.reason.into(), + }) + .execute_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } } 
#[derive(Default, Clone, Debug, Serialize, Deserialize)] diff --git a/nexus/src/app/volume.rs b/nexus/src/app/volume.rs index 60ba36091f..0af72d4c8b 100644 --- a/nexus/src/app/volume.rs +++ b/nexus/src/app/volume.rs @@ -13,8 +13,10 @@ use omicron_common::api::external::DeleteResult; use omicron_common::api::internal::nexus::RepairFinishInfo; use omicron_common::api::internal::nexus::RepairProgress; use omicron_common::api::internal::nexus::RepairStartInfo; +use omicron_common::api::internal::nexus::DownstairsClientStopped; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; +use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; use std::sync::Arc; use uuid::Uuid; @@ -140,4 +142,28 @@ impl super::Nexus { ) .await } + + /// An Upstairs is telling us that a Downstairs client task was stopped + pub(crate) async fn downstairs_stopped_notification( + self: &Arc, + opctx: &OpContext, + upstairs_id: TypedUuid, + downstairs_id: TypedUuid, + downstairs_client_stopped: DownstairsClientStopped, + ) -> DeleteResult { + info!( + self.log, + "received downstairs_stopped_notification from upstairs {upstairs_id} for downstairs {downstairs_id}: {:?}", + downstairs_client_stopped, + ); + + self.db_datastore + .downstairs_stopped_notification( + opctx, + upstairs_id, + downstairs_id, + downstairs_client_stopped, + ) + .await + } } diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 6e5fa2b633..5673d3b5e4 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -45,10 +45,12 @@ use omicron_common::api::internal::nexus::RepairFinishInfo; use omicron_common::api::internal::nexus::RepairProgress; use omicron_common::api::internal::nexus::RepairStartInfo; use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_common::api::internal::nexus::DownstairsClientStopped; use omicron_common::update::ArtifactId; use 
omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; +use omicron_uuid_kinds::DownstairsKind; use oximeter::types::ProducerResults; use oximeter_producer::{collect, ProducerIdPathParams}; use schemars::JsonSchema; @@ -81,6 +83,7 @@ pub(crate) fn internal_api() -> NexusApiDescription { api.register(cpapi_upstairs_repair_start)?; api.register(cpapi_upstairs_repair_finish)?; api.register(cpapi_upstairs_repair_progress)?; + api.register(cpapi_downstairs_stopped)?; api.register(saga_list)?; api.register(saga_view)?; @@ -587,6 +590,42 @@ async fn cpapi_upstairs_repair_progress( apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Path parameters for Downstairs requests (internal API) +#[derive(Deserialize, JsonSchema)] +struct UpstairsDownstairsPathParam { + upstairs_id: TypedUuid, + downstairs_id: TypedUuid, +} + +/// An Upstairs will update this endpoint if a Downstairs client task is stopped +#[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped", + }] +async fn cpapi_downstairs_stopped( + rqctx: RequestContext>, + path_params: Path, + downstairs_client_stopped: TypedBody, +) -> Result { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .downstairs_stopped_notification( + &opctx, + path.upstairs_id, + path.downstairs_id, + downstairs_client_stopped.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + // Sagas /// List sagas diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index ac54aa2bce..41a6904346 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -125,6 +125,43 @@ } } }, + 
"/crucible/0/downstairs/{downstairs_id}/stopped": { + "post": { + "summary": "An Upstairs will update this endpoint if a Downstairs client task is stopped", + "operationId": "cpapi_downstairs_stopped", + "parameters": [ + { + "in": "path", + "name": "downstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForDownstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DownstairsClientStopped" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress": { "post": { "summary": "An Upstairs will update this endpoint with the progress of a repair", @@ -3528,6 +3565,36 @@ } ] }, + "DownstairsClientStopReason": { + "type": "string", + "enum": [ + "Replacing", + "Disabled", + "FailedReconcile", + "IOError", + "BadNegotiationOrder", + "Incompatible", + "FailedLiveRepair", + "TooManyOutstandingJobs", + "Deactivated" + ] + }, + "DownstairsClientStopped": { + "type": "object", + "properties": { + "reason": { + "$ref": "#/components/schemas/DownstairsClientStopReason" + }, + "time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "reason", + "time" + ] + }, "DownstairsUnderRepair": { "type": "object", "properties": { @@ -6873,6 +6940,10 @@ "type": "string", "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" }, + "TypedUuidForDownstairsKind": { + "type": "string", + "format": "uuid" + }, "TypedUuidForUpstairsKind": { "type": "string", "format": "uuid" diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 10551683fb..7018485b59 100644 --- 
a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -45,6 +45,7 @@ macro_rules! impl_typed_uuid_kind { // Please keep this list in alphabetical order. impl_typed_uuid_kind! { + DownstairsKind => "downstairs", DownstairsRegionKind => "downstairs_region", LoopbackAddressKind => "loopback_address", TufRepoKind => "tuf_repo", From a5e0c9f6f3c5aea05f71ddb457b7bbbd607da054 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 11 Mar 2024 15:44:07 -0400 Subject: [PATCH 14/24] schema 38 -> 41 --- nexus/db-model/src/schema.rs | 2 +- schema/crdb/{38.0.0 => 41.0.0}/up01.sql | 0 schema/crdb/{38.0.0 => 41.0.0}/up02.sql | 0 schema/crdb/{38.0.0 => 41.0.0}/up03.sql | 0 schema/crdb/{38.0.0 => 41.0.0}/up04.sql | 0 schema/crdb/dbinit.sql | 2 +- 6 files changed, 2 insertions(+), 2 deletions(-) rename schema/crdb/{38.0.0 => 41.0.0}/up01.sql (100%) rename schema/crdb/{38.0.0 => 41.0.0}/up02.sql (100%) rename schema/crdb/{38.0.0 => 41.0.0}/up03.sql (100%) rename schema/crdb/{38.0.0 => 41.0.0}/up04.sql (100%) diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 5257da4867..a6c2b3afb8 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion; /// /// This should be updated whenever the schema is changed. For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(40, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(41, 0, 0); table! 
{ disk (id) { diff --git a/schema/crdb/38.0.0/up01.sql b/schema/crdb/41.0.0/up01.sql similarity index 100% rename from schema/crdb/38.0.0/up01.sql rename to schema/crdb/41.0.0/up01.sql diff --git a/schema/crdb/38.0.0/up02.sql b/schema/crdb/41.0.0/up02.sql similarity index 100% rename from schema/crdb/38.0.0/up02.sql rename to schema/crdb/41.0.0/up02.sql diff --git a/schema/crdb/38.0.0/up03.sql b/schema/crdb/41.0.0/up03.sql similarity index 100% rename from schema/crdb/38.0.0/up03.sql rename to schema/crdb/41.0.0/up03.sql diff --git a/schema/crdb/38.0.0/up04.sql b/schema/crdb/41.0.0/up04.sql similarity index 100% rename from schema/crdb/38.0.0/up04.sql rename to schema/crdb/41.0.0/up04.sql diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 964c36d35c..f2b4010d0a 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3617,7 +3617,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '40.0.0', NULL) + ( TRUE, NOW(), NOW(), '41.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From 2b6fa269ebd54e67f9481957bdd20c2b1a92def2 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 11 Mar 2024 15:52:03 -0400 Subject: [PATCH 15/24] downstairs_client_stopped_notification sql --- schema/crdb/41.0.0/up05.sql | 11 +++++++++++ schema/crdb/41.0.0/up06.sql | 8 ++++++++ schema/crdb/dbinit.sql | 21 +++++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 schema/crdb/41.0.0/up05.sql create mode 100644 schema/crdb/41.0.0/up06.sql diff --git a/schema/crdb/41.0.0/up05.sql b/schema/crdb/41.0.0/up05.sql new file mode 100644 index 0000000000..59679a453f --- /dev/null +++ b/schema/crdb/41.0.0/up05.sql @@ -0,0 +1,11 @@ +CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_reason_type AS ENUM ( + 'replacing', + 'disabled', + 'failed_reconcile', + 'io_error', + 'bad_negotiation_order', + 'incompatible', + 'failed_live_repair', + 'too_many_outstanding_jobs', + 'deactivated' +); diff --git 
a/schema/crdb/41.0.0/up06.sql b/schema/crdb/41.0.0/up06.sql new file mode 100644 index 0000000000..4e2b99ec96 --- /dev/null +++ b/schema/crdb/41.0.0/up06.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS downstairs_client_stopped_notification ( + time TIMESTAMPTZ NOT NULL, + upstairs_id UUID NOT NULL, + downstairs_id UUID NOT NULL, + reason omicron.public.downstairs_client_stop_reason_type NOT NULL, + + PRIMARY KEY (time, upstairs_id, downstairs_id, reason) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index f2b4010d0a..2f91bc5604 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3610,6 +3610,27 @@ CREATE TABLE IF NOT EXISTS upstairs_repair_progress ( PRIMARY KEY (repair_id, time, current_item, total_items) ); +CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_reason_type AS ENUM ( + 'replacing', + 'disabled', + 'failed_reconcile', + 'io_error', + 'bad_negotiation_order', + 'incompatible', + 'failed_live_repair', + 'too_many_outstanding_jobs', + 'deactivated' +); + +CREATE TABLE IF NOT EXISTS downstairs_client_stopped_notification ( + time TIMESTAMPTZ NOT NULL, + upstairs_id UUID NOT NULL, + downstairs_id UUID NOT NULL, + reason omicron.public.downstairs_client_stop_reason_type NOT NULL, + + PRIMARY KEY (time, upstairs_id, downstairs_id, reason) +); + INSERT INTO omicron.public.db_metadata ( singleton, time_created, From 83106c39d6d29a7b0662ae0c029d03b34f44eeac Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 11 Mar 2024 15:52:09 -0400 Subject: [PATCH 16/24] fmt --- nexus/db-model/src/downstairs.rs | 8 ++++++-- nexus/db-queries/src/db/datastore/volume.rs | 6 +++--- nexus/db-queries/src/db/pool_connection.rs | 1 + nexus/src/app/volume.rs | 4 ++-- nexus/src/internal_api/http_entrypoints.rs | 4 ++-- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/nexus/db-model/src/downstairs.rs b/nexus/db-model/src/downstairs.rs index 744dae9e45..e75155be16 100644 --- a/nexus/db-model/src/downstairs.rs +++ 
b/nexus/db-model/src/downstairs.rs @@ -32,8 +32,12 @@ impl_enum_type!( Deactivated => b"deactivated" ); -impl From for DownstairsClientStopReason { - fn from(v: internal::nexus::DownstairsClientStopReason) -> DownstairsClientStopReason { +impl From + for DownstairsClientStopReason +{ + fn from( + v: internal::nexus::DownstairsClientStopReason, + ) -> DownstairsClientStopReason { match v { internal::nexus::DownstairsClientStopReason::Replacing => DownstairsClientStopReason::Replacing, internal::nexus::DownstairsClientStopReason::Disabled => DownstairsClientStopReason::Disabled, diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 98a40f24d8..55916f66cc 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -11,9 +11,9 @@ use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::Dataset; +use crate::db::model::DownstairsClientStoppedNotification; use crate::db::model::Region; use crate::db::model::RegionSnapshot; -use crate::db::model::DownstairsClientStoppedNotification; use crate::db::model::UpstairsRepairNotification; use crate::db::model::UpstairsRepairNotificationType; use crate::db::model::UpstairsRepairProgress; @@ -30,11 +30,11 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::ResourceType; -use omicron_common::api::internal::nexus::RepairProgress; use omicron_common::api::internal::nexus::DownstairsClientStopped; +use omicron_common::api::internal::nexus::RepairProgress; +use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; -use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; use serde::Deserialize; use serde::Deserializer; diff --git 
a/nexus/db-queries/src/db/pool_connection.rs b/nexus/db-queries/src/db/pool_connection.rs index 4a77d0669c..7e65d42b50 100644 --- a/nexus/db-queries/src/db/pool_connection.rs +++ b/nexus/db-queries/src/db/pool_connection.rs @@ -44,6 +44,7 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[ "caboose_which", "dataset_kind", "dns_group", + "downstairs_client_stop_reason_type", "hw_power_state", "hw_rot_slot", "identity_type", diff --git a/nexus/src/app/volume.rs b/nexus/src/app/volume.rs index 0af72d4c8b..fd3f1d453f 100644 --- a/nexus/src/app/volume.rs +++ b/nexus/src/app/volume.rs @@ -10,13 +10,13 @@ use nexus_db_model::UpstairsRepairNotificationType; use nexus_db_queries::authn; use nexus_db_queries::context::OpContext; use omicron_common::api::external::DeleteResult; +use omicron_common::api::internal::nexus::DownstairsClientStopped; use omicron_common::api::internal::nexus::RepairFinishInfo; use omicron_common::api::internal::nexus::RepairProgress; use omicron_common::api::internal::nexus::RepairStartInfo; -use omicron_common::api::internal::nexus::DownstairsClientStopped; +use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; -use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; use std::sync::Arc; use uuid::Uuid; diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 92b712cd43..23f8db117c 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -43,17 +43,17 @@ use omicron_common::api::external::http_pagination::ScanById; use omicron_common::api::external::http_pagination::ScanParams; use omicron_common::api::external::Error; use omicron_common::api::internal::nexus::DiskRuntimeState; +use omicron_common::api::internal::nexus::DownstairsClientStopped; use omicron_common::api::internal::nexus::ProducerEndpoint; use 
omicron_common::api::internal::nexus::RepairFinishInfo; use omicron_common::api::internal::nexus::RepairProgress; use omicron_common::api::internal::nexus::RepairStartInfo; use omicron_common::api::internal::nexus::SledInstanceState; -use omicron_common::api::internal::nexus::DownstairsClientStopped; use omicron_common::update::ArtifactId; +use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; -use omicron_uuid_kinds::DownstairsKind; use oximeter::types::ProducerResults; use oximeter_producer::{collect, ProducerIdPathParams}; use schemars::JsonSchema; From 41e7aa7bba2b36e6e1300f659d8819712c9d0baf Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 11 Mar 2024 16:45:54 -0400 Subject: [PATCH 17/24] snake case please --- common/src/api/internal/nexus.rs | 1 + openapi/nexus-internal.json | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index caaf039881..460ec2c0ce 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -296,6 +296,7 @@ pub struct RepairProgress { } #[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +#[serde(rename_all = "snake_case")] pub enum DownstairsClientStopReason { Replacing, Disabled, diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 76fb312073..95bc3b07c4 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -3729,15 +3729,15 @@ "DownstairsClientStopReason": { "type": "string", "enum": [ - "Replacing", - "Disabled", - "FailedReconcile", - "IOError", - "BadNegotiationOrder", - "Incompatible", - "FailedLiveRepair", - "TooManyOutstandingJobs", - "Deactivated" + "replacing", + "disabled", + "failed_reconcile", + "i_o_error", + "bad_negotiation_order", + "incompatible", + "failed_live_repair", + "too_many_outstanding_jobs", + "deactivated" ] }, 
"DownstairsClientStopped": { From 8bc626cad17ef9cd15f65a05d718858dc54b1822 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 12 Mar 2024 15:57:08 -0400 Subject: [PATCH 18/24] update URLs with prefix --- .../integration_tests/volume_management.rs | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 3405a950dc..db0dbcf79b 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -2560,7 +2560,7 @@ async fn test_volume_hard_delete_idempotent( datastore.volume_hard_delete(volume_id).await.unwrap(); } -// upstairs related tests +// internal API related tests /// Test that an Upstairs can reissue live repair notifications #[nexus_test] @@ -2575,7 +2575,7 @@ async fn test_upstairs_repair_notify_idempotent( let region_id: TypedUuid = TypedUuid::new_v4(); // Notify start - let notify_url = format!("/upstairs/{upstairs_id}/repair-start"); + let notify_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); int_client .make_request( @@ -2620,7 +2620,7 @@ async fn test_upstairs_repair_notify_idempotent( .unwrap(); // Notify finish - let notify_url = format!("/upstairs/{upstairs_id}/repair-finish"); + let notify_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2680,7 +2680,7 @@ async fn test_upstairs_repair_notify_different_finish_status( let repair_id: TypedUuid = TypedUuid::new_v4(); let region_id: TypedUuid = TypedUuid::new_v4(); - let notify_url = format!("/upstairs/{upstairs_id}/repair-finish"); + let notify_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2741,8 +2741,8 @@ async fn test_upstairs_repair_same_upstairs_retry( // Simulate one failed repair - let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); - let notify_finish_url = 
format!("/upstairs/{upstairs_id}/repair-finish"); + let notify_start_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let notify_finish_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2849,8 +2849,8 @@ async fn test_upstairs_repair_different_upstairs_retry( // Simulate one failed repair by one Upstairs - let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); - let notify_finish_url = format!("/upstairs/{upstairs_id}/repair-finish"); + let notify_start_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let notify_finish_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2959,8 +2959,8 @@ async fn test_upstairs_repair_different_upstairs_retry_interrupted( // Simulate one failed repair by one Upstairs, which was interrupted (which // leads to no finish message). - let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); - let notify_finish_url = format!("/upstairs/{upstairs_id}/repair-finish"); + let notify_start_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let notify_finish_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -3045,7 +3045,7 @@ async fn test_upstairs_repair_repair_id_and_type_conflict( let repair_id: TypedUuid = TypedUuid::new_v4(); let region_id: TypedUuid = TypedUuid::new_v4(); - let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); + let notify_start_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); int_client .make_request( @@ -3105,7 +3105,7 @@ async fn test_upstairs_repair_submit_progress( // A repair must be started before progress can be submitted - let notify_start_url = format!("/upstairs/{upstairs_id}/repair-start"); + let notify_start_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); int_client .make_request( @@ -3129,7 +3129,7 @@ async fn 
test_upstairs_repair_submit_progress( .unwrap(); let progress_url = - format!("/upstairs/{upstairs_id}/repair/{repair_id}/progress"); + format!("/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress"); for i in 0..100 { int_client @@ -3159,7 +3159,7 @@ async fn test_upstairs_repair_reject_submit_progress_when_no_repair( let repair_id: TypedUuid = TypedUuid::new_v4(); let progress_url = - format!("/upstairs/{upstairs_id}/repair/{repair_id}/progress"); + format!("/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress"); int_client .make_request( From 0b37b63bb752243298e894bb186eb8d6e907374b Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 12 Mar 2024 15:57:24 -0400 Subject: [PATCH 19/24] use new Error::non_resourcetype_not_found --- nexus/db-queries/src/db/datastore/volume.rs | 10 ++++++++-- nexus/tests/integration_tests/volume_management.rs | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 55916f66cc..0c6ba6e475 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -963,8 +963,7 @@ impl DataStore { .optional()?; if matching_repair.is_none() { - // XXX should be 404 - return Err(err.bail(Error::invalid_request(&format!( + return Err(err.bail(Error::non_resourcetype_not_found(&format!( "upstairs {upstairs_id} repair {repair_id} not found" )))); } @@ -1011,6 +1010,13 @@ impl DataStore { downstairs_id: downstairs_id.into(), reason: downstairs_client_stopped.reason.into(), }) + .on_conflict(( + dsl::time, + dsl::upstairs_id, + dsl::downstairs_id, + dsl::reason, + )) + .do_nothing() .execute_async(&*conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index db0dbcf79b..958329bf8c 100644 --- 
a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -3170,7 +3170,7 @@ async fn test_upstairs_repair_reject_submit_progress_when_no_repair( current_item: 10, total_items: 100, }), - StatusCode::BAD_REQUEST, // XXX 404 instead? + StatusCode::NOT_FOUND, ) .await .unwrap_err(); From 3dfabf02755ab1643b26441cadd297e94cf32b32 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 12 Mar 2024 16:22:16 -0400 Subject: [PATCH 20/24] use a variable, they are the same requests --- .../integration_tests/volume_management.rs | 84 +++++++------------ 1 file changed, 31 insertions(+), 53 deletions(-) diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 958329bf8c..d6e1434ac1 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -2574,25 +2574,25 @@ async fn test_upstairs_repair_notify_idempotent( let repair_id: TypedUuid = TypedUuid::new_v4(); let region_id: TypedUuid = TypedUuid::new_v4(); - // Notify start + // Send the same start request. 
let notify_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let request = internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345".parse().unwrap(), + }], + }; + int_client .make_request( Method::POST, ¬ify_url, - Some(internal::nexus::RepairStartInfo { - time: Utc::now(), - session_id, - repair_id, - repair_type: internal::nexus::UpstairsRepairType::Live, - repairs: vec![internal::nexus::DownstairsUnderRepair { - region_uuid: region_id, - target_addr: "[fd00:1122:3344:101::8]:12345" - .parse() - .unwrap(), - }], - }), + Some(request.clone()), StatusCode::NO_CONTENT, ) .await @@ -2602,43 +2602,33 @@ async fn test_upstairs_repair_notify_idempotent( .make_request( Method::POST, ¬ify_url, - Some(internal::nexus::RepairStartInfo { - time: Utc::now(), - session_id, - repair_id, - repair_type: internal::nexus::UpstairsRepairType::Live, - repairs: vec![internal::nexus::DownstairsUnderRepair { - region_uuid: region_id, - target_addr: "[fd00:1122:3344:101::8]:12345" - .parse() - .unwrap(), - }], - }), + Some(request), StatusCode::NO_CONTENT, ) .await .unwrap(); - // Notify finish - let notify_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); + // Send the same finish request. 
+ let notify_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); + + let request = internal::nexus::RepairFinishInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345".parse().unwrap(), + }], + aborted: false, + }; int_client .make_request( Method::POST, ¬ify_url, - Some(internal::nexus::RepairFinishInfo { - time: Utc::now(), - session_id, - repair_id, - repair_type: internal::nexus::UpstairsRepairType::Live, - repairs: vec![internal::nexus::DownstairsUnderRepair { - region_uuid: region_id, - target_addr: "[fd00:1122:3344:101::8]:12345" - .parse() - .unwrap(), - }], - aborted: false, - }), + Some(request.clone()), StatusCode::NO_CONTENT, ) .await @@ -2648,19 +2638,7 @@ async fn test_upstairs_repair_notify_idempotent( .make_request( Method::POST, ¬ify_url, - Some(internal::nexus::RepairFinishInfo { - time: Utc::now(), - session_id, - repair_id, - repair_type: internal::nexus::UpstairsRepairType::Live, - repairs: vec![internal::nexus::DownstairsUnderRepair { - region_uuid: region_id, - target_addr: "[fd00:1122:3344:101::8]:12345" - .parse() - .unwrap(), - }], - aborted: false, - }), + Some(request), StatusCode::NO_CONTENT, ) .await From 6e8b63c7fec61cf6b665e9065b2bada9d1a4546e Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 12 Mar 2024 16:23:09 -0400 Subject: [PATCH 21/24] test_upstairs_notify_downstairs_client_stop --- .../integration_tests/volume_management.rs | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index d6e1434ac1..0a22262302 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -26,6 +26,7 @@ use omicron_common::api::external::Disk; use 
omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Name; use omicron_common::api::internal; +use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::DownstairsRegionKind; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; @@ -3153,3 +3154,81 @@ async fn test_upstairs_repair_reject_submit_progress_when_no_repair( .await .unwrap_err(); } + +/// Test that an Upstairs can notify Nexus when a Downstairs client task stops +#[nexus_test] +async fn test_upstairs_notify_downstairs_client_stop( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let downstairs_id: TypedUuid = TypedUuid::new_v4(); + + let stopped_url = format!( + "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped" + ); + + // Make sure an Upstairs can re-send the notification + + let request = internal::nexus::DownstairsClientStopped { + time: Utc::now(), + reason: + internal::nexus::DownstairsClientStopReason::TooManyOutstandingJobs, + }; + + int_client + .make_request( + Method::POST, + &stopped_url, + Some(request.clone()), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + &stopped_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // The client can stop for the same reason a different time + + let request = internal::nexus::DownstairsClientStopped { + time: Utc::now(), + reason: + internal::nexus::DownstairsClientStopReason::TooManyOutstandingJobs, + }; + + int_client + .make_request( + Method::POST, + &stopped_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // The client can also stop for a different reason + + let request = internal::nexus::DownstairsClientStopped { + time: Utc::now(), + reason: internal::nexus::DownstairsClientStopReason::IOError, + }; + + int_client + .make_request( + Method::POST, + 
&stopped_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} From a5410aadda50a5f66d9da6b3d210988a0b8cce06 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 12 Mar 2024 16:23:16 -0400 Subject: [PATCH 22/24] fmt --- .../integration_tests/volume_management.rs | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 0a22262302..cc4daf8c5d 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -2659,7 +2659,8 @@ async fn test_upstairs_repair_notify_different_finish_status( let repair_id: TypedUuid = TypedUuid::new_v4(); let region_id: TypedUuid = TypedUuid::new_v4(); - let notify_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); + let notify_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2720,8 +2721,10 @@ async fn test_upstairs_repair_same_upstairs_retry( // Simulate one failed repair - let notify_start_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); - let notify_finish_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); + let notify_start_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let notify_finish_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2828,8 +2831,10 @@ async fn test_upstairs_repair_different_upstairs_retry( // Simulate one failed repair by one Upstairs - let notify_start_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); - let notify_finish_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); + let notify_start_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let notify_finish_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -2938,8 +2943,10 @@ 
async fn test_upstairs_repair_different_upstairs_retry_interrupted( // Simulate one failed repair by one Upstairs, which was interrupted (which // leads to no finish message). - let notify_start_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); - let notify_finish_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); + let notify_start_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let notify_finish_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); int_client .make_request( @@ -3024,7 +3031,8 @@ async fn test_upstairs_repair_repair_id_and_type_conflict( let repair_id: TypedUuid = TypedUuid::new_v4(); let region_id: TypedUuid = TypedUuid::new_v4(); - let notify_start_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let notify_start_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); int_client .make_request( @@ -3084,7 +3092,8 @@ async fn test_upstairs_repair_submit_progress( // A repair must be started before progress can be submitted - let notify_start_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let notify_start_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); int_client .make_request( @@ -3107,8 +3116,9 @@ async fn test_upstairs_repair_submit_progress( .await .unwrap(); - let progress_url = - format!("/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress"); + let progress_url = format!( + "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress" + ); for i in 0..100 { int_client @@ -3137,8 +3147,9 @@ async fn test_upstairs_repair_reject_submit_progress_when_no_repair( let upstairs_id: TypedUuid = TypedUuid::new_v4(); let repair_id: TypedUuid = TypedUuid::new_v4(); - let progress_url = - format!("/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress"); + let progress_url = format!( + "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress" + ); int_client .make_request( From 
df79ff2dcbd54cd0186e09886078166211da28b0 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 13 Mar 2024 12:13:19 -0400 Subject: [PATCH 23/24] separate endpoints for stop request and stopped --- common/src/api/internal/nexus.rs | 24 ++++- nexus/db-model/src/downstairs.rs | 101 ++++++++++++++---- nexus/db-model/src/schema.rs | 13 ++- nexus/db-queries/src/db/datastore/volume.rs | 37 ++++++- nexus/db-queries/src/db/pool_connection.rs | 3 +- nexus/src/app/volume.rs | 32 +++++- nexus/src/internal_api/http_entrypoints.rs | 41 ++++++- .../integration_tests/volume_management.rs | 89 +++++++++++++-- openapi/nexus-internal.json | 85 ++++++++++++++- schema/crdb/41.0.0/up05.sql | 2 +- schema/crdb/41.0.0/up06.sql | 4 +- schema/crdb/41.0.0/up07.sql | 11 ++ schema/crdb/41.0.0/up08.sql | 8 ++ schema/crdb/dbinit.sql | 25 ++++- 14 files changed, 428 insertions(+), 47 deletions(-) create mode 100644 schema/crdb/41.0.0/up07.sql create mode 100644 schema/crdb/41.0.0/up08.sql diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 460ec2c0ce..24ef9a16aa 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -297,7 +297,7 @@ pub struct RepairProgress { #[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] #[serde(rename_all = "snake_case")] -pub enum DownstairsClientStopReason { +pub enum DownstairsClientStopRequestReason { Replacing, Disabled, FailedReconcile, @@ -309,8 +309,28 @@ pub enum DownstairsClientStopReason { Deactivated, } +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +pub struct DownstairsClientStopRequest { + pub time: DateTime, + pub reason: DownstairsClientStopRequestReason, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)] +#[serde(rename_all = "snake_case")] +pub enum DownstairsClientStoppedReason { + ConnectionTimeout, + ConnectionFailed, + Timeout, + WriteFailed, + ReadFailed, + RequestedStop, + Finished, + QueueClosed, + ReceiveTaskCancelled, +} + #[derive(Debug, 
Deserialize, Serialize, JsonSchema, Clone)] pub struct DownstairsClientStopped { pub time: DateTime, - pub reason: DownstairsClientStopReason, + pub reason: DownstairsClientStoppedReason, } diff --git a/nexus/db-model/src/downstairs.rs b/nexus/db-model/src/downstairs.rs index e75155be16..6aaeadc810 100644 --- a/nexus/db-model/src/downstairs.rs +++ b/nexus/db-model/src/downstairs.rs @@ -3,6 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::impl_enum_type; +use crate::schema::downstairs_client_stop_request_notification; use crate::schema::downstairs_client_stopped_notification; use crate::typed_uuid::DbTypedUuid; use chrono::{DateTime, Utc}; @@ -11,14 +12,16 @@ use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::UpstairsKind; use serde::{Deserialize, Serialize}; +// Types for stop request notification + impl_enum_type!( #[derive(SqlType, Debug, QueryId)] - #[diesel(postgres_type(name = "downstairs_client_stop_reason_type", schema = "public"))] - pub struct DownstairsClientStopReasonEnum; + #[diesel(postgres_type(name = "downstairs_client_stop_request_reason_type", schema = "public"))] + pub struct DownstairsClientStopRequestReasonEnum; #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] - #[diesel(sql_type = DownstairsClientStopReasonEnum)] - pub enum DownstairsClientStopReason; + #[diesel(sql_type = DownstairsClientStopRequestReasonEnum)] + pub enum DownstairsClientStopRequestReason; // Reason types Replacing => b"replacing" @@ -32,27 +35,87 @@ impl_enum_type!( Deactivated => b"deactivated" ); -impl From - for DownstairsClientStopReason +impl From + for DownstairsClientStopRequestReason +{ + fn from( + v: internal::nexus::DownstairsClientStopRequestReason, + ) -> DownstairsClientStopRequestReason { + match v { + internal::nexus::DownstairsClientStopRequestReason::Replacing => DownstairsClientStopRequestReason::Replacing, + internal::nexus::DownstairsClientStopRequestReason::Disabled => 
DownstairsClientStopRequestReason::Disabled, + internal::nexus::DownstairsClientStopRequestReason::FailedReconcile => DownstairsClientStopRequestReason::FailedReconcile, + internal::nexus::DownstairsClientStopRequestReason::IOError => DownstairsClientStopRequestReason::IOError, + internal::nexus::DownstairsClientStopRequestReason::BadNegotiationOrder => DownstairsClientStopRequestReason::BadNegotiationOrder, + internal::nexus::DownstairsClientStopRequestReason::Incompatible => DownstairsClientStopRequestReason::Incompatible, + internal::nexus::DownstairsClientStopRequestReason::FailedLiveRepair => DownstairsClientStopRequestReason::FailedLiveRepair, + internal::nexus::DownstairsClientStopRequestReason::TooManyOutstandingJobs => DownstairsClientStopRequestReason::TooManyOutstandingJobs, + internal::nexus::DownstairsClientStopRequestReason::Deactivated => DownstairsClientStopRequestReason::Deactivated, + } + } +} + +/// A Record of when an Upstairs requested a Downstairs client task stop +#[derive(Queryable, Insertable, Debug, Clone, Selectable)] +#[diesel(table_name = downstairs_client_stop_request_notification)] +pub struct DownstairsClientStopRequestNotification { + // Importantly, this is client time, not Nexus' time that it received the + // notification. + pub time: DateTime, + + // Which Upstairs sent this notification? + pub upstairs_id: DbTypedUuid, + + // Which Downstairs client was requested to stop? 
+ pub downstairs_id: DbTypedUuid, + + pub reason: DownstairsClientStopRequestReason, +} + +// Types for stopped notification + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "downstairs_client_stopped_reason_type", schema = "public"))] + pub struct DownstairsClientStoppedReasonEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = DownstairsClientStoppedReasonEnum)] + pub enum DownstairsClientStoppedReason; + + // Reason types + ConnectionTimeout => b"connection_timeout" + ConnectionFailed => b"connection_failed" + Timeout => b"timeout" + WriteFailed => b"write_failed" + ReadFailed => b"read_failed" + RequestedStop => b"requested_stop" + Finished => b"finished" + QueueClosed => b"queue_closed" + ReceiveTaskCancelled => b"receive_task_cancelled" +); + +impl From + for DownstairsClientStoppedReason { fn from( - v: internal::nexus::DownstairsClientStopReason, - ) -> DownstairsClientStopReason { + v: internal::nexus::DownstairsClientStoppedReason, + ) -> DownstairsClientStoppedReason { match v { - internal::nexus::DownstairsClientStopReason::Replacing => DownstairsClientStopReason::Replacing, - internal::nexus::DownstairsClientStopReason::Disabled => DownstairsClientStopReason::Disabled, - internal::nexus::DownstairsClientStopReason::FailedReconcile => DownstairsClientStopReason::FailedReconcile, - internal::nexus::DownstairsClientStopReason::IOError => DownstairsClientStopReason::IOError, - internal::nexus::DownstairsClientStopReason::BadNegotiationOrder => DownstairsClientStopReason::BadNegotiationOrder, - internal::nexus::DownstairsClientStopReason::Incompatible => DownstairsClientStopReason::Incompatible, - internal::nexus::DownstairsClientStopReason::FailedLiveRepair => DownstairsClientStopReason::FailedLiveRepair, - internal::nexus::DownstairsClientStopReason::TooManyOutstandingJobs => DownstairsClientStopReason::TooManyOutstandingJobs, - 
internal::nexus::DownstairsClientStopReason::Deactivated => DownstairsClientStopReason::Deactivated, + internal::nexus::DownstairsClientStoppedReason::ConnectionTimeout => DownstairsClientStoppedReason::ConnectionTimeout, + internal::nexus::DownstairsClientStoppedReason::ConnectionFailed => DownstairsClientStoppedReason::ConnectionFailed, + internal::nexus::DownstairsClientStoppedReason::Timeout => DownstairsClientStoppedReason::Timeout, + internal::nexus::DownstairsClientStoppedReason::WriteFailed => DownstairsClientStoppedReason::WriteFailed, + internal::nexus::DownstairsClientStoppedReason::ReadFailed => DownstairsClientStoppedReason::ReadFailed, + internal::nexus::DownstairsClientStoppedReason::RequestedStop => DownstairsClientStoppedReason::RequestedStop, + internal::nexus::DownstairsClientStoppedReason::Finished => DownstairsClientStoppedReason::Finished, + internal::nexus::DownstairsClientStoppedReason::QueueClosed => DownstairsClientStoppedReason::QueueClosed, + internal::nexus::DownstairsClientStoppedReason::ReceiveTaskCancelled => DownstairsClientStoppedReason::ReceiveTaskCancelled, } } } -/// A Record of when an Upstairs stopped a Downstairs client task +/// A Record of when a Downstairs client task stopped #[derive(Queryable, Insertable, Debug, Clone, Selectable)] #[diesel(table_name = downstairs_client_stopped_notification)] pub struct DownstairsClientStoppedNotification { @@ -66,5 +129,5 @@ pub struct DownstairsClientStoppedNotification { // Which Downstairs client was stopped? pub downstairs_id: DbTypedUuid, - pub reason: DownstairsClientStopReason, + pub reason: DownstairsClientStoppedReason, } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index a6c2b3afb8..f539f808f9 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1561,11 +1561,20 @@ table! { } table! 
{ - downstairs_client_stopped_notification (downstairs_id, time, reason) { + downstairs_client_stop_request_notification (time, upstairs_id, downstairs_id, reason) { time -> Timestamptz, upstairs_id -> Uuid, downstairs_id -> Uuid, - reason -> crate::DownstairsClientStopReasonEnum, + reason -> crate::DownstairsClientStopRequestReasonEnum, + } +} + +table! { + downstairs_client_stopped_notification (time, upstairs_id, downstairs_id, reason) { + time -> Timestamptz, + upstairs_id -> Uuid, + downstairs_id -> Uuid, + reason -> crate::DownstairsClientStoppedReasonEnum, } } diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 0c6ba6e475..a9646b9ef6 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -11,6 +11,7 @@ use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::Dataset; +use crate::db::model::DownstairsClientStopRequestNotification; use crate::db::model::DownstairsClientStoppedNotification; use crate::db::model::Region; use crate::db::model::RegionSnapshot; @@ -30,6 +31,7 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::ResourceType; +use omicron_common::api::internal::nexus::DownstairsClientStopRequest; use omicron_common::api::internal::nexus::DownstairsClientStopped; use omicron_common::api::internal::nexus::RepairProgress; use omicron_uuid_kinds::DownstairsKind; @@ -991,8 +993,41 @@ impl DataStore { }) } + /// Record when a Downstairs client is requested to stop, and why + pub async fn downstairs_client_stop_request_notification( + &self, + opctx: &OpContext, + upstairs_id: TypedUuid, + downstairs_id: TypedUuid, + downstairs_client_stop_request: DownstairsClientStopRequest, + ) -> Result<(), Error> { + use 
db::schema::downstairs_client_stop_request_notification::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + + diesel::insert_into(dsl::downstairs_client_stop_request_notification) + .values(DownstairsClientStopRequestNotification { + time: downstairs_client_stop_request.time, + upstairs_id: upstairs_id.into(), + downstairs_id: downstairs_id.into(), + reason: downstairs_client_stop_request.reason.into(), + }) + .on_conflict(( + dsl::time, + dsl::upstairs_id, + dsl::downstairs_id, + dsl::reason, + )) + .do_nothing() + .execute_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } + /// Record when a Downstairs client is stopped, and why - pub async fn downstairs_stopped_notification( + pub async fn downstairs_client_stopped_notification( &self, opctx: &OpContext, upstairs_id: TypedUuid, diff --git a/nexus/db-queries/src/db/pool_connection.rs b/nexus/db-queries/src/db/pool_connection.rs index 7e65d42b50..0331a3a103 100644 --- a/nexus/db-queries/src/db/pool_connection.rs +++ b/nexus/db-queries/src/db/pool_connection.rs @@ -44,7 +44,8 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[ "caboose_which", "dataset_kind", "dns_group", - "downstairs_client_stop_reason_type", + "downstairs_client_stop_request_reason_type", + "downstairs_client_stopped_reason_type", "hw_power_state", "hw_rot_slot", "identity_type", diff --git a/nexus/src/app/volume.rs b/nexus/src/app/volume.rs index fd3f1d453f..8cfffdb686 100644 --- a/nexus/src/app/volume.rs +++ b/nexus/src/app/volume.rs @@ -10,6 +10,7 @@ use nexus_db_model::UpstairsRepairNotificationType; use nexus_db_queries::authn; use nexus_db_queries::context::OpContext; use omicron_common::api::external::DeleteResult; +use omicron_common::api::internal::nexus::DownstairsClientStopRequest; use omicron_common::api::internal::nexus::DownstairsClientStopped; use omicron_common::api::internal::nexus::RepairFinishInfo; use 
omicron_common::api::internal::nexus::RepairProgress; @@ -143,8 +144,33 @@ impl super::Nexus { .await } + /// An Upstairs is telling us that a Downstairs client task was requested to + /// stop + pub(crate) async fn downstairs_client_stop_request_notification( + self: &Arc, + opctx: &OpContext, + upstairs_id: TypedUuid, + downstairs_id: TypedUuid, + downstairs_client_stop_request: DownstairsClientStopRequest, + ) -> DeleteResult { + info!( + self.log, + "received downstairs_client_stop_request_notification from upstairs {upstairs_id} for downstairs {downstairs_id}: {:?}", + downstairs_client_stop_request, + ); + + self.db_datastore + .downstairs_client_stop_request_notification( + opctx, + upstairs_id, + downstairs_id, + downstairs_client_stop_request, + ) + .await + } + /// An Upstairs is telling us that a Downstairs client task was stopped - pub(crate) async fn downstairs_stopped_notification( + pub(crate) async fn downstairs_client_stopped_notification( self: &Arc, opctx: &OpContext, upstairs_id: TypedUuid, @@ -153,12 +179,12 @@ impl super::Nexus { ) -> DeleteResult { info!( self.log, - "received downstairs_stopped_notification from upstairs {upstairs_id} for downstairs {downstairs_id}: {:?}", + "received downstairs_client_stopped_notification from upstairs {upstairs_id} for downstairs {downstairs_id}: {:?}", downstairs_client_stopped, ); self.db_datastore - .downstairs_stopped_notification( + .downstairs_client_stopped_notification( opctx, upstairs_id, downstairs_id, diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 23f8db117c..b46b0d7b76 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -43,6 +43,7 @@ use omicron_common::api::external::http_pagination::ScanById; use omicron_common::api::external::http_pagination::ScanParams; use omicron_common::api::external::Error; use omicron_common::api::internal::nexus::DiskRuntimeState; +use 
omicron_common::api::internal::nexus::DownstairsClientStopRequest; use omicron_common::api::internal::nexus::DownstairsClientStopped; use omicron_common::api::internal::nexus::ProducerEndpoint; use omicron_common::api::internal::nexus::RepairFinishInfo; @@ -87,7 +88,8 @@ pub(crate) fn internal_api() -> NexusApiDescription { api.register(cpapi_upstairs_repair_start)?; api.register(cpapi_upstairs_repair_finish)?; api.register(cpapi_upstairs_repair_progress)?; - api.register(cpapi_downstairs_stopped)?; + api.register(cpapi_downstairs_client_stop_request)?; + api.register(cpapi_downstairs_client_stopped)?; api.register(saga_list)?; api.register(saga_view)?; @@ -627,12 +629,43 @@ struct UpstairsDownstairsPathParam { downstairs_id: TypedUuid, } -/// An Upstairs will update this endpoint if a Downstairs client task is stopped +/// An Upstairs will update this endpoint if a Downstairs client task is +/// requested to stop +#[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stop-request", + }] +async fn cpapi_downstairs_client_stop_request( + rqctx: RequestContext>, + path_params: Path, + downstairs_client_stop_request: TypedBody, +) -> Result { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .downstairs_client_stop_request_notification( + &opctx, + path.upstairs_id, + path.downstairs_id, + downstairs_client_stop_request.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// An Upstairs will update this endpoint if a Downstairs client task stops for +/// any reason (not just after being requested to) #[endpoint { method = POST, path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped", }] -async fn cpapi_downstairs_stopped( +async fn 
cpapi_downstairs_client_stopped( rqctx: RequestContext>, path_params: Path, downstairs_client_stopped: TypedBody, @@ -644,7 +677,7 @@ async fn cpapi_downstairs_stopped( let handler = async { let opctx = crate::context::op_context_for_internal_api(&rqctx).await; nexus - .downstairs_stopped_notification( + .downstairs_client_stopped_notification( &opctx, path.upstairs_id, path.downstairs_id, diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index cc4daf8c5d..289446fe85 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -3166,9 +3166,88 @@ async fn test_upstairs_repair_reject_submit_progress_when_no_repair( .unwrap_err(); } +/// Test that an Upstairs can notify Nexus when a Downstairs client task is +/// requested to stop +#[nexus_test] +async fn test_upstairs_notify_downstairs_client_stop_request( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid = TypedUuid::new_v4(); + let downstairs_id: TypedUuid = TypedUuid::new_v4(); + + let stop_request_url = format!( + "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stop-request" + ); + + // Make sure an Upstairs can re-send the notification + + let request = internal::nexus::DownstairsClientStopRequest { + time: Utc::now(), + reason: + internal::nexus::DownstairsClientStopRequestReason::TooManyOutstandingJobs, + }; + + int_client + .make_request( + Method::POST, + &stop_request_url, + Some(request.clone()), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + &stop_request_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // The client can be requested to stop for the same reason a different time + + let request = internal::nexus::DownstairsClientStopRequest { + time: Utc::now(), + reason: + 
internal::nexus::DownstairsClientStopRequestReason::TooManyOutstandingJobs, + }; + + int_client + .make_request( + Method::POST, + &stop_request_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // The client can also be requested to stop for a different reason + + let request = internal::nexus::DownstairsClientStopRequest { + time: Utc::now(), + reason: internal::nexus::DownstairsClientStopRequestReason::IOError, + }; + + int_client + .make_request( + Method::POST, + &stop_request_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} + /// Test that an Upstairs can notify Nexus when a Downstairs client task stops #[nexus_test] -async fn test_upstairs_notify_downstairs_client_stop( +async fn test_upstairs_notify_downstairs_client_stops( cptestctx: &ControlPlaneTestContext, ) { let int_client = &cptestctx.internal_client; @@ -3184,8 +3263,7 @@ async fn test_upstairs_notify_downstairs_client_stop( let request = internal::nexus::DownstairsClientStopped { time: Utc::now(), - reason: - internal::nexus::DownstairsClientStopReason::TooManyOutstandingJobs, + reason: internal::nexus::DownstairsClientStoppedReason::ReadFailed, }; int_client @@ -3212,8 +3290,7 @@ async fn test_upstairs_notify_downstairs_client_stop( let request = internal::nexus::DownstairsClientStopped { time: Utc::now(), - reason: - internal::nexus::DownstairsClientStopReason::TooManyOutstandingJobs, + reason: internal::nexus::DownstairsClientStoppedReason::ReadFailed, }; int_client @@ -3230,7 +3307,7 @@ async fn test_upstairs_notify_downstairs_client_stop( let request = internal::nexus::DownstairsClientStopped { time: Utc::now(), - reason: internal::nexus::DownstairsClientStopReason::IOError, + reason: internal::nexus::DownstairsClientStoppedReason::Timeout, }; int_client diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 95bc3b07c4..9513f9ee58 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -125,10 
+125,57 @@ } } }, + "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stop-request": { + "post": { + "summary": "An Upstairs will update this endpoint if a Downstairs client task is", + "description": "requested to stop", + "operationId": "cpapi_downstairs_client_stop_request", + "parameters": [ + { + "in": "path", + "name": "downstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForDownstairsKind" + } + }, + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DownstairsClientStopRequest" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped": { "post": { - "summary": "An Upstairs will update this endpoint if a Downstairs client task is stopped", - "operationId": "cpapi_downstairs_stopped", + "summary": "An Upstairs will update this endpoint if a Downstairs client task stops for", + "description": "any reason (not just after being requested to)", + "operationId": "cpapi_downstairs_client_stopped", "parameters": [ { "in": "path", @@ -3726,7 +3773,23 @@ } ] }, - "DownstairsClientStopReason": { + "DownstairsClientStopRequest": { + "type": "object", + "properties": { + "reason": { + "$ref": "#/components/schemas/DownstairsClientStopRequestReason" + }, + "time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "reason", + "time" + ] + }, + "DownstairsClientStopRequestReason": { "type": "string", "enum": [ "replacing", @@ -3744,7 +3807,7 @@ "type": "object", "properties": { "reason": { - "$ref": "#/components/schemas/DownstairsClientStopReason" + "$ref": 
"#/components/schemas/DownstairsClientStoppedReason" }, "time": { "type": "string", @@ -3756,6 +3819,20 @@ "time" ] }, + "DownstairsClientStoppedReason": { + "type": "string", + "enum": [ + "connection_timeout", + "connection_failed", + "timeout", + "write_failed", + "read_failed", + "requested_stop", + "finished", + "queue_closed", + "receive_task_cancelled" + ] + }, "DownstairsUnderRepair": { "type": "object", "properties": { diff --git a/schema/crdb/41.0.0/up05.sql b/schema/crdb/41.0.0/up05.sql index 59679a453f..16fb91d410 100644 --- a/schema/crdb/41.0.0/up05.sql +++ b/schema/crdb/41.0.0/up05.sql @@ -1,4 +1,4 @@ -CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_reason_type AS ENUM ( +CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_request_reason_type AS ENUM ( 'replacing', 'disabled', 'failed_reconcile', diff --git a/schema/crdb/41.0.0/up06.sql b/schema/crdb/41.0.0/up06.sql index 4e2b99ec96..32b5ea6dd1 100644 --- a/schema/crdb/41.0.0/up06.sql +++ b/schema/crdb/41.0.0/up06.sql @@ -1,8 +1,8 @@ -CREATE TABLE IF NOT EXISTS downstairs_client_stopped_notification ( +CREATE TABLE IF NOT EXISTS downstairs_client_stop_request_notification ( time TIMESTAMPTZ NOT NULL, upstairs_id UUID NOT NULL, downstairs_id UUID NOT NULL, - reason omicron.public.downstairs_client_stop_reason_type NOT NULL, + reason omicron.public.downstairs_client_stop_request_reason_type NOT NULL, PRIMARY KEY (time, upstairs_id, downstairs_id, reason) ); diff --git a/schema/crdb/41.0.0/up07.sql b/schema/crdb/41.0.0/up07.sql new file mode 100644 index 0000000000..cb8903b1d3 --- /dev/null +++ b/schema/crdb/41.0.0/up07.sql @@ -0,0 +1,11 @@ +CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stopped_reason_type AS ENUM ( + 'connection_timeout', + 'connection_failed', + 'timeout', + 'write_failed', + 'read_failed', + 'requested_stop', + 'finished', + 'queue_closed', + 'receive_task_cancelled' +); diff --git a/schema/crdb/41.0.0/up08.sql b/schema/crdb/41.0.0/up08.sql new 
file mode 100644 index 0000000000..01dc9abf8c --- /dev/null +++ b/schema/crdb/41.0.0/up08.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS downstairs_client_stopped_notification ( + time TIMESTAMPTZ NOT NULL, + upstairs_id UUID NOT NULL, + downstairs_id UUID NOT NULL, + reason omicron.public.downstairs_client_stopped_reason_type NOT NULL, + + PRIMARY KEY (time, upstairs_id, downstairs_id, reason) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 2f91bc5604..c2389c5709 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3610,7 +3610,7 @@ CREATE TABLE IF NOT EXISTS upstairs_repair_progress ( PRIMARY KEY (repair_id, time, current_item, total_items) ); -CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_reason_type AS ENUM ( +CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_request_reason_type AS ENUM ( 'replacing', 'disabled', 'failed_reconcile', @@ -3622,11 +3622,32 @@ CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_reason_type AS E 'deactivated' ); +CREATE TABLE IF NOT EXISTS downstairs_client_stop_request_notification ( + time TIMESTAMPTZ NOT NULL, + upstairs_id UUID NOT NULL, + downstairs_id UUID NOT NULL, + reason omicron.public.downstairs_client_stop_request_reason_type NOT NULL, + + PRIMARY KEY (time, upstairs_id, downstairs_id, reason) +); + +CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stopped_reason_type AS ENUM ( + 'connection_timeout', + 'connection_failed', + 'timeout', + 'write_failed', + 'read_failed', + 'requested_stop', + 'finished', + 'queue_closed', + 'receive_task_cancelled' +); + CREATE TABLE IF NOT EXISTS downstairs_client_stopped_notification ( time TIMESTAMPTZ NOT NULL, upstairs_id UUID NOT NULL, downstairs_id UUID NOT NULL, - reason omicron.public.downstairs_client_stop_reason_type NOT NULL, + reason omicron.public.downstairs_client_stopped_reason_type NOT NULL, PRIMARY KEY (time, upstairs_id, downstairs_id, reason) ); From 
737d18ff4170b671774a6aba8dc5057760cefd42 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 14 Mar 2024 10:48:53 -0400 Subject: [PATCH 24/24] missing omicron.public. prefix --- schema/crdb/41.0.0/up03.sql | 2 +- schema/crdb/41.0.0/up04.sql | 2 +- schema/crdb/41.0.0/up06.sql | 2 +- schema/crdb/41.0.0/up08.sql | 2 +- schema/crdb/dbinit.sql | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/schema/crdb/41.0.0/up03.sql b/schema/crdb/41.0.0/up03.sql index 3affc24900..a33c83c1ab 100644 --- a/schema/crdb/41.0.0/up03.sql +++ b/schema/crdb/41.0.0/up03.sql @@ -1,4 +1,4 @@ -CREATE TABLE IF NOT EXISTS upstairs_repair_notification ( +CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_notification ( time TIMESTAMPTZ NOT NULL, repair_id UUID NOT NULL, diff --git a/schema/crdb/41.0.0/up04.sql b/schema/crdb/41.0.0/up04.sql index c01ee8152d..ed51c15a1e 100644 --- a/schema/crdb/41.0.0/up04.sql +++ b/schema/crdb/41.0.0/up04.sql @@ -1,4 +1,4 @@ -CREATE TABLE IF NOT EXISTS upstairs_repair_progress ( +CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_progress ( repair_id UUID NOT NULL, time TIMESTAMPTZ NOT NULL, current_item INT8 NOT NULL, diff --git a/schema/crdb/41.0.0/up06.sql b/schema/crdb/41.0.0/up06.sql index 32b5ea6dd1..2cf45ce101 100644 --- a/schema/crdb/41.0.0/up06.sql +++ b/schema/crdb/41.0.0/up06.sql @@ -1,4 +1,4 @@ -CREATE TABLE IF NOT EXISTS downstairs_client_stop_request_notification ( +CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stop_request_notification ( time TIMESTAMPTZ NOT NULL, upstairs_id UUID NOT NULL, downstairs_id UUID NOT NULL, diff --git a/schema/crdb/41.0.0/up08.sql b/schema/crdb/41.0.0/up08.sql index 01dc9abf8c..6f898f382c 100644 --- a/schema/crdb/41.0.0/up08.sql +++ b/schema/crdb/41.0.0/up08.sql @@ -1,4 +1,4 @@ -CREATE TABLE IF NOT EXISTS downstairs_client_stopped_notification ( +CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stopped_notification ( time TIMESTAMPTZ NOT NULL, upstairs_id 
UUID NOT NULL, downstairs_id UUID NOT NULL, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index c2389c5709..b36ac7bde1 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3579,7 +3579,7 @@ CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_type AS ENUM ( 'reconciliation' ); -CREATE TABLE IF NOT EXISTS upstairs_repair_notification ( +CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_notification ( time TIMESTAMPTZ NOT NULL, repair_id UUID NOT NULL, @@ -3601,7 +3601,7 @@ CREATE TABLE IF NOT EXISTS upstairs_repair_notification ( PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type) ); -CREATE TABLE IF NOT EXISTS upstairs_repair_progress ( +CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_progress ( repair_id UUID NOT NULL, time TIMESTAMPTZ NOT NULL, current_item INT8 NOT NULL, @@ -3622,7 +3622,7 @@ CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_request_reason_t 'deactivated' ); -CREATE TABLE IF NOT EXISTS downstairs_client_stop_request_notification ( +CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stop_request_notification ( time TIMESTAMPTZ NOT NULL, upstairs_id UUID NOT NULL, downstairs_id UUID NOT NULL, @@ -3643,7 +3643,7 @@ CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stopped_reason_type A 'receive_task_cancelled' ); -CREATE TABLE IF NOT EXISTS downstairs_client_stopped_notification ( +CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stopped_notification ( time TIMESTAMPTZ NOT NULL, upstairs_id UUID NOT NULL, downstairs_id UUID NOT NULL,