From 67cd482cd4f6f15ed3a9b42ba7eed10c57199b84 Mon Sep 17 00:00:00 2001 From: Rain Date: Tue, 28 Nov 2023 23:57:21 -0800 Subject: [PATCH] [nexus] add sled provision state (#4520) Add the notion of a sled provision state to Nexus. Currently, we will only use this to prevent new resources and regions from being provisioned to sleds. This PR includes: 1. Database updates and schema migrations. 2. Database APIs in `nexus-db-queries`. 3. An HTTP API. 4. Tests for resource and region allocation. --- Cargo.lock | 6 +- nexus/db-model/Cargo.toml | 1 + nexus/db-model/src/lib.rs | 9 +- .../db-model/src/queries/region_allocation.rs | 2 + nexus/db-model/src/schema.rs | 3 +- nexus/db-model/src/sled.rs | 11 +- nexus/db-model/src/sled_provision_state.rs | 58 ++++++ nexus/db-queries/src/db/datastore/mod.rs | 86 ++++++++- nexus/db-queries/src/db/datastore/sled.rs | 171 ++++++++++++++++-- .../src/db/queries/region_allocation.rs | 10 +- nexus/src/app/sled.rs | 15 ++ nexus/src/external_api/http_entrypoints.rs | 42 +++++ nexus/tests/integration_tests/endpoints.rs | 15 ++ nexus/tests/integration_tests/schema.rs | 12 +- nexus/tests/output/nexus_tags.txt | 1 + nexus/types/Cargo.toml | 1 + nexus/types/src/external_api/params.rs | 17 ++ nexus/types/src/external_api/views.rs | 27 +++ openapi/nexus.json | 127 +++++++++++++ schema/crdb/15.0.0/up1.sql | 6 + schema/crdb/15.0.0/up2.sql | 3 + schema/crdb/15.0.0/up3.sql | 5 + schema/crdb/dbinit.sql | 12 +- 23 files changed, 607 insertions(+), 33 deletions(-) create mode 100644 nexus/db-model/src/sled_provision_state.rs create mode 100644 schema/crdb/15.0.0/up1.sql create mode 100644 schema/crdb/15.0.0/up2.sql create mode 100644 schema/crdb/15.0.0/up3.sql diff --git a/Cargo.lock b/Cargo.lock index 108c8b182d..532fcde59f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1671,9 +1671,9 @@ dependencies = [ [[package]] name = "diesel_derives" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e054665eaf6d97d1e7125512bb2d35d07c73ac86cc6920174cb42d1ab697a554" +checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44" dependencies = [ "diesel_table_macro_syntax", "proc-macro2", @@ -3993,6 +3993,7 @@ dependencies = [ "sled-agent-client", "steno", "strum", + "thiserror", "uuid", ] @@ -4178,6 +4179,7 @@ dependencies = [ "schemars", "serde", "serde_json", + "serde_with", "steno", "strum", "uuid", diff --git a/nexus/db-model/Cargo.toml b/nexus/db-model/Cargo.toml index b7514c4806..477ce7d11f 100644 --- a/nexus/db-model/Cargo.toml +++ b/nexus/db-model/Cargo.toml @@ -26,6 +26,7 @@ serde.workspace = true serde_json.workspace = true steno.workspace = true strum.workspace = true +thiserror.workspace = true uuid.workspace = true db-macros.workspace = true diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index ac5bad26f8..43bf83fd34 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -70,6 +70,7 @@ mod silo_user; mod silo_user_password_hash; mod sled; mod sled_instance; +mod sled_provision_state; mod sled_resource; mod sled_resource_kind; mod sled_underlay_subnet_allocation; @@ -152,6 +153,7 @@ pub use silo_user::*; pub use silo_user_password_hash::*; pub use sled::*; pub use sled_instance::*; +pub use sled_provision_state::*; pub use sled_resource::*; pub use sled_resource_kind::*; pub use sled_underlay_subnet_allocation::*; @@ -287,10 +289,9 @@ macro_rules! impl_enum_type { Ok($model_type::$enum_item) } )* - _ => { - Err(concat!("Unrecognized enum variant for ", - stringify!{$model_type}) - .into()) + other => { + let s = concat!("Unrecognized enum variant for ", stringify!{$model_type}); + Err(format!("{}: (raw bytes: {:?})", s, other).into()) } } } diff --git a/nexus/db-model/src/queries/region_allocation.rs b/nexus/db-model/src/queries/region_allocation.rs index 2025e79fb8..a1b9e0373a 100644 --- a/nexus/db-model/src/queries/region_allocation.rs +++ b/nexus/db-model/src/queries/region_allocation.rs @@ -23,6 +23,7 @@ // a CTE (where we want the alias name to come first). use crate::schema::dataset; +use crate::schema::sled; use crate::schema::zpool; table! { @@ -157,6 +158,7 @@ diesel::allow_tables_to_appear_in_same_query!( diesel::allow_tables_to_appear_in_same_query!( old_zpool_usage, zpool, + sled, proposed_dataset_changes, ); diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index afeac5e6cd..6527da3637 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -741,6 +741,7 @@ table! { ip -> Inet, port -> Int4, last_used_address -> Inet, + provision_state -> crate::SledProvisionStateEnum, } } @@ -1299,7 +1300,7 @@ table! { /// /// This should be updated whenever the schema is changed. For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(14, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(15, 0, 0); allow_tables_to_appear_in_same_query!( system_update, diff --git a/nexus/db-model/src/sled.rs b/nexus/db-model/src/sled.rs index 4c82aa5d23..0f6d1b911e 100644 --- a/nexus/db-model/src/sled.rs +++ b/nexus/db-model/src/sled.rs @@ -4,8 +4,8 @@ use super::{ByteCount, Generation, SqlU16, SqlU32}; use crate::collection::DatastoreCollectionConfig; -use crate::ipv6; use crate::schema::{physical_disk, service, sled, zpool}; +use crate::{ipv6, SledProvisionState}; use chrono::{DateTime, Utc}; use db_macros::Asset; use nexus_types::{external_api::shared, external_api::views, identity::Asset}; @@ -59,6 +59,8 @@ pub struct Sled { /// The last IP address provided to an Oxide service on this sled pub last_used_address: ipv6::Ipv6Addr, + + provision_state: SledProvisionState, } impl Sled { @@ -81,6 +83,10 @@ impl Sled { pub fn serial_number(&self) -> &str { &self.serial_number } + + pub fn provision_state(&self) -> SledProvisionState { + self.provision_state + } } impl From for views::Sled { @@ -93,6 +99,7 @@ impl From for views::Sled { part: sled.part_number, revision: sled.revision, }, + provision_state: sled.provision_state.into(), usable_hardware_threads: sled.usable_hardware_threads.0, usable_physical_ram: *sled.usable_physical_ram, } @@ -188,6 +195,8 @@ impl SledUpdate { serial_number: self.serial_number, part_number: self.part_number, revision: self.revision, + // By default, sleds start as provisionable. + provision_state: SledProvisionState::Provisionable, usable_hardware_threads: self.usable_hardware_threads, usable_physical_ram: self.usable_physical_ram, reservoir_size: self.reservoir_size, diff --git a/nexus/db-model/src/sled_provision_state.rs b/nexus/db-model/src/sled_provision_state.rs new file mode 100644 index 0000000000..6cf81b9c70 --- /dev/null +++ b/nexus/db-model/src/sled_provision_state.rs @@ -0,0 +1,58 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use nexus_types::external_api::views; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +impl_enum_type!( + #[derive(Clone, SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "sled_provision_state"))] + pub struct SledProvisionStateEnum; + + #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = SledProvisionStateEnum)] + pub enum SledProvisionState; + + // Enum values + Provisionable => b"provisionable" + NonProvisionable => b"non_provisionable" +); + +impl From for views::SledProvisionState { + fn from(state: SledProvisionState) -> Self { + match state { + SledProvisionState::Provisionable => { + views::SledProvisionState::Provisionable + } + SledProvisionState::NonProvisionable => { + views::SledProvisionState::NonProvisionable + } + } + } +} + +impl TryFrom for SledProvisionState { + type Error = UnknownSledProvisionState; + + fn try_from(state: views::SledProvisionState) -> Result { + match state { + views::SledProvisionState::Provisionable => { + Ok(SledProvisionState::Provisionable) + } + views::SledProvisionState::NonProvisionable => { + Ok(SledProvisionState::NonProvisionable) + } + views::SledProvisionState::Unknown => { + Err(UnknownSledProvisionState) + } + } + } +} + +/// An unknown [`views::SledProvisionState`] was encountered. +#[derive(Clone, Debug, Error)] +#[error("Unknown SledProvisionState")] +pub struct UnknownSledProvisionState; diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 0612b960c9..44cd7a95b7 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -372,8 +372,8 @@ mod test { BlockSize, ComponentUpdate, ComponentUpdateIdentity, ConsoleSession, Dataset, DatasetKind, ExternalIp, PhysicalDisk, PhysicalDiskKind, Project, Rack, Region, Service, ServiceKind, SiloUser, SledBaseboard, - SledSystemHardware, SledUpdate, SshKey, SystemUpdate, - UpdateableComponentType, VpcSubnet, Zpool, + SledProvisionState, SledSystemHardware, SledUpdate, SshKey, + SystemUpdate, UpdateableComponentType, VpcSubnet, Zpool, }; use crate::db::queries::vpc_subnet::FilterConflictingVpcSubnetRangesQuery; use assert_matches::assert_matches; @@ -610,6 +610,35 @@ mod test { sled_id } + // Marks a sled as non-provisionable. + async fn mark_sled_non_provisionable( + datastore: &DataStore, + opctx: &OpContext, + sled_id: Uuid, + ) { + let (authz_sled, sled) = LookupPath::new(opctx, datastore) + .sled_id(sled_id) + .fetch_for(authz::Action::Modify) + .await + .unwrap(); + println!("sled: {:?}", sled); + let old_state = datastore + .sled_set_provision_state( + &opctx, + &authz_sled, + SledProvisionState::NonProvisionable, + ) + .await + .unwrap_or_else(|error| { + panic!( + "error marking sled {sled_id} as non-provisionable: {error}" + ) + }); + // The old state should always be provisionable since that's where we + // start. + assert_eq!(old_state, SledProvisionState::Provisionable); + } + fn test_zpool_size() -> ByteCount { ByteCount::from_gibibytes_u32(100) } @@ -770,13 +799,24 @@ mod test { let logctx = dev::test_setup_log("test_region_allocation_strat_random"); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; - create_test_datasets_for_region_allocation( + let test_datasets = create_test_datasets_for_region_allocation( &opctx, datastore.clone(), + // Even though we're going to mark one sled as non-provisionable to + // test that logic, we aren't forcing the datasets to be on + // distinct sleds, so REGION_REDUNDANCY_THRESHOLD is enough. REGION_REDUNDANCY_THRESHOLD, ) .await; + let non_provisionable_dataset_id = test_datasets[0].dataset_id; + mark_sled_non_provisionable( + &datastore, + &opctx, + test_datasets[0].sled_id, + ) + .await; + // Allocate regions from the datasets for this disk. Do it a few times // for good measure. for alloc_seed in 0..10 { @@ -809,6 +849,9 @@ mod test { // Must be 3 unique datasets assert!(disk_datasets.insert(dataset.id())); + // Dataset must not be non-provisionable. + assert_ne!(dataset.id(), non_provisionable_dataset_id); + // Must be 3 unique zpools assert!(disk_zpools.insert(dataset.pool_id)); @@ -837,12 +880,23 @@ mod test { let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; - // Create a rack without enough sleds for a successful allocation when - // we require 3 distinct sleds. + // Create a rack with enough sleds for a successful allocation when we + // require 3 distinct provisionable sleds. let test_datasets = create_test_datasets_for_region_allocation( &opctx, datastore.clone(), - REGION_REDUNDANCY_THRESHOLD, + // We're going to mark one sled as non-provisionable to test that + // logic, and we *are* forcing the datasets to be on distinct + // sleds: hence threshold + 1. + REGION_REDUNDANCY_THRESHOLD + 1, + ) + .await; + + let non_provisionable_dataset_id = test_datasets[0].dataset_id; + mark_sled_non_provisionable( + &datastore, + &opctx, + test_datasets[0].sled_id, ) .await; @@ -884,6 +938,9 @@ mod test { // Must be 3 unique datasets assert!(disk_datasets.insert(dataset.id())); + // Dataset must not be non-provisionable. + assert_ne!(dataset.id(), non_provisionable_dataset_id); + // Must be 3 unique zpools assert!(disk_zpools.insert(dataset.pool_id)); @@ -916,11 +973,22 @@ mod test { let (opctx, datastore) = datastore_test(&logctx, &db).await; // Create a rack without enough sleds for a successful allocation when - // we require 3 distinct sleds. - create_test_datasets_for_region_allocation( + // we require 3 distinct provisionable sleds. + let test_datasets = create_test_datasets_for_region_allocation( &opctx, datastore.clone(), - REGION_REDUNDANCY_THRESHOLD - 1, + // Here, we need to have REGION_REDUNDANCY_THRESHOLD - 1 + // provisionable sleds to test this failure condition. We're going + // to mark one sled as non-provisionable to test that logic, so we + // need to add 1 to that number. + REGION_REDUNDANCY_THRESHOLD, + ) + .await; + + mark_sled_non_provisionable( + &datastore, + &opctx, + test_datasets[0].sled_id, ) .await; diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 130c36b496..406119a636 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -15,6 +15,7 @@ use crate::db::model::Sled; use crate::db::model::SledResource; use crate::db::model::SledUpdate; use crate::db::pagination::paginated; +use crate::db::update_and_check::UpdateAndCheck; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -153,6 +154,11 @@ impl DataStore { .and(sled_has_space_in_reservoir), ) .filter(sled_dsl::time_deleted.is_null()) + // Filter out sleds that are not provisionable. + .filter( + sled_dsl::provision_state + .eq(db::model::SledProvisionState::Provisionable), + ) .select(sled_dsl::id) .into_boxed(); @@ -217,6 +223,37 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; Ok(()) } + + /// Sets the provision state for this sled. + /// + /// Returns the previous state. + pub async fn sled_set_provision_state( + &self, + opctx: &OpContext, + authz_sled: &authz::Sled, + state: db::model::SledProvisionState, + ) -> Result { + use db::schema::sled::dsl; + + opctx.authorize(authz::Action::Modify, authz_sled).await?; + + let sled_id = authz_sled.id(); + let query = diesel::update(dsl::sled) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(sled_id)) + .filter(dsl::provision_state.ne(state)) + .set(( + dsl::provision_state.eq(state), + dsl::time_modified.eq(Utc::now()), + )) + .check_if_exists::(sled_id); + let result = query + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(result.found.provision_state()) + } } #[cfg(test)] @@ -226,12 +263,15 @@ mod test { use crate::db::datastore::test::{ sled_baseboard_for_test, sled_system_hardware_for_test, }; + use crate::db::lookup::LookupPath; use crate::db::model::ByteCount; use crate::db::model::SqlU32; use nexus_test_utils::db::test_setup_database; + use nexus_types::identity::Asset; use omicron_common::api::external; use omicron_test_utils::dev; use std::net::{Ipv6Addr, SocketAddrV6}; + use std::num::NonZeroU32; fn rack_id() -> Uuid { Uuid::parse_str(nexus_test_utils::RACK_UUID).unwrap() @@ -243,19 +283,9 @@ mod test { let mut db = test_setup_database(&logctx.log).await; let (_opctx, datastore) = datastore_test(&logctx, &db).await; - let sled_id = Uuid::new_v4(); - let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0); - let mut sled_update = SledUpdate::new( - sled_id, - addr, - sled_baseboard_for_test(), - sled_system_hardware_for_test(), - rack_id(), - ); - let observed_sled = datastore - .sled_upsert(sled_update.clone()) - .await - .expect("Could not upsert sled during test prep"); + let mut sled_update = test_new_sled_update(); + let observed_sled = + datastore.sled_upsert(sled_update.clone()).await.unwrap(); assert_eq!( observed_sled.usable_hardware_threads, sled_update.usable_hardware_threads @@ -301,4 +331,119 @@ mod test { db.cleanup().await.unwrap(); logctx.cleanup_successful(); } + + /// Test that new reservations aren't created on non-provisionable sleds. + #[tokio::test] + async fn sled_reservation_create_non_provisionable() { + let logctx = + dev::test_setup_log("sled_reservation_create_non_provisionable"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + let sled_update = test_new_sled_update(); + let non_provisionable_sled = + datastore.sled_upsert(sled_update.clone()).await.unwrap(); + + let (authz_sled, _) = LookupPath::new(&opctx, &datastore) + .sled_id(non_provisionable_sled.id()) + .fetch_for(authz::Action::Modify) + .await + .unwrap(); + + let old_state = datastore + .sled_set_provision_state( + &opctx, + &authz_sled, + db::model::SledProvisionState::NonProvisionable, + ) + .await + .unwrap(); + assert_eq!( + old_state, + db::model::SledProvisionState::Provisionable, + "a newly created sled starts as provisionable" + ); + + // This should be an error since there are no provisionable sleds. + let resources = db::model::Resources::new( + 1, + // Just require the bare non-zero amount of RAM. + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ); + let constraints = db::model::SledReservationConstraints::none(); + let error = datastore + .sled_reservation_create( + &opctx, + Uuid::new_v4(), + db::model::SledResourceKind::Instance, + resources.clone(), + constraints, + ) + .await + .unwrap_err(); + assert!(matches!(error, external::Error::ServiceUnavailable { .. })); + + // Now add a provisionable sled and try again. + let sled_update = test_new_sled_update(); + let provisionable_sled = + datastore.sled_upsert(sled_update.clone()).await.unwrap(); + + let sleds = datastore + .sled_list(&opctx, &first_page(NonZeroU32::new(10).unwrap())) + .await + .unwrap(); + println!("sleds: {:?}", sleds); + + // Try a few times to ensure that resources never get allocated to the + // non-provisionable sled. + for _ in 0..10 { + let constraints = db::model::SledReservationConstraints::none(); + let resource = datastore + .sled_reservation_create( + &opctx, + Uuid::new_v4(), + db::model::SledResourceKind::Instance, + resources.clone(), + constraints, + ) + .await + .unwrap(); + assert_eq!( + resource.sled_id, + provisionable_sled.id(), + "resource is always allocated to the provisionable sled" + ); + + datastore + .sled_reservation_delete(&opctx, resource.id) + .await + .unwrap(); + } + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + fn test_new_sled_update() -> SledUpdate { + let sled_id = Uuid::new_v4(); + let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0); + SledUpdate::new( + sled_id, + addr, + sled_baseboard_for_test(), + sled_system_hardware_for_test(), + rack_id(), + ) + } + + /// Returns pagination parameters to fetch the first page of results for a + /// paginated endpoint + fn first_page<'a, T>(limit: NonZeroU32) -> DataPageParams<'a, T> { + DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit, + } + } } diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index a080af4c37..031be92c08 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -290,6 +290,7 @@ impl CandidateZpools { seed: u128, distinct_sleds: bool, ) -> Self { + use schema::sled::dsl as sled_dsl; use schema::zpool::dsl as zpool_dsl; // Why are we using raw `diesel::dsl::sql` here? @@ -310,13 +311,20 @@ impl CandidateZpools { + diesel::dsl::sql(&zpool_size_delta.to_string())) .le(diesel::dsl::sql(zpool_dsl::total_size::NAME)); + // We need to join on the sled table to access provision_state. + let with_sled = sled_dsl::sled.on(zpool_dsl::sled_id.eq(sled_dsl::id)); let with_zpool = zpool_dsl::zpool - .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id)); + .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id)) + .inner_join(with_sled); + + let sled_is_provisionable = sled_dsl::provision_state + .eq(crate::db::model::SledProvisionState::Provisionable); let base_query = old_zpool_usage .query_source() .inner_join(with_zpool) .filter(it_will_fit) + .filter(sled_is_provisionable) .select((old_zpool_usage::dsl::pool_id,)); let query = if distinct_sleds { diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index c2931f1441..44efc2934e 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -8,6 +8,7 @@ use crate::internal_api::params::{ PhysicalDiskDeleteRequest, PhysicalDiskPutRequest, SledAgentStartupInfo, SledRole, ZpoolPutRequest, }; +use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::lookup; @@ -142,6 +143,20 @@ impl super::Nexus { .await } + /// Returns the old state. + pub(crate) async fn sled_set_provision_state( + &self, + opctx: &OpContext, + sled_lookup: &lookup::Sled<'_>, + state: db::model::SledProvisionState, + ) -> Result { + let (authz_sled,) = + sled_lookup.lookup_for(authz::Action::Modify).await?; + self.db_datastore + .sled_set_provision_state(opctx, &authz_sled, state) + .await + } + // Physical disks pub(crate) async fn sled_list_physical_disks( diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 78f675c28a..f1302f4a73 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -218,6 +218,7 @@ pub(crate) fn external_api() -> NexusApiDescription { api.register(rack_view)?; api.register(sled_list)?; api.register(sled_view)?; + api.register(sled_set_provision_state)?; api.register(sled_instance_list)?; api.register(sled_physical_disk_list)?; api.register(physical_disk_list)?; @@ -4483,6 +4484,47 @@ async fn sled_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Set the sled's provision state. +#[endpoint { + method = PUT, + path = "/v1/system/hardware/sleds/{sled_id}/provision-state", + tags = ["system/hardware"], +}] +async fn sled_set_provision_state( + rqctx: RequestContext>, + path_params: Path, + new_provision_state: TypedBody, +) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.nexus; + + let path = path_params.into_inner(); + let provision_state = new_provision_state.into_inner().state; + + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + // Convert the external `SledProvisionState` into our internal data model. + let new_state = + db::model::SledProvisionState::try_from(provision_state).map_err( + |error| HttpError::for_bad_request(None, format!("{error}")), + )?; + + let sled_lookup = nexus.sled_lookup(&opctx, &path.sled_id)?; + + let old_state = nexus + .sled_set_provision_state(&opctx, &sled_lookup, new_state) + .await?; + + let response = params::SledProvisionStateResponse { + old_state: old_state.into(), + new_state: new_state.into(), + }; + + Ok(HttpResponseOk(response)) + }; + apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + /// List instances running on a given sled #[endpoint { method = GET, diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 5dfdcc151d..536b96f7ae 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -50,6 +50,12 @@ lazy_static! { format!("/v1/system/hardware/uninitialized-sleds"); pub static ref HARDWARE_SLED_URL: String = format!("/v1/system/hardware/sleds/{}", SLED_AGENT_UUID); + pub static ref HARDWARE_SLED_PROVISION_STATE_URL: String = + format!("/v1/system/hardware/sleds/{}/provision-state", SLED_AGENT_UUID); + pub static ref DEMO_SLED_PROVISION_STATE: params::SledProvisionStateParams = + params::SledProvisionStateParams { + state: nexus_types::external_api::views::SledProvisionState::NonProvisionable, + }; pub static ref HARDWARE_SWITCH_URL: String = format!("/v1/system/hardware/switches/{}", SWITCH_UUID); pub static ref HARDWARE_DISK_URL: String = @@ -1609,6 +1615,15 @@ lazy_static! { allowed_methods: vec![AllowedMethod::Get], }, + VerifyEndpoint { + url: &HARDWARE_SLED_PROVISION_STATE_URL, + visibility: Visibility::Protected, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![AllowedMethod::Put( + serde_json::to_value(&*DEMO_SLED_PROVISION_STATE).unwrap() + )], + }, + VerifyEndpoint { url: "/v1/system/hardware/switches", visibility: Visibility::Public, diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs index 213e7f9e4f..6feafe415d 100644 --- a/nexus/tests/integration_tests/schema.rs +++ b/nexus/tests/integration_tests/schema.rs @@ -629,7 +629,17 @@ impl InformationSchema { self.referential_constraints, other.referential_constraints ); - similar_asserts::assert_eq!(self.statistics, other.statistics); + similar_asserts::assert_eq!( + self.statistics, + other.statistics, + "Statistics did not match. This often means that in dbinit.sql, a new \ + column was added into the middle of a table rather than to the end. \ + If that is the case:\n\n \ + \ + * Change dbinit.sql to add the column to the end of the table.\n\ + * Update nexus/db-model/src/schema.rs and the corresponding \ + Queryable/Insertable struct with the new column ordering." + ); similar_asserts::assert_eq!(self.sequences, other.sequences); similar_asserts::assert_eq!(self.pg_indexes, other.pg_indexes); } diff --git a/nexus/tests/output/nexus_tags.txt b/nexus/tests/output/nexus_tags.txt index dd387ab979..7e57d00df2 100644 --- a/nexus/tests/output/nexus_tags.txt +++ b/nexus/tests/output/nexus_tags.txt @@ -120,6 +120,7 @@ rack_view GET /v1/system/hardware/racks/{rac sled_instance_list GET /v1/system/hardware/sleds/{sled_id}/instances sled_list GET /v1/system/hardware/sleds sled_physical_disk_list GET /v1/system/hardware/sleds/{sled_id}/disks +sled_set_provision_state PUT /v1/system/hardware/sleds/{sled_id}/provision-state sled_view GET /v1/system/hardware/sleds/{sled_id} switch_list GET /v1/system/hardware/switches switch_view GET /v1/system/hardware/switches/{switch_id} diff --git a/nexus/types/Cargo.toml b/nexus/types/Cargo.toml index 9cb94a8484..8cbbd8626c 100644 --- a/nexus/types/Cargo.toml +++ b/nexus/types/Cargo.toml @@ -14,6 +14,7 @@ parse-display.workspace = true schemars = { workspace = true, features = ["chrono", "uuid1"] } serde.workspace = true serde_json.workspace = true +serde_with.workspace = true steno.workspace = true strum.workspace = true uuid.workspace = true diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index a0169ae777..a5f1f3f874 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -75,6 +75,23 @@ pub struct SledSelector { pub sled: Uuid, } +/// Parameters for `sled_set_provision_state`. +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] +pub struct SledProvisionStateParams { + /// The provision state. + pub state: super::views::SledProvisionState, +} + +/// Response to `sled_set_provision_state`. +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] +pub struct SledProvisionStateResponse { + /// The old provision state. + pub old_state: super::views::SledProvisionState, + + /// The new provision state. + pub new_state: super::views::SledProvisionState, +} + pub struct SwitchSelector { /// ID of the switch pub switch: Uuid, diff --git a/nexus/types/src/external_api/views.rs b/nexus/types/src/external_api/views.rs index 9dfe36d63b..6d02623f34 100644 --- a/nexus/types/src/external_api/views.rs +++ b/nexus/types/src/external_api/views.rs @@ -17,6 +17,7 @@ use omicron_common::api::external::{ }; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use serde_with::rust::deserialize_ignore_any; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::net::IpAddr; @@ -286,12 +287,38 @@ pub struct Sled { pub baseboard: Baseboard, /// The rack to which this Sled is currently attached pub rack_id: Uuid, + /// The provision state of the sled. + pub provision_state: SledProvisionState, /// The number of hardware threads which can execute on this sled pub usable_hardware_threads: u32, /// Amount of RAM which may be used by the Sled's OS pub usable_physical_ram: ByteCount, } +/// The provision state of a sled. +/// +/// This controls whether new resources are going to be provisioned on this +/// sled. +#[derive( + Copy, Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, +)] +#[serde(rename_all = "snake_case")] +pub enum SledProvisionState { + /// New resources will be provisioned on this sled. + Provisionable, + + /// New resources will not be provisioned on this sled. However, existing + /// resources will continue to be on this sled unless manually migrated + /// off. + NonProvisionable, + + /// This is a state that isn't known yet. + /// + /// This is defined to avoid API breakage. + #[serde(other, deserialize_with = "deserialize_ignore_any")] + Unknown, +} + /// An operator's view of an instance running on a given sled #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct SledInstance { diff --git a/openapi/nexus.json b/openapi/nexus.json index 704aa393db..08e6cd7149 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -3817,6 +3817,55 @@ } } }, + "/v1/system/hardware/sleds/{sled_id}/provision-state": { + "put": { + "tags": [ + "system/hardware" + ], + "summary": "Set the sled's provision state.", + "operationId": "sled_set_provision_state", + "parameters": [ + { + "in": "path", + "name": "sled_id", + "description": "ID of the sled", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledProvisionStateParams" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledProvisionStateResponse" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/v1/system/hardware/switch-port": { "get": { "tags": [ @@ -12976,6 +13025,14 @@ "type": "string", "format": "uuid" }, + "provision_state": { + "description": "The provision state of the sled.", + "allOf": [ + { + "$ref": "#/components/schemas/SledProvisionState" + } + ] + }, "rack_id": { "description": "The rack to which this Sled is currently attached", "type": "string", @@ -13009,6 +13066,7 @@ "required": [ "baseboard", "id", + "provision_state", "rack_id", "time_created", "time_modified", @@ -13099,6 +13157,75 @@ "items" ] }, + "SledProvisionState": { + "description": "The provision state of a sled.\n\nThis controls whether new resources are going to be provisioned on this sled.", + "oneOf": [ + { + "description": "New resources will be provisioned on this sled.", + "type": "string", + "enum": [ + "provisionable" + ] + }, + { + "description": "New resources will not be provisioned on this sled. However, existing resources will continue to be on this sled unless manually migrated off.", + "type": "string", + "enum": [ + "non_provisionable" + ] + }, + { + "description": "This is a state that isn't known yet.\n\nThis is defined to avoid API breakage.", + "type": "string", + "enum": [ + "unknown" + ] + } + ] + }, + "SledProvisionStateParams": { + "description": "Parameters for `sled_set_provision_state`.", + "type": "object", + "properties": { + "state": { + "description": "The provision state.", + "allOf": [ + { + "$ref": "#/components/schemas/SledProvisionState" + } + ] + } + }, + "required": [ + "state" + ] + }, + "SledProvisionStateResponse": { + "description": "Response to `sled_set_provision_state`.", + "type": "object", + "properties": { + "new_state": { + "description": "The new provision state.", + "allOf": [ + { + "$ref": "#/components/schemas/SledProvisionState" + } + ] + }, + "old_state": { + "description": "The old provision state.", + "allOf": [ + { + "$ref": "#/components/schemas/SledProvisionState" + } + ] + } + }, + "required": [ + "new_state", + "old_state" + ] + }, "SledResultsPage": { "description": "A single page of results", "type": "object", diff --git a/schema/crdb/15.0.0/up1.sql b/schema/crdb/15.0.0/up1.sql new file mode 100644 index 0000000000..04baa76370 --- /dev/null +++ b/schema/crdb/15.0.0/up1.sql @@ -0,0 +1,6 @@ +CREATE TYPE IF NOT EXISTS omicron.public.sled_provision_state AS ENUM ( + -- New resources can be provisioned onto the sled + 'provisionable', + -- New resources must not be provisioned onto the sled + 'non_provisionable' +); diff --git a/schema/crdb/15.0.0/up2.sql b/schema/crdb/15.0.0/up2.sql new file mode 100644 index 0000000000..e3ea2ba11c --- /dev/null +++ b/schema/crdb/15.0.0/up2.sql @@ -0,0 +1,3 @@ +ALTER TABLE omicron.public.sled + ADD COLUMN IF NOT EXISTS provision_state omicron.public.sled_provision_state + NOT NULL DEFAULT 'provisionable'; diff --git a/schema/crdb/15.0.0/up3.sql b/schema/crdb/15.0.0/up3.sql new file mode 100644 index 0000000000..aaa3feac20 --- /dev/null +++ b/schema/crdb/15.0.0/up3.sql @@ -0,0 +1,5 @@ +-- Drop the default column value for provision_state -- it should always be set +-- by Nexus. +ALTER TABLE omicron.public.sled + ALTER COLUMN provision_state + DROP DEFAULT; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 728b084982..178c7af913 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -73,6 +73,13 @@ CREATE TABLE IF NOT EXISTS omicron.public.rack ( * Sleds */ +CREATE TYPE IF NOT EXISTS omicron.public.sled_provision_state AS ENUM ( + -- New resources can be provisioned onto the sled + 'provisionable', + -- New resources must not be provisioned onto the sled + 'non_provisionable' +); + CREATE TABLE IF NOT EXISTS omicron.public.sled ( /* Identity metadata (asset) */ id UUID PRIMARY KEY, @@ -104,6 +111,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.sled ( /* The last address allocated to an Oxide service on this sled. */ last_used_address INET NOT NULL, + /* The state of whether resources should be provisioned onto the sled */ + provision_state omicron.public.sled_provision_state NOT NULL, + -- This constraint should be upheld, even for deleted disks -- in the fleet. CONSTRAINT serial_part_revision_unique UNIQUE ( @@ -2997,7 +3007,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '14.0.0', NULL) + ( TRUE, NOW(), NOW(), '15.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT;