From c71d28d6435b96002de4b20fd14ea705aac80d0a Mon Sep 17 00:00:00 2001 From: Rain Date: Thu, 8 Feb 2024 14:53:20 -0800 Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?= =?UTF-8?q?l=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- nexus/db-model/src/lib.rs | 6 +- nexus/db-model/src/schema.rs | 3 +- nexus/db-model/src/sled.rs | 35 +- nexus/db-model/src/sled_policy.rs | 66 +++ nexus/db-model/src/sled_provision_state.rs | 53 --- nexus/db-model/src/sled_state.rs | 39 ++ nexus/db-queries/src/db/datastore/mod.rs | 11 +- nexus/db-queries/src/db/datastore/sled.rs | 391 ++++++++++++++++-- nexus/db-queries/src/db/pool_connection.rs | 3 +- .../src/db/queries/region_allocation.rs | 9 +- nexus/src/app/sled.rs | 11 +- nexus/src/external_api/http_entrypoints.rs | 16 +- nexus/tests/integration_tests/endpoints.rs | 6 +- nexus/types/src/external_api/params.rs | 10 +- nexus/types/src/external_api/views.rs | 117 +++++- openapi/nexus.json | 20 +- schema/crdb/dbinit.sql | 45 +- 17 files changed, 689 insertions(+), 152 deletions(-) create mode 100644 nexus/db-model/src/sled_policy.rs delete mode 100644 nexus/db-model/src/sled_provision_state.rs create mode 100644 nexus/db-model/src/sled_state.rs diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index b77d56059e..7b9152c83e 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -73,9 +73,10 @@ mod silo_user; mod silo_user_password_hash; mod sled; mod sled_instance; -mod sled_provision_state; +mod sled_policy; mod sled_resource; mod sled_resource_kind; +mod sled_state; mod sled_underlay_subnet_allocation; mod snapshot; mod ssh_key; @@ -160,9 +161,10 @@ pub use silo_user::*; pub use silo_user_password_hash::*; pub use sled::*; pub use sled_instance::*; -pub use sled_provision_state::*; +pub use sled_policy::to_db_sled_policy; // Do not expose DbSledPolicy pub use sled_resource::*; pub use 
sled_resource_kind::*; +pub use sled_state::*; pub use sled_underlay_subnet_allocation::*; pub use snapshot::*; pub use ssh_key::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 736442282c..17c0aec55f 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -810,7 +810,8 @@ table! { ip -> Inet, port -> Int4, last_used_address -> Inet, - provision_state -> crate::SledProvisionStateEnum, + sled_policy -> crate::sled_policy::SledPolicyEnum, + sled_state -> crate::SledStateEnum, } } diff --git a/nexus/db-model/src/sled.rs b/nexus/db-model/src/sled.rs index 52968c27d5..672840271d 100644 --- a/nexus/db-model/src/sled.rs +++ b/nexus/db-model/src/sled.rs @@ -2,10 +2,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::{ByteCount, Generation, SqlU16, SqlU32}; +use super::{ByteCount, Generation, SledState, SqlU16, SqlU32}; use crate::collection::DatastoreCollectionConfig; +use crate::ipv6; use crate::schema::{physical_disk, service, sled, zpool}; -use crate::{ipv6, SledProvisionState}; +use crate::sled_policy::DbSledPolicy; use chrono::{DateTime, Utc}; use db_macros::Asset; use nexus_types::{external_api::shared, external_api::views, identity::Asset}; @@ -60,7 +61,11 @@ pub struct Sled { /// The last IP address provided to a propolis instance on this sled pub last_used_address: ipv6::Ipv6Addr, - provision_state: SledProvisionState, + #[diesel(column_name = sled_policy)] + policy: DbSledPolicy, + + #[diesel(column_name = sled_state)] + state: SledState, } impl Sled { @@ -84,8 +89,15 @@ impl Sled { &self.serial_number } - pub fn provision_state(&self) -> SledProvisionState { - self.provision_state + /// The policy here is the `views::SledPolicy` because we expect external + /// users to always use that. + pub fn policy(&self) -> views::SledPolicy { + self.policy.into() + } + + /// Returns the sled's state. 
+ pub fn state(&self) -> SledState { + self.state } } @@ -99,7 +111,9 @@ impl From for views::Sled { part: sled.part_number, revision: sled.revision, }, - provision_state: sled.provision_state.into(), + policy: sled.policy.into(), + provision_policy: sled.policy.to_provision_policy(), + state: sled.state.into(), usable_hardware_threads: sled.usable_hardware_threads.0, usable_physical_ram: *sled.usable_physical_ram, } @@ -197,8 +211,13 @@ impl SledUpdate { serial_number: self.serial_number, part_number: self.part_number, revision: self.revision, - // By default, sleds start as provisionable. - provision_state: SledProvisionState::Provisionable, + // By default, sleds start in-service. + policy: DbSledPolicy::InService, + // By default, sleds start in the "active" state. + // + // XXX In the future there probably needs to be an "uninitialized" + // state here. + state: SledState::Active, usable_hardware_threads: self.usable_hardware_threads, usable_physical_ram: self.usable_physical_ram, reservoir_size: self.reservoir_size, diff --git a/nexus/db-model/src/sled_policy.rs b/nexus/db-model/src/sled_policy.rs new file mode 100644 index 0000000000..7885b48214 --- /dev/null +++ b/nexus/db-model/src/sled_policy.rs @@ -0,0 +1,66 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use nexus_types::external_api::views::{SledPolicy, SledProvisionPolicy}; +use serde::{Deserialize, Serialize}; + +impl_enum_type!( + #[derive(Clone, SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "sled_policy", schema = "public"))] + pub struct SledPolicyEnum; + + /// This type is not actually public, because [`SledPolicy`] has a somewhat + /// different, friendlier shape while being equivalent -- external code + /// should always use [`SledPolicy`]. 
+ #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = SledPolicyEnum)] + pub enum DbSledPolicy; + + // Enum values + InService => b"in_service" + InServiceNoProvision => b"in_service_no_provision" + Expunged => b"expunged" +); + +/// Converts a [`SledPolicy`] to a version that can be inserted into a +/// database. +pub fn to_db_sled_policy(policy: SledPolicy) -> DbSledPolicy { + match policy { + SledPolicy::InService { + provision_policy: SledProvisionPolicy::Provisionable, + } => DbSledPolicy::InService, + SledPolicy::InService { + provision_policy: SledProvisionPolicy::NonProvisionable, + } => DbSledPolicy::InServiceNoProvision, + SledPolicy::Expunged => DbSledPolicy::Expunged, + } +} + +impl DbSledPolicy { + /// Converts self into the appropriate provision policy, in a lossy manner. + pub fn to_provision_policy(self) -> SledProvisionPolicy { + match self { + DbSledPolicy::InService => SledProvisionPolicy::Provisionable, + DbSledPolicy::InServiceNoProvision => { + SledProvisionPolicy::NonProvisionable + } + DbSledPolicy::Expunged => SledProvisionPolicy::NonProvisionable, + } + } +} + +impl From for SledPolicy { + fn from(policy: DbSledPolicy) -> Self { + match policy { + DbSledPolicy::InService => SledPolicy::InService { + provision_policy: SledProvisionPolicy::Provisionable, + }, + DbSledPolicy::InServiceNoProvision => SledPolicy::InService { + provision_policy: SledProvisionPolicy::NonProvisionable, + }, + DbSledPolicy::Expunged => SledPolicy::Expunged, + } + } +} diff --git a/nexus/db-model/src/sled_provision_state.rs b/nexus/db-model/src/sled_provision_state.rs deleted file mode 100644 index ada842a32f..0000000000 --- a/nexus/db-model/src/sled_provision_state.rs +++ /dev/null @@ -1,53 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
- -use super::impl_enum_type; -use nexus_types::external_api::views; -use serde::{Deserialize, Serialize}; -use thiserror::Error; - -impl_enum_type!( - #[derive(Clone, SqlType, Debug, QueryId)] - #[diesel(postgres_type(name = "sled_provision_state", schema = "public"))] - pub struct SledProvisionStateEnum; - - #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] - #[diesel(sql_type = SledProvisionStateEnum)] - pub enum SledProvisionState; - - // Enum values - Provisionable => b"provisionable" - NonProvisionable => b"non_provisionable" -); - -impl From for views::SledProvisionState { - fn from(state: SledProvisionState) -> Self { - match state { - SledProvisionState::Provisionable => { - views::SledProvisionState::Provisionable - } - SledProvisionState::NonProvisionable => { - views::SledProvisionState::NonProvisionable - } - } - } -} - -impl From for SledProvisionState { - fn from(state: views::SledProvisionState) -> Self { - match state { - views::SledProvisionState::Provisionable => { - SledProvisionState::Provisionable - } - views::SledProvisionState::NonProvisionable => { - SledProvisionState::NonProvisionable - } - } - } -} - -/// An unknown [`views::SledProvisionState`] was encountered. -#[derive(Clone, Debug, Error)] -#[error("Unknown SledProvisionState")] -pub struct UnknownSledProvisionState; diff --git a/nexus/db-model/src/sled_state.rs b/nexus/db-model/src/sled_state.rs new file mode 100644 index 0000000000..c93f602728 --- /dev/null +++ b/nexus/db-model/src/sled_state.rs @@ -0,0 +1,39 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use super::impl_enum_type; +use nexus_types::external_api::views; +use serde::{Deserialize, Serialize}; + +impl_enum_type!( + #[derive(Clone, SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "sled_state", schema = "public"))] + pub struct SledStateEnum; + + #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = SledStateEnum)] + pub enum SledState; + + // Enum values + Active => b"active" + Decommissioned => b"decommissioned" +); + +impl From for views::SledState { + fn from(state: SledState) -> Self { + match state { + SledState::Active => views::SledState::Active, + SledState::Decommissioned => views::SledState::Decommissioned, + } + } +} + +impl From for SledState { + fn from(state: views::SledState) -> Self { + match state { + views::SledState::Active => SledState::Active, + views::SledState::Decommissioned => SledState::Decommissioned, + } + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index b9ad2ea610..620414969b 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -401,8 +401,8 @@ mod test { use crate::db::model::{ BlockSize, ConsoleSession, Dataset, DatasetKind, ExternalIp, PhysicalDisk, PhysicalDiskKind, Project, Rack, Region, Service, - ServiceKind, SiloUser, SledBaseboard, SledProvisionState, - SledSystemHardware, SledUpdate, SshKey, VpcSubnet, Zpool, + ServiceKind, SiloUser, SledBaseboard, SledSystemHardware, SledUpdate, + SshKey, VpcSubnet, Zpool, }; use crate::db::queries::vpc_subnet::FilterConflictingVpcSubnetRangesQuery; use chrono::{Duration, Utc}; @@ -411,6 +411,7 @@ mod test { use nexus_db_model::IpAttachState; use nexus_test_utils::db::test_setup_database; use nexus_types::external_api::params; + use nexus_types::external_api::views::SledProvisionPolicy; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::{ ByteCount, Error, 
IdentityMetadataCreateParams, LookupType, Name, @@ -652,10 +653,10 @@ mod test { .unwrap(); println!("sled: {:?}", sled); let old_state = datastore - .sled_set_provision_state( + .sled_set_provision_policy( &opctx, &authz_sled, - SledProvisionState::NonProvisionable, + SledProvisionPolicy::NonProvisionable, ) .await .unwrap_or_else(|error| { @@ -665,7 +666,7 @@ mod test { }); // The old state should always be provisionable since that's where we // start. - assert_eq!(old_state, SledProvisionState::Provisionable); + assert_eq!(old_state, SledProvisionPolicy::Provisionable); } fn test_zpool_size() -> ByteCount { diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 7b94d64418..591bc7e4e7 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -10,8 +10,10 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; +use crate::db::model::to_db_sled_policy; use crate::db::model::Sled; use crate::db::model::SledResource; +use crate::db::model::SledState; use crate::db::model::SledUpdate; use crate::db::pagination::paginated; use crate::db::update_and_check::UpdateAndCheck; @@ -19,6 +21,8 @@ use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; +use nexus_types::external_api::views::SledPolicy; +use nexus_types::external_api::views::SledProvisionPolicy; use omicron_common::api::external; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -168,10 +172,11 @@ impl DataStore { .and(sled_has_space_in_reservoir), ) .filter(sled_dsl::time_deleted.is_null()) - // Filter out sleds that are not provisionable. - .filter(sled_dsl::provision_state.eq( - db::model::SledProvisionState::Provisionable, + // Ensure that the sled is in-service and active. 
+ .filter(sled_dsl::sled_policy.eq( + to_db_sled_policy(SledPolicy::provisionable()), )) + .filter(sled_dsl::sled_state.eq(SledState::Active)) .select(sled_dsl::id) .into_boxed(); @@ -243,35 +248,217 @@ impl DataStore { Ok(()) } - /// Sets the provision state for this sled. + /// Sets the provision policy for this sled. /// - /// Returns the previous state. - pub async fn sled_set_provision_state( + /// Errors if the sled is not in service. + /// + /// Returns the previous policy. + pub async fn sled_set_provision_policy( + &self, + opctx: &OpContext, + authz_sled: &authz::Sled, + policy: SledProvisionPolicy, + ) -> Result { + use db::schema::sled::dsl; + + opctx.authorize(authz::Action::Modify, authz_sled).await?; + + let new_policy = SledPolicy::InService { provision_policy: policy }; + // The sled policy can only be changed if the current policy is one of + // the `in_service` ones. There are only two in_service policies + // possible at the moment. + let valid_old_policy = + SledPolicy::InService { provision_policy: policy.invert() }; + + let sled_id = authz_sled.id(); + let query = diesel::update(dsl::sled) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(sled_id)) + .filter(dsl::sled_policy.eq(to_db_sled_policy(valid_old_policy))) + // Ensure that the sled is active. + .filter(dsl::sled_state.eq(SledState::Active)) + .set(( + dsl::sled_policy.eq(to_db_sled_policy(new_policy)), + dsl::time_modified.eq(Utc::now()), + )) + .check_if_exists::(sled_id); + let result = query + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + // There are three possibilities here: + // 1. The sled policy was the same as the valid old one, and was + // updated. + // 2. The sled policy was the same as the updated one. The policy was + // not updated, but this is fine because this method is idempotent. + // 3. The sled policy was something else. 
In that case, we should + // return an error. + match result.found.policy() { + SledPolicy::InService { provision_policy } => Ok(provision_policy), + other @ SledPolicy::Expunged => { + Err(external::Error::conflict(format!( + "the sled is {other}, and its \ + provision state cannot be changed" + ))) + } + } + } + + /// Marks a sled as expunged, as directed by the operator. + /// + /// This is an irreversible process! It should only be called after + /// sufficient warning to the operator. + /// + /// This is idempotent, and it returns the old policy of the sled. + /// + /// XXX: This, or the code that's calling it, needs to kick off the + /// blueprint planner. + pub async fn sled_set_policy_to_expunged( + &self, + opctx: &OpContext, + authz_sled: &authz::Sled, + ) -> Result { + use db::schema::sled::dsl; + + opctx.authorize(authz::Action::Modify, authz_sled).await?; + + let new_policy = SledPolicy::Expunged; + // The valid policies to transition from are the two in-service ones. + let valid_old_policies = [ + SledPolicy::InService { + provision_policy: SledProvisionPolicy::Provisionable, + }, + SledPolicy::InService { + provision_policy: SledProvisionPolicy::NonProvisionable, + }, + ] + .into_iter() + .map(to_db_sled_policy); + + let sled_id = authz_sled.id(); + let query = diesel::update(dsl::sled) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(sled_id)) + .filter(dsl::sled_policy.eq_any(valid_old_policies)) + // Ensure that the sled is active. + .filter(dsl::sled_state.eq(SledState::Active)) + .set(( + dsl::sled_policy.eq(to_db_sled_policy(new_policy)), + dsl::time_modified.eq(Utc::now()), + )) + .check_if_exists::(sled_id); + + let result = query + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + // There are two possibilities here: + // 1. The sled policy was one of the valid old ones (in-service), and + // was updated. + // 2. 
The sled policy was already expunged. The policy was not updated, + // but this is fine because this method is idempotent. + // + // In the future, when we add graceful sled removal, we'll need to + // ensure that graceful removal <-> expunged transitions are + // disallowed. + Ok(result.found.policy()) + } + + /// Marks the state of the sled as decommissioned, as believed by Nexus. + /// + /// This is an irreversible process! It should only be called after all + /// resources previously on the sled have been migrated over. + /// + /// This is idempotent, and it returns the old state of the sled. + /// + /// # Errors + /// + /// This method returns an error if the sled policy is not a state that is + /// valid to decommission from (i.e. if, for the current sled policy, + /// [`SledPolicy::is_decommissionable`] returns `false`). + pub async fn sled_set_state_to_decommissioned( + &self, + opctx: &OpContext, + authz_sled: &authz::Sled, + ) -> Result { + self.sled_set_state_to_decommissioned_inner(opctx, authz_sled, true) + .await + } + + async fn sled_set_state_to_decommissioned_inner( &self, opctx: &OpContext, authz_sled: &authz::Sled, - state: db::model::SledProvisionState, - ) -> Result { + // check_decommissionable = false means illegal state transitions are + // possible -- this must only be called from test-only code. 
+ check_decommissionable: bool, + ) -> Result { use db::schema::sled::dsl; + #[cfg(not(test))] + { + if !check_decommissionable { + panic!("check_decommissionable = false is only allowed in test code") + } + } + opctx.authorize(authz::Action::Modify, authz_sled).await?; + let new_state = SledState::Decommissioned; let sled_id = authz_sled.id(); let query = diesel::update(dsl::sled) .filter(dsl::time_deleted.is_null()) .filter(dsl::id.eq(sled_id)) - .filter(dsl::provision_state.ne(state)) + .filter(dsl::sled_state.eq(SledState::Active)); + + let query = if check_decommissionable { + query + .filter( + dsl::sled_policy.eq_any( + SledPolicy::all_decommissionable() + .into_iter() + .map(|p| to_db_sled_policy(*p)), + ), + ) + .into_boxed() + } else { + query.into_boxed() + }; + + let query = query .set(( - dsl::provision_state.eq(state), + dsl::sled_state.eq(new_state), dsl::time_modified.eq(Utc::now()), )) .check_if_exists::(sled_id); + let result = query .execute_and_check(&*self.pool_connection_authorized(opctx).await?) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - Ok(result.found.provision_state()) + // There are three possibilities here: + // 1. The sled state was active, and was updated. + // 2. The sled state was already decommissioned. The state was not + // updated, but this is fine because this method is idempotent. + // 3. The sled policy was not expunged. If so, return an error. + match result.found.policy() { + SledPolicy::Expunged => Ok(result.found.state()), + other @ SledPolicy::InService { .. } => { + if check_decommissionable { + Err(external::Error::conflict(format!( + "the sled is {other}, and its state cannot be \ + decommissioned" + ))) + } else { + // This is test-only code, so don't check if it is + // in-service. 
+ Ok(result.found.state()) + } + } + } } } @@ -359,29 +546,61 @@ mod test { let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; - let sled_update = test_new_sled_update(); + // Define some sleds that resources cannot be provisioned on. let non_provisionable_sled = - datastore.sled_upsert(sled_update.clone()).await.unwrap(); - - let (authz_sled, _) = LookupPath::new(&opctx, &datastore) - .sled_id(non_provisionable_sled.id()) - .fetch_for(authz::Action::Modify) - .await - .unwrap(); - - let old_state = datastore - .sled_set_provision_state( - &opctx, - &authz_sled, - db::model::SledProvisionState::NonProvisionable, - ) - .await - .unwrap(); - assert_eq!( - old_state, - db::model::SledProvisionState::Provisionable, - "a newly created sled starts as provisionable" - ); + datastore.sled_upsert(test_new_sled_update()).await.unwrap(); + set_provision_policy( + &opctx, + &datastore, + non_provisionable_sled.id(), + SledProvisionPolicy::NonProvisionable, + Ok(SledProvisionPolicy::Provisionable), + ) + .await; + + let expunged_sled = + datastore.sled_upsert(test_new_sled_update()).await.unwrap(); + set_policy_to_expunged( + &opctx, + &datastore, + expunged_sled.id(), + Ok(SledPolicy::provisionable()), + ) + .await; + + let decommissioned_sled = + datastore.sled_upsert(test_new_sled_update()).await.unwrap(); + // Legally, we must set the policy to expunged before setting the state + // to decommissioned. + set_policy_to_expunged( + &opctx, + &datastore, + decommissioned_sled.id(), + Ok(SledPolicy::provisionable()), + ) + .await; + set_state_to_decommissioned( + &opctx, + &datastore, + decommissioned_sled.id(), + true, + Ok(SledState::Active), + ) + .await; + + // This is _not_ a legal state, BUT we test it out to ensure that if + // the system somehow enters this state anyway, we don't try and + // provision resources on it. 
+ let illegal_decommissioned_sled = + datastore.sled_upsert(test_new_sled_update()).await.unwrap(); + set_state_to_decommissioned( + &opctx, + &datastore, + illegal_decommissioned_sled.id(), + false, + Ok(SledState::Active), + ) + .await; // This should be an error since there are no provisionable sleds. let resources = db::model::Resources::new( @@ -456,6 +675,107 @@ mod test { ) } + async fn set_provision_policy( + opctx: &OpContext, + datastore: &DataStore, + sled_id: Uuid, + new_policy: SledProvisionPolicy, + expected_old_policy: Result, + ) { + let (authz_sled, _) = LookupPath::new(&opctx, &datastore) + .sled_id(sled_id) + .fetch_for(authz::Action::Modify) + .await + .unwrap(); + + let res = datastore + .sled_set_provision_policy(&opctx, &authz_sled, new_policy) + .await; + match expected_old_policy { + Ok(expected_old_policy) => { + assert_eq!( + res, + Ok(expected_old_policy), + "actual old policy is not the same as expected" + ); + } + Err(SetStateError::InvalidTransition) => { + let error = res + .expect_err("expected an invalid state transition error"); + // Invalid transitions are represented as conflicts. + assert!(matches!(error, external::Error::Conflict { .. }), ""); + } + } + } + + async fn set_policy_to_expunged( + opctx: &OpContext, + datastore: &DataStore, + sled_id: Uuid, + expected_old_policy: Result, + ) { + let (authz_sled, _) = LookupPath::new(&opctx, &datastore) + .sled_id(sled_id) + .fetch_for(authz::Action::Modify) + .await + .unwrap(); + + let res = + datastore.sled_set_policy_to_expunged(&opctx, &authz_sled).await; + match expected_old_policy { + Ok(expected_old_policy) => { + assert_eq!( + res, + Ok(expected_old_policy), + "actual old policy is not the same as expected" + ); + } + Err(SetStateError::InvalidTransition) => { + let error = res + .expect_err("expected an invalid state transition error"); + // Invalid transitions are represented as conflicts. + assert!(matches!(error, external::Error::Conflict { .. 
}), ""); + } + } + } + + async fn set_state_to_decommissioned( + opctx: &OpContext, + datastore: &DataStore, + sled_id: Uuid, + check_decommissionable: bool, + expected_old_state: Result, + ) { + let (authz_sled, _) = LookupPath::new(&opctx, &datastore) + .sled_id(sled_id) + .fetch_for(authz::Action::Modify) + .await + .unwrap(); + + let res = datastore + .sled_set_state_to_decommissioned_inner( + &opctx, + &authz_sled, + check_decommissionable, + ) + .await; + match expected_old_state { + Ok(expected_old_state) => { + assert_eq!( + res, + Ok(expected_old_state), + "actual old policy is not the same as expected" + ); + } + Err(SetStateError::InvalidTransition) => { + let error = res + .expect_err("expected an invalid state transition error"); + // Invalid transitions are represented as conflicts. + assert!(matches!(error, external::Error::Conflict { .. }), ""); + } + } + } + /// Returns pagination parameters to fetch the first page of results for a /// paginated endpoint fn first_page<'a, T>(limit: NonZeroU32) -> DataPageParams<'a, T> { @@ -465,4 +785,9 @@ mod test { limit, } } + + #[derive(Copy, Clone, Debug)] + enum SetStateError { + InvalidTransition, + } } diff --git a/nexus/db-queries/src/db/pool_connection.rs b/nexus/db-queries/src/db/pool_connection.rs index d9c50ff26c..1bc18571e6 100644 --- a/nexus/db-queries/src/db/pool_connection.rs +++ b/nexus/db-queries/src/db/pool_connection.rs @@ -59,9 +59,10 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[ "router_route_kind", "saga_state", "service_kind", - "sled_provision_state", + "sled_policy", "sled_resource_kind", "sled_role", + "sled_state", "snapshot_state", "sp_type", "switch_interface_kind", diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index 3c37bf6b2e..43e1750812 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -27,6 +27,9 @@ use 
nexus_db_model::queries::region_allocation::{ proposed_dataset_changes, shuffled_candidate_datasets, updated_datasets, }; use nexus_db_model::schema; +use nexus_db_model::to_db_sled_policy; +use nexus_db_model::SledState; +use nexus_types::external_api::views::SledPolicy; use omicron_common::api::external; use omicron_common::nexus_config::RegionAllocationStrategy; @@ -321,14 +324,16 @@ impl CandidateZpools { .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id)) .inner_join(with_sled); - let sled_is_provisionable = sled_dsl::provision_state - .eq(crate::db::model::SledProvisionState::Provisionable); + let sled_is_provisionable = sled_dsl::sled_policy + .eq(to_db_sled_policy(SledPolicy::provisionable())); + let sled_is_active = sled_dsl::sled_state.eq(SledState::Active); let base_query = old_zpool_usage .query_source() .inner_join(with_zpool) .filter(it_will_fit) .filter(sled_is_provisionable) + .filter(sled_is_active) .select((old_zpool_usage::dsl::pool_id,)); let query = if distinct_sleds { diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 943490ac04..c5c70ceff2 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -14,6 +14,7 @@ use nexus_db_queries::db; use nexus_db_queries::db::lookup; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::model::DatasetKind; +use nexus_types::external_api::views::SledProvisionPolicy; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; @@ -143,17 +144,17 @@ impl super::Nexus { .await } - /// Returns the old state. - pub(crate) async fn sled_set_provision_state( + /// Returns the old provision policy. 
+ pub(crate) async fn sled_set_provision_policy( &self, opctx: &OpContext, sled_lookup: &lookup::Sled<'_>, - state: db::model::SledProvisionState, - ) -> Result { + new_policy: SledProvisionPolicy, + ) -> Result { let (authz_sled,) = sled_lookup.lookup_for(authz::Action::Modify).await?; self.db_datastore - .sled_set_provision_state(opctx, &authz_sled, state) + .sled_set_provision_policy(opctx, &authz_sled, new_policy) .await } diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 28755e5959..9779163d70 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -5169,29 +5169,25 @@ async fn sled_view( async fn sled_set_provision_state( rqctx: RequestContext>, path_params: Path, - new_provision_state: TypedBody, -) -> Result, HttpError> { + new_provision_state: TypedBody, +) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.nexus; let path = path_params.into_inner(); - let provision_state = new_provision_state.into_inner().state; + let new_state = new_provision_state.into_inner().state; let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - // Convert the external `SledProvisionState` into our internal data model. 
- let new_state = db::model::SledProvisionState::from(provision_state); let sled_lookup = nexus.sled_lookup(&opctx, &path.sled_id)?; let old_state = nexus - .sled_set_provision_state(&opctx, &sled_lookup, new_state) + .sled_set_provision_policy(&opctx, &sled_lookup, new_state) .await?; - let response = params::SledProvisionStateResponse { - old_state: old_state.into(), - new_state: new_state.into(), - }; + let response = + params::SledProvisionPolicyResponse { old_state, new_state }; Ok(HttpResponseOk(response)) }; diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index cd04bb6018..93cc9d1db9 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -48,10 +48,10 @@ pub static HARDWARE_SLED_URL: Lazy = pub static HARDWARE_SLED_PROVISION_STATE_URL: Lazy = Lazy::new(|| { format!("/v1/system/hardware/sleds/{}/provision-state", SLED_AGENT_UUID) }); -pub static DEMO_SLED_PROVISION_STATE: Lazy = +pub static DEMO_SLED_PROVISION_STATE: Lazy = Lazy::new(|| { - params::SledProvisionStateParams { - state: nexus_types::external_api::views::SledProvisionState::NonProvisionable, + params::SledProvisionPolicyParams { + state: nexus_types::external_api::views::SledProvisionPolicy::NonProvisionable, } }); diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 6cb878084d..e8caefe1bc 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -98,19 +98,19 @@ pub struct SledSelector { /// Parameters for `sled_set_provision_state`. #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] -pub struct SledProvisionStateParams { +pub struct SledProvisionPolicyParams { /// The provision state. - pub state: super::views::SledProvisionState, + pub state: super::views::SledProvisionPolicy, } /// Response to `sled_set_provision_state`. 
#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] -pub struct SledProvisionStateResponse { +pub struct SledProvisionPolicyResponse { /// The old provision state. - pub old_state: super::views::SledProvisionState, + pub old_state: super::views::SledProvisionPolicy, /// The new provision state. - pub new_state: super::views::SledProvisionState, + pub new_state: super::views::SledProvisionPolicy, } pub struct SwitchSelector { diff --git a/nexus/types/src/external_api/views.rs b/nexus/types/src/external_api/views.rs index 84648f109f..79a5f7bb36 100644 --- a/nexus/types/src/external_api/views.rs +++ b/nexus/types/src/external_api/views.rs @@ -19,6 +19,7 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; use std::collections::BTreeSet; +use std::fmt; use std::net::IpAddr; use uuid::Uuid; @@ -418,15 +419,25 @@ pub struct Sled { pub baseboard: Baseboard, /// The rack to which this Sled is currently attached pub rack_id: Uuid, - /// The provision state of the sled. - pub provision_state: SledProvisionState, + /// The operator-defined policy of a sled. + pub policy: SledPolicy, + /// The provision policy of the sled. + /// + /// This used to be called `provision_state` but was renamed to + /// `provision_policy`, because it is operator-set similar to `policy`. The + /// serialization name `provision_state` has been retained for backwards + /// compatibility. + #[serde(rename = "provision_state")] + pub provision_policy: SledProvisionPolicy, + /// The current state Nexus believes the sled to be in. + pub state: SledState, /// The number of hardware threads which can execute on this sled pub usable_hardware_threads: u32, /// Amount of RAM which may be used by the Sled's OS pub usable_physical_ram: ByteCount, } -/// The provision state of a sled. +/// The operator-defined provision policy of a sled. /// /// This controls whether new resources are going to be provisioned on this /// sled. 
@@ -434,16 +445,108 @@ pub struct Sled { Copy, Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, )] #[serde(rename_all = "snake_case")] -pub enum SledProvisionState { +pub enum SledProvisionPolicy { /// New resources will be provisioned on this sled. Provisionable, - /// New resources will not be provisioned on this sled. However, existing - /// resources will continue to be on this sled unless manually migrated - /// off. + /// New resources will not be provisioned on this sled. However, if the + /// sled is currently in service, existing resources will continue to be on + /// this sled unless manually migrated off. NonProvisionable, } +impl SledProvisionPolicy { + /// Returns the opposite of the current provision policy. + pub fn invert(self) -> Self { + match self { + SledProvisionPolicy::Provisionable => { + SledProvisionPolicy::NonProvisionable + } + SledProvisionPolicy::NonProvisionable => { + SledProvisionPolicy::Provisionable + } + } + } +} + +/// The operator-defined policy of a sled. +#[derive( + Copy, Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, +)] +#[serde(rename_all = "snake_case")] +pub enum SledPolicy { + /// The operator has indicated that the sled is in-service. + InService { + /// Determines whether new resources can be provisioned onto the sled. + provision_policy: SledProvisionPolicy, + }, + + /// The operator has indicated that the sled has been permanently removed + /// from service. + /// + /// This is a terminal state: once a particular sled ID is expunged, it + /// will never return to service. (The actual hardware may be reused, but + /// it will be treated as a brand-new sled.) + /// + /// An expunged sled is always non-provisionable. + Expunged, +} + +impl SledPolicy { + /// Creates a new `SledPolicy` that is in-service and provisionable. 
+ pub fn provisionable() -> Self { + SledPolicy::InService { + provision_policy: SledProvisionPolicy::Provisionable, + } + } + + /// Returns true if the sled can be decommissioned in this state. + pub fn is_decommissionable(&self) -> bool { + // This should be kept in sync with `all_decommissionable` below. + match self { + SledPolicy::InService { .. } => false, + SledPolicy::Expunged => true, + } + } + + /// Returns all the possible policies a sled can have for it to be + /// decommissioned. + pub fn all_decommissionable() -> &'static [SledPolicy] { + &[SledPolicy::Expunged] + } +} + +impl fmt::Display for SledPolicy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + SledPolicy::InService { + provision_policy: SledProvisionPolicy::Provisionable, + } => write!(f, "in service"), + SledPolicy::InService { + provision_policy: SledProvisionPolicy::NonProvisionable, + } => write!(f, "in service (not provisionable)"), + SledPolicy::Expunged => write!(f, "expunged"), + } + } +} + +/// The current state of the sled, as determined by Nexus. +#[derive( + Copy, Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, +)] +#[serde(rename_all = "snake_case")] +pub enum SledState { + /// The sled is currently active, and has resources allocated on it. + Active, + + /// The sled has been permanently removed from service. + /// + /// This is a terminal state: once a particular sled ID is decommissioned, + /// it will never return to service. (The actual hardware may be reused, + /// but it will be treated as a brand-new sled.) 
+ Decommissioned, +} + /// An operator's view of an instance running on a given sled #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct SledInstance { diff --git a/openapi/nexus.json b/openapi/nexus.json index 8baf1a6316..2733290abb 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -4243,7 +4243,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledProvisionStateParams" + "$ref": "#/components/schemas/SledProvisionPolicyParams" } } }, @@ -4255,7 +4255,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledProvisionStateResponse" + "$ref": "#/components/schemas/SledProvisionPolicyResponse" } } } @@ -14842,7 +14842,7 @@ "description": "The provision state of the sled.", "allOf": [ { - "$ref": "#/components/schemas/SledProvisionState" + "$ref": "#/components/schemas/SledProvisionPolicy" } ] }, @@ -14970,7 +14970,7 @@ "items" ] }, - "SledProvisionState": { + "SledProvisionPolicy": { "description": "The provision state of a sled.\n\nThis controls whether new resources are going to be provisioned on this sled.", "oneOf": [ { @@ -14989,7 +14989,7 @@ } ] }, - "SledProvisionStateParams": { + "SledProvisionPolicyParams": { "description": "Parameters for `sled_set_provision_state`.", "type": "object", "properties": { @@ -14997,7 +14997,7 @@ "description": "The provision state.", "allOf": [ { - "$ref": "#/components/schemas/SledProvisionState" + "$ref": "#/components/schemas/SledProvisionPolicy" } ] } @@ -15006,7 +15006,7 @@ "state" ] }, - "SledProvisionStateResponse": { + "SledProvisionPolicyResponse": { "description": "Response to `sled_set_provision_state`.", "type": "object", "properties": { @@ -15014,7 +15014,7 @@ "description": "The new provision state.", "allOf": [ { - "$ref": "#/components/schemas/SledProvisionState" + "$ref": "#/components/schemas/SledProvisionPolicy" } ] }, @@ -15022,7 +15022,7 @@ "description": "The old provision state.", "allOf": [ { - "$ref": 
"#/components/schemas/SledProvisionState" + "$ref": "#/components/schemas/SledProvisionPolicy" } ] } @@ -17332,4 +17332,4 @@ } } ] -} \ No newline at end of file +} diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 103eb2e0c7..401cefbe72 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -73,11 +73,39 @@ CREATE TABLE IF NOT EXISTS omicron.public.rack ( * Sleds */ -CREATE TYPE IF NOT EXISTS omicron.public.sled_provision_state AS ENUM ( - -- New resources can be provisioned onto the sled - 'provisionable', - -- New resources must not be provisioned onto the sled - 'non_provisionable' +-- The disposition for a particular sled. This is updated solely by the +-- operator, and not by Nexus. +CREATE TYPE IF NOT EXISTS omicron.public.sled_policy AS ENUM ( + -- The sled is in service, and new resources can be provisioned onto it. + 'in_service', + -- The sled is in service, but the operator has indicated that new + -- resources should not be provisioned onto it. + 'in_service_no_provision', + -- The operator has marked that the sled has, or will be, removed from the + -- rack, and it should be assumed that any resources currently on it are + -- now permanently missing. + 'expunged' +); + +-- The actual state of the sled. This is updated exclusively by Nexus. +-- +-- Nexus's goal is to match the sled's state with the operator-indicated +-- policy. For example, if the sled_policy is "expunged" and the sled_state is +-- "active", Nexus will start removing zones from the sled, reallocating them +-- elsewhere, etc. Once that is done, Nexus will mark it as decommissioned. +CREATE TYPE IF NOT EXISTS omicron.public.sled_state AS ENUM ( + -- The sled has resources of any kind allocated on it, or, is available for + -- new resources. + -- + -- The sled can be in this state and have a different sled policy, e.g. + -- "expunged". + 'active', + + -- The sled no longer has resources allocated on it, now or in the future. 
+ -- + -- This is a terminal state. This state is only valid if the sled policy is + -- 'expunged'. + 'decommissioned' ); CREATE TABLE IF NOT EXISTS omicron.public.sled ( @@ -111,8 +139,11 @@ CREATE TABLE IF NOT EXISTS omicron.public.sled ( /* The last address allocated to a propolis instance on this sled. */ last_used_address INET NOT NULL, - /* The state of whether resources should be provisioned onto the sled */ - provision_state omicron.public.sled_provision_state NOT NULL, + /* The policy for the sled, updated exclusively by the operator */ + sled_policy omicron.public.sled_policy NOT NULL, + + /* The actual state of the sled, updated exclusively by Nexus */ + sled_state omicron.public.sled_state NOT NULL, -- This constraint should be upheld, even for deleted disks -- in the fleet.