diff --git a/Cargo.lock b/Cargo.lock index b40855b336..255433250e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5311,6 +5311,7 @@ dependencies = [ "oximeter-client", "pq-sys", "ratatui", + "reedline", "serde", "serde_json", "sled-agent-client", diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index 1b697561a9..813a4b9552 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -36,6 +36,7 @@ oximeter-client.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. pq-sys = "*" ratatui.workspace = true +reedline.workspace = true serde.workspace = true serde_json.workspace = true sled-agent-client.workspace = true diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 1b99a815ef..cc498d0ad1 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -148,9 +148,8 @@ impl Display for MaybeSledId { #[derive(Debug, Args)] pub struct DbArgs { - /// URL of the database SQL interface - #[clap(long, env("OMDB_DB_URL"))] - db_url: Option, + #[clap(flatten)] + db_url_opts: DbUrlOptions, #[clap(flatten)] fetch_opts: DbFetchOptions, @@ -159,6 +158,71 @@ pub struct DbArgs { command: DbCommands, } +#[derive(Debug, Args)] +pub struct DbUrlOptions { + /// URL of the database SQL interface + #[clap(long, env("OMDB_DB_URL"))] + db_url: Option, +} + +impl DbUrlOptions { + async fn resolve_pg_url( + &self, + omdb: &Omdb, + log: &slog::Logger, + ) -> anyhow::Result { + match &self.db_url { + Some(cli_or_env_url) => Ok(cli_or_env_url.clone()), + None => { + eprintln!( + "note: database URL not specified. Will search DNS." + ); + eprintln!("note: (override with --db-url or OMDB_DB_URL)"); + let addrs = omdb + .dns_lookup_all( + log.clone(), + internal_dns::ServiceName::Cockroach, + ) + .await?; + + format!( + "postgresql://root@{}/omicron?sslmode=disable", + addrs + .into_iter() + .map(|a| a.to_string()) + .collect::>() + .join(",") + ) + .parse() + .context("failed to parse constructed postgres URL") + } + } + } + + pub async fn connect( + &self, + omdb: &Omdb, + log: &slog::Logger, + ) -> anyhow::Result> { + let db_url = self.resolve_pg_url(omdb, log).await?; + eprintln!("note: using database URL {}", &db_url); + + let db_config = db::Config { url: db_url.clone() }; + let pool = Arc::new(db::Pool::new(&log.clone(), &db_config)); + + // Being a dev tool, we want to try this operation even if the schema + // doesn't match what we expect. So we use `DataStore::new_unchecked()` + // here. We will then check the schema version explicitly and warn the + // user if it doesn't match. + let datastore = Arc::new( + DataStore::new_unchecked(log.clone(), pool) + .map_err(|e| anyhow!(e).context("creating datastore"))?, + ); + check_schema_version(&datastore).await; + Ok(datastore) + } +} + #[derive(Debug, Args)] pub struct DbFetchOptions { /// limit to apply to queries that fetch rows @@ -405,47 +469,7 @@ impl DbArgs { omdb: &Omdb, log: &slog::Logger, ) -> Result<(), anyhow::Error> { - let db_url = match &self.db_url { - Some(cli_or_env_url) => cli_or_env_url.clone(), - None => { - eprintln!( - "note: database URL not specified. Will search DNS." - ); - eprintln!("note: (override with --db-url or OMDB_DB_URL)"); - let addrs = omdb - .dns_lookup_all( - log.clone(), - internal_dns::ServiceName::Cockroach, - ) - .await?; - - format!( - "postgresql://root@{}/omicron?sslmode=disable", - addrs - .into_iter() - .map(|a| a.to_string()) - .collect::>() - .join(",") - ) - .parse() - .context("failed to parse constructed postgres URL")? - } - }; - eprintln!("note: using database URL {}", &db_url); - - let db_config = db::Config { url: db_url.clone() }; - let pool = Arc::new(db::Pool::new(&log.clone(), &db_config)); - - // Being a dev tool, we want to try this operation even if the schema - // doesn't match what we expect. So we use `DataStore::new_unchecked()` - // here. We will then check the schema version explicitly and warn the - // user if it doesn't match. - let datastore = Arc::new( - DataStore::new_unchecked(log.clone(), pool) - .map_err(|e| anyhow!(e).context("creating datastore"))?, - ); - check_schema_version(&datastore).await; - + let datastore = self.db_url_opts.connect(omdb, log).await?; let opctx = OpContext::for_tests(log.clone(), datastore.clone()); match &self.command { DbCommands::Rack(RackArgs { command: RackCommands::List }) => { diff --git a/dev-tools/omdb/src/bin/omdb/main.rs b/dev-tools/omdb/src/bin/omdb/main.rs index 0fb9ba0121..17de22c2fa 100644 --- a/dev-tools/omdb/src/bin/omdb/main.rs +++ b/dev-tools/omdb/src/bin/omdb/main.rs @@ -92,16 +92,29 @@ struct Omdb { command: OmdbCommands, } -impl Omdb { - fn check_allow_destructive(&self) -> anyhow::Result<()> { - anyhow::ensure!( - self.allow_destructive, - "This command is potentially destructive. \ - Pass the `-w` / `--destructive` flag to allow it." - ); - Ok(()) +mod check_allow_destructive { + /// Zero-size type that potentially-destructive functions can accept to + /// ensure `Omdb::check_allow_destructive` has been called. + // This is tucked away inside a module to prevent it from being constructed + // by anything other than `Omdb::check_allow_destructive`. + #[must_use] + pub(crate) struct DestructiveOperationToken(()); + + impl super::Omdb { + pub(crate) fn check_allow_destructive( + &self, + ) -> anyhow::Result { + anyhow::ensure!( + self.allow_destructive, + "This command is potentially destructive. \ + Pass the `-w` / `--destructive` flag to allow it." + ); + Ok(DestructiveOperationToken(())) + } } +} +impl Omdb { async fn dns_lookup_all( &self, log: slog::Logger, diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 4705da2a32..ecf2a8072d 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -4,7 +4,10 @@ //! omdb commands that query or update specific Nexus instances +use crate::check_allow_destructive::DestructiveOperationToken; +use crate::db::DbUrlOptions; use crate::Omdb; +use anyhow::bail; use anyhow::Context; use chrono::DateTime; use chrono::SecondsFormat; @@ -17,7 +20,13 @@ use nexus_client::types::ActivationReason; use nexus_client::types::BackgroundTask; use nexus_client::types::CurrentStatus; use nexus_client::types::LastResult; +use nexus_client::types::SledSelector; use nexus_client::types::UninitializedSledId; +use nexus_db_queries::db::lookup::LookupPath; +use nexus_types::inventory::BaseboardId; +use reedline::DefaultPrompt; +use reedline::DefaultPromptSegment; +use reedline::Reedline; use serde::Deserialize; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; @@ -36,6 +45,7 @@ pub struct NexusArgs { } /// Subcommands for the "omdb nexus" subcommand +#[allow(clippy::large_enum_variant)] #[derive(Debug, Subcommand)] enum NexusCommands { /// print information about background tasks @@ -145,11 +155,14 @@ struct SledsArgs { } #[derive(Debug, Subcommand)] +#[allow(clippy::large_enum_variant)] enum SledsCommands { /// List all uninitialized sleds ListUninitialized, /// Add an uninitialized sled Add(SledAddArgs), + /// Expunge a sled (DANGEROUS) + Expunge(SledExpungeArgs), } #[derive(Debug, Args)] @@ -160,6 +173,17 @@ struct SledAddArgs { part: String, } +#[derive(Debug, Args)] +struct SledExpungeArgs { + // expunge is _extremely_ dangerous, so we also require a database + // connection to perform some safety checks + #[clap(flatten)] + db_url_opts: DbUrlOptions, + + /// sled ID + sled_id: Uuid, +} + impl NexusArgs { /// Run a `omdb nexus` subcommand. pub(crate) async fn run_cmd( @@ -212,8 +236,8 @@ impl NexusArgs { NexusCommands::Blueprints(BlueprintsArgs { command: BlueprintsCommands::Delete(args), }) => { - omdb.check_allow_destructive()?; - cmd_nexus_blueprints_delete(&client, args).await + let token = omdb.check_allow_destructive()?; + cmd_nexus_blueprints_delete(&client, args, token).await } NexusCommands::Blueprints(BlueprintsArgs { command: @@ -227,21 +251,23 @@ impl NexusArgs { command: BlueprintTargetCommands::Set(args), }), }) => { - omdb.check_allow_destructive()?; - cmd_nexus_blueprints_target_set(&client, args).await + let token = omdb.check_allow_destructive()?; + cmd_nexus_blueprints_target_set(&client, args, token).await } NexusCommands::Blueprints(BlueprintsArgs { command: BlueprintsCommands::Regenerate, }) => { - omdb.check_allow_destructive()?; - cmd_nexus_blueprints_regenerate(&client).await + let token = omdb.check_allow_destructive()?; + cmd_nexus_blueprints_regenerate(&client, token).await } NexusCommands::Blueprints(BlueprintsArgs { command: BlueprintsCommands::GenerateFromCollection(args), }) => { - omdb.check_allow_destructive()?; - cmd_nexus_blueprints_generate_from_collection(&client, args) - .await + let token = omdb.check_allow_destructive()?; + cmd_nexus_blueprints_generate_from_collection( + &client, args, token, + ) + .await } NexusCommands::Sleds(SledsArgs { @@ -250,8 +276,14 @@ impl NexusArgs { NexusCommands::Sleds(SledsArgs { command: SledsCommands::Add(args), }) => { - omdb.check_allow_destructive()?; - cmd_nexus_sled_add(&client, args).await + let token = omdb.check_allow_destructive()?; + cmd_nexus_sled_add(&client, args, token).await + } + NexusCommands::Sleds(SledsArgs { + command: SledsCommands::Expunge(args), + }) => { + let token = omdb.check_allow_destructive()?; + cmd_nexus_sled_expunge(&client, args, omdb, log, token).await } } } @@ -909,6 +941,7 @@ async fn cmd_nexus_blueprints_diff( async fn cmd_nexus_blueprints_delete( client: &nexus_client::Client, args: &BlueprintIdArgs, + _destruction_token: DestructiveOperationToken, ) -> Result<(), anyhow::Error> { let _ = client .blueprint_delete(&args.blueprint_id) @@ -934,6 +967,7 @@ async fn cmd_nexus_blueprints_target_show( async fn cmd_nexus_blueprints_target_set( client: &nexus_client::Client, args: &BlueprintTargetSetArgs, + _destruction_token: DestructiveOperationToken, ) -> Result<(), anyhow::Error> { let enabled = match args.enabled { BlueprintTargetSetEnabled::Enabled => true, @@ -969,6 +1003,7 @@ async fn cmd_nexus_blueprints_target_set( async fn cmd_nexus_blueprints_generate_from_collection( client: &nexus_client::Client, args: &CollectionIdArgs, + _destruction_token: DestructiveOperationToken, ) -> Result<(), anyhow::Error> { let blueprint = client .blueprint_generate_from_collection( @@ -984,6 +1019,7 @@ async fn cmd_nexus_blueprints_generate_from_collection( async fn cmd_nexus_blueprints_regenerate( client: &nexus_client::Client, + _destruction_token: DestructiveOperationToken, ) -> Result<(), anyhow::Error> { let blueprint = client.blueprint_regenerate().await.context("generating blueprint")?; @@ -1037,6 +1073,7 @@ async fn cmd_nexus_sleds_list_uninitialized( async fn cmd_nexus_sled_add( client: &nexus_client::Client, args: &SledAddArgs, + _destruction_token: DestructiveOperationToken, ) -> Result<(), anyhow::Error> { client .sled_add(&UninitializedSledId { @@ -1048,3 +1085,115 @@ async fn cmd_nexus_sled_add( eprintln!("added sled {} ({})", args.serial, args.part); Ok(()) } + +/// Runs `omdb nexus sleds expunge` +async fn cmd_nexus_sled_expunge( + client: &nexus_client::Client, + args: &SledExpungeArgs, + omdb: &Omdb, + log: &slog::Logger, + _destruction_token: DestructiveOperationToken, +) -> Result<(), anyhow::Error> { + // This is an extremely dangerous and irreversible operation. We put a + // couple of safeguards in place to ensure this cannot be called without + // due consideration: + // + // 1. We'll require manual input on stdin to confirm the sled to be removed + // 2. We'll warn sternly if the sled-to-be-expunged is still present in the + // most recent inventory collection + use nexus_db_queries::context::OpContext; + + let datastore = args.db_url_opts.connect(omdb, log).await?; + let opctx = OpContext::for_tests(log.clone(), datastore.clone()); + let opctx = &opctx; + + // First, we need to look up the sled so we know its serial number. + let (_authz_sled, sled) = LookupPath::new(opctx, &datastore) + .sled_id(args.sled_id) + .fetch() + .await + .with_context(|| format!("failed to find sled {}", args.sled_id))?; + + // Helper to get confirmation messages from the user. + let mut line_editor = Reedline::create(); + let mut read_with_prompt = move |message: &str| { + let prompt = DefaultPrompt::new( + DefaultPromptSegment::Basic(message.to_string()), + DefaultPromptSegment::Empty, + ); + if let Ok(reedline::Signal::Success(input)) = + line_editor.read_line(&prompt) + { + Ok(input) + } else { + bail!("expungement aborted") + } + }; + + // Now check whether its sled-agent or SP were found in the most recent + // inventory collection. + match datastore + .inventory_get_latest_collection(opctx) + .await + .context("loading latest collection")? + { + Some(collection) => { + let sled_baseboard = BaseboardId { + part_number: sled.part_number().to_string(), + serial_number: sled.serial_number().to_string(), + }; + // Check the collection for either the sled-id or the baseboard. In + // general a sled-agent's self report should result in both the + // sled-id and the baseboard being present in the collection, but + // there are some test environments (e.g., `omicron-dev run-all`) + // where that isn't guaranteed. + let sled_present_in_collection = + collection.sled_agents.contains_key(&args.sled_id) + || collection.baseboards.contains(&sled_baseboard); + if sled_present_in_collection { + eprintln!( + "WARNING: sled {} is PRESENT in the most recent inventory \ + collection (spotted at {}). It is dangerous to expunge a \ + sled that is still running. Are you sure you want to \ + proceed anyway?", + args.sled_id, collection.time_done, + ); + let confirm = read_with_prompt("y/N")?; + if confirm != "y" { + eprintln!("expungement not confirmed: aborting"); + return Ok(()); + } + } + } + None => { + eprintln!( + "WARNING: cannot verify that the sled is physically gone \ + because there are no inventory collections present. Please \ + make sure that the sled has been physically removed." + ); + } + } + + eprintln!( + "WARNING: This operation will PERMANENTLY and IRRECOVABLY mark sled \ + {} ({}) expunged. To proceed, type the sled's serial number.", + args.sled_id, + sled.serial_number(), + ); + let confirm = read_with_prompt("sled serial number")?; + if confirm != sled.serial_number() { + eprintln!("sled serial number not confirmed: aborting"); + return Ok(()); + } + + let old_policy = client + .sled_expunge(&SledSelector { sled: args.sled_id }) + .await + .context("expunging sled")? + .into_inner(); + eprintln!( + "expunged sled {} (previous policy: {old_policy:?})", + args.sled_id + ); + Ok(()) +} diff --git a/nexus/db-model/src/sled.rs b/nexus/db-model/src/sled.rs index c2cbb65bd6..a603f28d57 100644 --- a/nexus/db-model/src/sled.rs +++ b/nexus/db-model/src/sled.rs @@ -100,6 +100,10 @@ impl Sled { &self.serial_number } + pub fn part_number(&self) -> &str { + &self.part_number + } + /// The policy here is the `views::SledPolicy` because we expect external /// users to always use that. pub fn policy(&self) -> views::SledPolicy { diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index aad6e1122f..4ad8dc36ce 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -14,6 +14,7 @@ use nexus_db_queries::db; use nexus_db_queries::db::lookup; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::model::DatasetKind; +use nexus_types::external_api::views::SledPolicy; use nexus_types::external_api::views::SledProvisionPolicy; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; @@ -73,6 +74,23 @@ impl super::Nexus { Ok(()) } + /// Mark a sled as expunged + /// + /// This is an irreversible process! It should only be called after + /// sufficient warning to the operator. + /// + /// This is idempotent, and it returns the old policy of the sled. + pub(crate) async fn sled_expunge( + &self, + opctx: &OpContext, + sled_id: Uuid, + ) -> Result { + let sled_lookup = self.sled_lookup(opctx, &sled_id)?; + let (authz_sled,) = + sled_lookup.lookup_for(authz::Action::Modify).await?; + self.db_datastore.sled_set_policy_to_expunged(opctx, &authz_sled).await + } + pub(crate) async fn sled_request_firewall_rules( &self, opctx: &OpContext, diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 7f8211dc8e..d6649cd944 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -30,8 +30,10 @@ use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintMetadata; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::BlueprintTargetSet; +use nexus_types::external_api::params::SledSelector; use nexus_types::external_api::params::UninitializedSledId; use nexus_types::external_api::shared::UninitializedSled; +use nexus_types::external_api::views::SledPolicy; use nexus_types::internal_api::params::SwitchPutRequest; use nexus_types::internal_api::params::SwitchPutResponse; use nexus_types::internal_api::views::to_list; @@ -94,6 +96,7 @@ pub(crate) fn internal_api() -> NexusApiDescription { api.register(sled_list_uninitialized)?; api.register(sled_add)?; + api.register(sled_expunge)?; api.register(probes_get)?; @@ -868,6 +871,31 @@ async fn sled_add( apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Mark a sled as expunged +/// +/// This is an irreversible process! It should only be called after +/// sufficient warning to the operator. +/// +/// This is idempotent, and it returns the old policy of the sled. +#[endpoint { + method = POST, + path = "/sleds/expunge", +}] +async fn sled_expunge( + rqctx: RequestContext>, + sled: TypedBody, +) -> Result, HttpError> { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + let previous_policy = + nexus.sled_expunge(&opctx, sled.into_inner().sled).await?; + Ok(HttpResponseOk(previous_policy)) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + /// Path parameters for probes #[derive(Deserialize, JsonSchema)] struct ProbePathParam { diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index c6ae47d27c..490661a819 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -92,6 +92,7 @@ id_path_param!(SwitchPath, switch_id, "switch"); // Internal API parameters id_path_param!(BlueprintPath, blueprint_id, "blueprint"); +#[derive(Clone, Copy, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] pub struct SledSelector { /// ID of the sled pub sled: Uuid, diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 9e18c7d6bc..cc831fff73 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -1047,6 +1047,41 @@ } } }, + "/sleds/expunge": { + "post": { + "summary": "Mark a sled as expunged", + "description": "This is an irreversible process! It should only be called after sufficient warning to the operator.\nThis is idempotent, and it returns the old policy of the sled.", + "operationId": "sled_expunge", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledSelector" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledPolicy" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/sleds/uninitialized": { "get": { "summary": "List uninitialized sleds", @@ -6701,6 +6736,69 @@ "vmm_state" ] }, + "SledPolicy": { + "description": "The operator-defined policy of a sled.", + "oneOf": [ + { + "description": "The operator has indicated that the sled is in-service.", + "type": "object", + "properties": { + "kind": { + "type": "string", + "enum": [ + "in_service" + ] + }, + "provision_policy": { + "description": "Determines whether new resources can be provisioned onto the sled.", + "allOf": [ + { + "$ref": "#/components/schemas/SledProvisionPolicy" + } + ] + } + }, + "required": [ + "kind", + "provision_policy" + ] + }, + { + "description": "The operator has indicated that the sled has been permanently removed from service.\n\nThis is a terminal state: once a particular sled ID is expunged, it will never return to service. (The actual hardware may be reused, but it will be treated as a brand-new sled.)\n\nAn expunged sled is always non-provisionable.", + "type": "object", + "properties": { + "kind": { + "type": "string", + "enum": [ + "expunged" + ] + } + }, + "required": [ + "kind" + ] + } + ] + }, + "SledProvisionPolicy": { + "description": "The operator-defined provision policy of a sled.\n\nThis controls whether new resources are going to be provisioned on this sled.", + "oneOf": [ + { + "description": "New resources will be provisioned on this sled.", + "type": "string", + "enum": [ + "provisionable" + ] + }, + { + "description": "New resources will not be provisioned on this sled. However, if the sled is currently in service, existing resources will continue to be on this sled unless manually migrated off.", + "type": "string", + "enum": [ + "non_provisionable" + ] + } + ] + }, "SledRole": { "description": "Describes the role of the sled within the rack.\n\nNote that this may change if the sled is physically moved within the rack.", "oneOf": [ @@ -6720,6 +6818,19 @@ } ] }, + "SledSelector": { + "type": "object", + "properties": { + "sled": { + "description": "ID of the sled", + "type": "string", + "format": "uuid" + } + }, + "required": [ + "sled" + ] + }, "SourceNatConfig": { "description": "An IP address and port range used for source NAT, i.e., making outbound network connections from guests or services.", "type": "object",