diff --git a/Cargo.lock b/Cargo.lock
index ad0dc13987..f38eece4d5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6055,6 +6055,7 @@ dependencies = [
  "indicatif",
  "internal-dns",
  "ipnetwork",
+ "itertools 0.13.0",
  "multimap",
  "nexus-client",
  "nexus-config",
diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml
index 0990fdb11c..a92de1b6a9 100644
--- a/dev-tools/omdb/Cargo.toml
+++ b/dev-tools/omdb/Cargo.toml
@@ -28,6 +28,7 @@ gateway-messages.workspace = true
 gateway-test-utils.workspace = true
 humantime.workspace = true
 internal-dns.workspace = true
+itertools.workspace = true
 nexus-client.workspace = true
 nexus-config.workspace = true
 nexus-db-model.workspace = true
diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs
index 1b6d2469f4..67a4180dd2 100644
--- a/dev-tools/omdb/src/bin/omdb/nexus.rs
+++ b/dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -19,6 +19,7 @@ use clap::Subcommand;
 use clap::ValueEnum;
 use futures::future::try_join;
 use futures::TryStreamExt;
+use itertools::Itertools;
 use nexus_client::types::ActivationReason;
 use nexus_client::types::BackgroundTask;
 use nexus_client::types::BackgroundTasksActivateRequest;
@@ -46,6 +47,7 @@ use reedline::Reedline;
 use serde::Deserialize;
 use slog_error_chain::InlineErrorChain;
 use std::collections::BTreeMap;
+use std::collections::BTreeSet;
 use std::str::FromStr;
 use tabled::Tabled;
 use uuid::Uuid;
@@ -93,11 +95,21 @@ enum BackgroundTasksCommands {
     /// Print a summary of the status of all background tasks
     List,
     /// Print human-readable summary of the status of each background task
-    Show,
+    Show(BackgroundTasksShowArgs),
     /// Activate one or more background tasks
     Activate(BackgroundTasksActivateArgs),
 }
 
+#[derive(Debug, Args)]
+struct BackgroundTasksShowArgs {
+    /// Names of background tasks to show (default: all)
+    ///
+    /// You can use any background task name here or one of the special strings
+    /// "all", "dns_external", or "dns_internal".
+    #[clap(value_name = "TASK_NAME")]
+    tasks: Vec<String>,
+}
+
 #[derive(Debug, Args)]
 struct BackgroundTasksActivateArgs {
     /// Name of the background tasks to activate
@@ -361,8 +373,8 @@ impl NexusArgs {
                 command: BackgroundTasksCommands::List,
             }) => cmd_nexus_background_tasks_list(&client).await,
             NexusCommands::BackgroundTasks(BackgroundTasksArgs {
-                command: BackgroundTasksCommands::Show,
-            }) => cmd_nexus_background_tasks_show(&client).await,
+                command: BackgroundTasksCommands::Show(args),
+            }) => cmd_nexus_background_tasks_show(&client, args).await,
             NexusCommands::BackgroundTasks(BackgroundTasksArgs {
                 command: BackgroundTasksCommands::Activate(args),
             }) => {
@@ -523,7 +535,9 @@ async fn cmd_nexus_background_tasks_list(
 ) -> Result<(), anyhow::Error> {
     let response =
         client.bgtask_list().await.context("listing background tasks")?;
-    let tasks = response.into_inner();
+    // Convert the HashMap to a BTreeMap because we want the keys in sorted
+    // order.
+    let tasks = response.into_inner().into_iter().collect::<BTreeMap<_, _>>();
     let table_rows = tasks.values().map(BackgroundTaskStatusRow::from);
     let table = tabled::Table::new(table_rows)
         .with(tabled::settings::Style::empty())
@@ -536,6 +550,7 @@ async fn cmd_nexus_background_tasks_list(
 /// Runs `omdb nexus background-tasks show`
 async fn cmd_nexus_background_tasks_show(
     client: &nexus_client::Client,
+    args: &BackgroundTasksShowArgs,
 ) -> Result<(), anyhow::Error> {
     let response =
         client.bgtask_list().await.context("listing background tasks")?;
@@ -544,8 +559,50 @@ async fn cmd_nexus_background_tasks_show(
     let mut tasks =
         response.into_inner().into_iter().collect::<BTreeMap<_, _>>();
 
-    // We want to pick the order that we print some tasks intentionally. Then
-    // we want to print anything else that we find.
+    // Now, pick out the tasks that the user selected.
+    //
+    // The set of user tasks may include:
+    //
+    // - nothing at all, in which case we include all tasks
+    // - individual task names
+    // - certain groups that we recognize, like "dns_external" for all the tasks
+    //   related to external DNS propagation. "all" means "all tasks".
+    let selected_set: BTreeSet<_> =
+        args.tasks.iter().map(AsRef::as_ref).collect();
+    let selected_all = selected_set.is_empty() || selected_set.contains("all");
+    if !selected_all {
+        for s in &selected_set {
+            if !tasks.contains_key(*s)
+                && *s != "all"
+                && *s != "dns_external"
+                && *s != "dns_internal"
+            {
+                bail!(
+                    "unknown task name: {:?} (known task names: all, \
+                    dns_external, dns_internal, {})",
+                    s,
+                    tasks.keys().join(", ")
+                );
+            }
+        }
+
+        tasks.retain(|k, _| {
+            selected_set.contains(k.as_str())
+                || selected_set.contains("all")
+                || (selected_set.contains("dns_external")
+                    && k.starts_with("dns_")
+                    && k.ends_with("_external"))
+                || (selected_set.contains("dns_internal")
+                    && k.starts_with("dns_")
+                    && k.ends_with("_internal"))
+        });
+    }
+
+    // Some tasks should be grouped and printed together in a certain order,
+    // even though their names aren't alphabetical. Notably, the DNS tasks
+    // logically go from config -> servers -> propagation, so we want to print
+    // them in that order. So we pick these out first and then print anything
+    // else that we find in alphabetical order.
for name in [ "dns_config_internal", "dns_servers_internal", @@ -559,7 +616,7 @@ async fn cmd_nexus_background_tasks_show( ] { if let Some(bgtask) = tasks.remove(name) { print_task(&bgtask); - } else { + } else if selected_all { eprintln!("warning: expected to find background task {:?}", name); } } diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 166936da9c..19c555ec96 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -632,6 +632,397 @@ task: "vpc_route_manager" started at (s ago) and ran for ms warning: unknown background task: "vpc_route_manager" (don't know how to interpret details: Object {}) +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "saga_recovery"] +termination: Exited(0) +--------------------------------------------- +stdout: +task: "saga_recovery" + configured period: every 10m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + since Nexus started: + sagas recovered: 0 + sagas recovery errors: 0 + sagas observed started: 0 + sagas inferred finished: 0 + missing from SEC: 0 + bad state in SEC: 0 + last pass: + found sagas: 0 (in-progress, assigned to this Nexus) + recovered: 0 (successfully) + failed: 0 + skipped: 0 (already running) + removed: 0 (newly finished) + no recovered sagas + no saga recovery failures + +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "blueprint_loader", "blueprint_executor"] +termination: Exited(0) +--------------------------------------------- +stdout: +task: "blueprint_loader" + configured period: every 1m s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: failed to read target blueprint: Internal Error: no target blueprint set + +task: "blueprint_executor" + configured period: every 10m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: no blueprint + +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "dns_internal"] +termination: Exited(0) +--------------------------------------------- +stdout: +task: "dns_config_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + last generation found: 1 + +task: "dns_servers_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + servers found: 1 + + DNS_SERVER_ADDR + [::1]:REDACTED_PORT + +task: "dns_propagation_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a dependent task completing + started at (s ago) and ran for ms + attempt to propagate generation: 1 + + DNS_SERVER_ADDR LAST_RESULT + [::1]:REDACTED_PORT success + + 
+--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "dns_external"] +termination: Exited(0) +--------------------------------------------- +stdout: +task: "dns_config_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + last generation found: 2 + +task: "dns_servers_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + servers found: 1 + + DNS_SERVER_ADDR + [::1]:REDACTED_PORT + +task: "dns_propagation_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a dependent task completing + started at (s ago) and ran for ms + attempt to propagate generation: 2 + + DNS_SERVER_ADDR LAST_RESULT + [::1]:REDACTED_PORT success + + +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "all"] +termination: Exited(0) +--------------------------------------------- +stdout: +task: "dns_config_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + last generation found: 1 + +task: "dns_servers_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + servers found: 1 + + DNS_SERVER_ADDR + [::1]:REDACTED_PORT + +task: "dns_propagation_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a dependent task completing + started at (s ago) and ran for ms + attempt to propagate generation: 1 + + DNS_SERVER_ADDR LAST_RESULT + [::1]:REDACTED_PORT success + + +task: "dns_config_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + last generation found: 2 + +task: "dns_servers_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + servers found: 1 + + DNS_SERVER_ADDR + [::1]:REDACTED_PORT + +task: "dns_propagation_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a dependent task completing + started at (s ago) and ran for ms + attempt to propagate generation: 2 + + DNS_SERVER_ADDR LAST_RESULT + [::1]:REDACTED_PORT success + + +task: "nat_v4_garbage_collector" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: failed to resolve addresses for Dendrite services: no record found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + +task: "blueprint_loader" + configured period: every 1m s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: failed to read 
target blueprint: Internal Error: no target blueprint set + +task: "blueprint_executor" + configured period: every 10m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: no blueprint + +task: "abandoned_vmm_reaper" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total abandoned VMMs found: 0 + VMM records deleted: 0 + VMM records already deleted by another Nexus: 0 + sled resource reservations deleted: 0 + +task: "bfd_manager" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: failed to resolve addresses for Dendrite services: no record found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + +task: "crdb_node_id_collector" + configured period: every 10m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: no blueprint + +task: "decommissioned_disk_cleaner" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms +warning: unknown background task: "decommissioned_disk_cleaner" (don't know how to interpret details: Object {"deleted": Number(0), "error": Null, "error_count": Number(0), "found": Number(0), "not_ready_to_be_deleted": Number(0)}) + +task: "external_endpoints" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + external API endpoints: 2 ('*' below marks default) + + SILO_ID DNS_NAME + ..................... default-silo.sys.oxide-dev.test + * ..................... test-suite-silo.sys.oxide-dev.test + + warnings: 2 + warning: silo ..................... with DNS name "default-silo.sys.oxide-dev.test" has no usable certificates + warning: silo ..................... with DNS name "test-suite-silo.sys.oxide-dev.test" has no usable certificates + + TLS certificates: 0 + +task: "instance_updater" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total instances in need of updates: 0 + instances with destroyed active VMMs: 0 + instances with terminated active migrations: 0 + update sagas started: 0 + update sagas completed successfully: 0 + +task: "instance_watcher" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total instances checked: 0 + checks completed: 0 + successful checks: 0 + update sagas queued: 0 + failed checks: 0 + checks that could not be completed: 0 + stale instance metrics pruned: 0 + +task: "inventory_collection" + configured period: every 10m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + last collection id: ..................... 
+ last collection started: + last collection done: + +task: "lookup_region_port" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total filled in ports: 0 + errors: 0 + +task: "metrics_producer_gc" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms +warning: unknown background task: "metrics_producer_gc" (don't know how to interpret details: Object {"expiration": String(""), "pruned": Array []}) + +task: "phantom_disks" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + number of phantom disks deleted: 0 + number of phantom disk delete errors: 0 + +task: "physical_disk_adoption" + configured period: every s + currently executing: no + last completed activation: , triggered by a dependent task completing + started at (s ago) and ran for ms + last completion reported error: task disabled + +task: "region_replacement" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + number of region replacements started ok: 0 + number of region replacement start errors: 0 + +task: "region_replacement_driver" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + number of region replacement drive sagas started ok: 0 + number of region replacement finish sagas started ok: 0 + number of errors: 0 + +task: "saga_recovery" + configured period: every 10m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + since Nexus started: + sagas recovered: 0 + sagas recovery errors: 0 + sagas observed started: 0 + sagas inferred finished: 0 + missing from SEC: 0 + bad state in SEC: 0 + last pass: + found sagas: 0 (in-progress, assigned to this Nexus) + recovered: 0 (successfully) + failed: 0 + skipped: 0 (already running) + removed: 0 (newly finished) + no recovered sagas + no saga recovery failures + +task: "service_firewall_rule_propagation" + configured period: every 5m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + +task: "service_zone_nat_tracker" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: inventory collection is None + +task: "switch_port_config_manager" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms +warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {}) + +task: "v2p_manager" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms +warning: unknown background task: "v2p_manager" (don't know how to interpret details: Object {}) + +task: "vpc_route_manager" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and 
ran for ms +warning: unknown background task: "vpc_route_manager" (don't know how to interpret details: Object {}) + --------------------------------------------- stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index d0258aeaed..45492c14ce 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -80,6 +80,7 @@ async fn test_omdb_usage_errors() { &["mgs"], &["nexus"], &["nexus", "background-tasks"], + &["nexus", "background-tasks", "show", "--help"], &["nexus", "blueprints"], &["nexus", "sagas"], // Missing "--destructive" flag. The URL is bogus but just ensures that @@ -144,6 +145,19 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { &["mgs", "inventory"], &["nexus", "background-tasks", "doc"], &["nexus", "background-tasks", "show"], + // background tasks: test picking out specific names + &["nexus", "background-tasks", "show", "saga_recovery"], + &[ + "nexus", + "background-tasks", + "show", + "blueprint_loader", + "blueprint_executor", + ], + // background tasks: test recognized group names + &["nexus", "background-tasks", "show", "dns_internal"], + &["nexus", "background-tasks", "show", "dns_external"], + &["nexus", "background-tasks", "show", "all"], &["nexus", "sagas", "list"], &["--destructive", "nexus", "sagas", "demo-create"], &["nexus", "sagas", "list"], diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 1ee07410bf..55781136b6 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -491,6 +491,46 @@ Connection Options: Safety Options: -w, --destructive Allow potentially-destructive subcommands ============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "--help"] +termination: Exited(0) +--------------------------------------------- +stdout: +Print human-readable summary of the status of each background task + +Usage: omdb nexus background-tasks show [OPTIONS] [TASK_NAME]... + +Arguments: + [TASK_NAME]... + Names of background tasks to show (default: all) + + You can use any background task name here or one of the special strings "all", + "dns_external", or "dns_internal". + +Options: + --log-level + log level filter + + [env: LOG_LEVEL=] + [default: warn] + + -h, --help + Print help (see a summary with '-h') + +Connection Options: + --nexus-internal-url + URL of the Nexus internal API + + [env: OMDB_NEXUS_URL=] + + --dns-server + [env: OMDB_DNS_SERVER=] + +Safety Options: + -w, --destructive + Allow potentially-destructive subcommands +--------------------------------------------- +stderr: +============================================= EXECUTING COMMAND: omdb ["nexus", "blueprints"] termination: Exited(2) ---------------------------------------------
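
Note (not part of the diff above): a minimal, self-contained sketch of the
task-selection rule that the new `cmd_nexus_background_tasks_show` code
applies: an empty selection or "all" keeps every task; otherwise a name must
match a task exactly or expand via the "dns_internal"/"dns_external" group
names. The `select_tasks` helper and the task-name list below are
illustrative only, not part of omdb.

use std::collections::BTreeSet;

/// Returns the subset of `all_tasks` matching `selected`, where each selected
/// name may be a task name, "all", or a group name like "dns_internal".
fn select_tasks<'a>(
    all_tasks: &BTreeSet<&'a str>,
    selected: &BTreeSet<&str>,
) -> BTreeSet<&'a str> {
    // No selection (or an explicit "all") means every task is shown.
    if selected.is_empty() || selected.contains("all") {
        return all_tasks.clone();
    }
    all_tasks
        .iter()
        .copied()
        .filter(|k| {
            selected.contains(*k)
                || (selected.contains("dns_external")
                    && k.starts_with("dns_")
                    && k.ends_with("_external"))
                || (selected.contains("dns_internal")
                    && k.starts_with("dns_")
                    && k.ends_with("_internal"))
        })
        .collect()
}

fn main() {
    let all: BTreeSet<&str> = [
        "dns_config_internal",
        "dns_servers_internal",
        "dns_propagation_internal",
        "dns_config_external",
        "saga_recovery",
    ]
    .into_iter()
    .collect();
    // Mirrors `omdb nexus background-tasks show dns_internal saga_recovery`:
    // the three internal-DNS tasks plus saga_recovery are kept.
    let selected = ["dns_internal", "saga_recovery"].into_iter().collect();
    let picked = select_tasks(&all, &selected);
    assert_eq!(picked.len(), 4);
    println!("{picked:?}");
}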