From aceb7444938afb60f01fdf5ddeb05fd23349674d Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 13 Sep 2023 22:01:25 -0700 Subject: [PATCH] prototype Omicron debugger (#4084) --- Cargo.lock | 65 +++ Cargo.toml | 6 +- common/src/api/external/mod.rs | 1 + dev-tools/src/bin/omicron-dev.rs | 2 +- .../output/cmd-omicron-dev-noargs-stderr | 2 +- nexus-client/src/lib.rs | 6 + nexus/Cargo.toml | 2 +- nexus/db-model/Cargo.toml | 2 +- nexus/db-model/src/service_kind.rs | 3 +- nexus/db-model/src/sled.rs | 4 + nexus/db-queries/Cargo.toml | 2 +- nexus/src/app/background/common.rs | 190 ++++----- nexus/src/app/background/init.rs | 24 +- nexus/src/app/background/mod.rs | 1 + nexus/src/app/background/status.rs | 54 +++ nexus/src/internal_api/http_entrypoints.rs | 64 ++- nexus/types/src/internal_api/views.rs | 146 +++++++ omdb/Cargo.toml | 34 ++ omdb/build.rs | 10 + omdb/src/bin/omdb/db.rs | 370 ++++++++++++++++++ omdb/src/bin/omdb/main.rs | 61 +++ omdb/src/bin/omdb/nexus.rs | 246 ++++++++++++ openapi/nexus-internal.json | 257 ++++++++++++ sled-agent/src/sim/server.rs | 4 +- 24 files changed, 1437 insertions(+), 119 deletions(-) create mode 100644 nexus/src/app/background/status.rs create mode 100644 omdb/Cargo.toml create mode 100644 omdb/build.rs create mode 100644 omdb/src/bin/omdb/db.rs create mode 100644 omdb/src/bin/omdb/main.rs create mode 100644 omdb/src/bin/omdb/nexus.rs diff --git a/Cargo.lock b/Cargo.lock index 77d031bc09..2bcc144406 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -723,6 +723,12 @@ version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +[[package]] +name = "bytecount" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" + [[package]] name = "byteorder" version = "1.4.3" @@ -4984,6 +4990,30 @@ dependencies = [ "uuid", ] +[[package]] +name = "omicron-omdb" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "clap 4.4.2", + "dropshot", + "humantime", + "nexus-client 0.1.0", + "nexus-db-model", + "nexus-db-queries", + "omicron-common 0.1.0", + "omicron-rpaths", + "pq-sys", + "serde_json", + "slog", + "strum", + "tabled", + "textwrap 0.16.0", + "tokio", + "uuid", +] + [[package]] name = "omicron-package" version = "0.1.0" @@ -5583,6 +5613,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "papergrid" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2ccbe15f2b6db62f9a9871642746427e297b0ceb85f9a7f1ee5ff47d184d0c8" +dependencies = [ + "bytecount", + "fnv", + "unicode-width", +] + [[package]] name = "parking_lot" version = "0.11.2" @@ -8261,6 +8302,30 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "tabled" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfe9c3632da101aba5131ed63f9eed38665f8b3c68703a6bb18124835c1a5d22" +dependencies = [ + "papergrid", + "tabled_derive", + "unicode-width", +] + +[[package]] +name = "tabled_derive" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99f688a08b54f4f02f0a3c382aefdb7884d3d69609f785bd253dc033243e3fe4" +dependencies = [ + "heck 0.4.1", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "take_mut" version = "0.2.2" diff --git a/Cargo.toml b/Cargo.toml index e4523e350b..1f00842e4b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ members = [ "nexus/test-utils-macros", "nexus/test-utils", "nexus/types", + "omdb", "oxide-client", "oximeter-client", "oximeter/collector", @@ -97,6 +98,7 @@ default-members = [ "nexus/db-queries", "nexus/defaults", "nexus/types", + "omdb", "oxide-client", "oximeter-client", "oximeter/collector", @@ -151,7 +153,7 @@ chacha20poly1305 = "0.10.1" ciborium = "0.2.1" cfg-if = "1.0" chrono = { version = "0.4", features = [ "serde" ] } -clap = { version = "4.4", features = ["derive"] } +clap = { version = "4.4", features = ["derive", "env"] } cookie = "0.16" criterion = { version = "0.5.1", features = [ "async_tokio" ] } crossbeam = "0.8" @@ -233,6 +235,7 @@ omicron-common = { path = "common" } omicron-dev-tools = { path = "dev-tools" } omicron-gateway = { path = "gateway" } omicron-nexus = { path = "nexus" } +omicron-omdb = { path = "omdb" } omicron-package = { path = "package" } omicron-rpaths = { path = "rpaths" } omicron-sled-agent = { path = "sled-agent" } @@ -330,6 +333,7 @@ strum = { version = "0.25", features = [ "derive" ] } subprocess = "0.2.9" libsw = { version = "3.3.0", features = ["tokio"] } syn = { version = "2.0" } +tabled = "0.14" tar = "0.4" tempdir = "0.3" tempfile = "3.6" diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 06a06ea52a..488ee7c268 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -683,6 +683,7 @@ impl TryFrom for Generation { pub enum ResourceType { AddressLot, AddressLotBlock, + BackgroundTask, Fleet, Silo, SiloUser, diff --git a/dev-tools/src/bin/omicron-dev.rs b/dev-tools/src/bin/omicron-dev.rs index 507e3bc918..70018e9d9b 100644 --- a/dev-tools/src/bin/omicron-dev.rs +++ b/dev-tools/src/bin/omicron-dev.rs @@ -39,7 +39,7 @@ async fn main() -> Result<(), anyhow::Error> { Ok(()) } -/// Manage a local CockroachDB database for Omicron development +/// Tools for working with a local Omicron deployment #[derive(Debug, Parser)] #[clap(version)] enum OmicronDb { diff --git a/dev-tools/tests/output/cmd-omicron-dev-noargs-stderr b/dev-tools/tests/output/cmd-omicron-dev-noargs-stderr index 10da31f6c2..f3c28e1ab9 100644 --- a/dev-tools/tests/output/cmd-omicron-dev-noargs-stderr +++ b/dev-tools/tests/output/cmd-omicron-dev-noargs-stderr @@ -1,4 +1,4 @@ -Manage a local CockroachDB database for Omicron development +Tools for working with a local Omicron deployment Usage: omicron-dev diff --git a/nexus-client/src/lib.rs b/nexus-client/src/lib.rs index e878775713..e5cec83f39 100644 --- a/nexus-client/src/lib.rs +++ b/nexus-client/src/lib.rs @@ -246,6 +246,12 @@ impl From for types::Duration { } } +impl From for std::time::Duration { + fn from(s: types::Duration) -> Self { + std::time::Duration::from_nanos(s.secs * 1000000000 + s.nanos as u64) + } +} + impl From for types::IpRange { fn from(r: omicron_common::address::IpRange) -> Self { use omicron_common::address::IpRange; diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index e278219371..1a09f07f6c 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -9,6 +9,7 @@ omicron-rpaths.workspace = true [dependencies] anyhow.workspace = true +assert_matches.workspace = true async-trait.workspace = true base64.workspace = true bb8.workspace = true @@ -91,7 +92,6 @@ oximeter-producer.workspace = true rustls = { workspace = true } [dev-dependencies] -assert_matches.workspace = true async-bb8-diesel.workspace = true criterion.workspace = true diesel.workspace = true diff --git a/nexus/db-model/Cargo.toml b/nexus/db-model/Cargo.toml index 415582f3f2..dc83670725 100644 --- a/nexus/db-model/Cargo.toml +++ b/nexus/db-model/Cargo.toml @@ -26,6 +26,7 @@ semver.workspace = true serde.workspace = true serde_json.workspace = true steno.workspace = true +strum.workspace = true uuid.workspace = true db-macros.workspace = true @@ -38,4 +39,3 @@ sled-agent-client.workspace = true [dev-dependencies] expectorate.workspace = true -strum.workspace = true diff --git a/nexus/db-model/src/service_kind.rs b/nexus/db-model/src/service_kind.rs index d5a34f07db..c2598434d5 100644 --- a/nexus/db-model/src/service_kind.rs +++ b/nexus/db-model/src/service_kind.rs @@ -6,13 +6,14 @@ use super::impl_enum_type; use external_api::shared::ServiceUsingCertificate; use nexus_types::{external_api, internal_api}; use serde::{Deserialize, Serialize}; +use strum::EnumIter; impl_enum_type!( #[derive(Clone, SqlType, Debug, QueryId)] #[diesel(postgres_type(name = "service_kind"))] pub struct ServiceKindEnum; - #[derive(Clone, Copy, Debug, Eq, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[derive(Clone, Copy, Debug, Eq, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq, EnumIter)] #[diesel(sql_type = ServiceKindEnum)] pub enum ServiceKind; diff --git a/nexus/db-model/src/sled.rs b/nexus/db-model/src/sled.rs index 1f1fd78631..5e059946ff 100644 --- a/nexus/db-model/src/sled.rs +++ b/nexus/db-model/src/sled.rs @@ -109,6 +109,10 @@ impl Sled { pub fn address_with_port(&self, port: u16) -> SocketAddrV6 { SocketAddrV6::new(self.ip(), port, 0, 0) } + + pub fn serial_number(&self) -> &str { + &self.serial_number + } } impl From for views::Sled { diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index 2f82588cbf..6739bf0286 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -78,7 +78,7 @@ petgraph.workspace = true rcgen.workspace = true regex.workspace = true rustls.workspace = true +strum.workspace = true subprocess.workspace = true tempfile.workspace = true term.workspace = true -strum.workspace = true diff --git a/nexus/src/app/background/common.rs b/nexus/src/app/background/common.rs index 1807dacd44..868f236b82 100644 --- a/nexus/src/app/background/common.rs +++ b/nexus/src/app/background/common.rs @@ -128,13 +128,19 @@ //! intermediate states.) We don't have to worry about an unbounded queue, or //! handling a full queue, or other forms of backpressure. -use chrono::DateTime; +use assert_matches::assert_matches; use chrono::Utc; use futures::future::BoxFuture; use futures::stream::FuturesUnordered; use futures::FutureExt; use futures::StreamExt; use nexus_db_queries::context::OpContext; +use nexus_types::internal_api::views::ActivationReason; +use nexus_types::internal_api::views::CurrentStatus; +use nexus_types::internal_api::views::CurrentStatusRunning; +use nexus_types::internal_api::views::LastResult; +use nexus_types::internal_api::views::LastResultCompleted; +use nexus_types::internal_api::views::TaskStatus; use std::collections::BTreeMap; use std::sync::Arc; use std::time::Duration; @@ -175,8 +181,17 @@ pub struct Driver { #[derive(Clone, Debug, Ord, PartialOrd, PartialEq, Eq)] pub struct TaskHandle(String); +impl TaskHandle { + /// Returns the unique name of this background task + pub fn name(&self) -> &str { + &self.0 + } +} + /// Driver-side state of a background task struct Task { + /// what this task does (for developers) + description: String, /// channel used to receive updates from the background task's tokio task /// about what the background task is doing status: watch::Receiver, @@ -216,6 +231,7 @@ impl Driver { pub fn register( &mut self, name: String, + description: String, period: Duration, imp: Box, opctx: OpContext, @@ -223,8 +239,10 @@ impl Driver { ) -> TaskHandle { // Activation of the background task happens in a separate tokio task. // Set up a channel so that tokio task can report status back to us. - let (status_tx, status_rx) = - watch::channel(TaskStatus { current: None, last: None }); + let (status_tx, status_rx) = watch::channel(TaskStatus { + current: CurrentStatus::Idle, + last: LastResult::NeverCompleted, + }); // Set up a channel so that we can wake up the tokio task if somebody // requests an explicit activation. @@ -243,7 +261,7 @@ impl Driver { // Create an object to track our side of the background task's state. // This just provides the handles we need to read status and wake up the // tokio task. - let task = Task { status: status_rx, tokio_task, notify }; + let task = Task { description, status: status_rx, tokio_task, notify }; if self.tasks.insert(TaskHandle(name.clone()), task).is_some() { panic!("started two background tasks called {:?}", name); } @@ -262,6 +280,21 @@ impl Driver { self.tasks.keys() } + /// Returns a summary of what this task does (for developers) + pub fn task_description(&self, task: &TaskHandle) -> &str { + // It should be hard to hit this in practice, since you'd have to have + // gotten a TaskHandle from somewhere. It would have to be another + // Driver instance. + let task = self.tasks.get(task).unwrap_or_else(|| { + panic!( + "attempted to get docs for non-existent background task: {:?}", + task + ) + }); + + &task.description + } + /// Activate the specified background task /// /// If the task is currently running, it will be activated again when it @@ -281,8 +314,7 @@ impl Driver { } /// Returns the runtime status of the background task - #[allow(dead_code)] - pub fn status(&self, task: &TaskHandle) -> TaskStatus { + pub fn task_status(&self, task: &TaskHandle) -> TaskStatus { // It should be hard to hit this in practice, since you'd have to have // gotten a TaskHandle from somewhere. It would have to be another // Driver instance. @@ -387,32 +419,33 @@ impl TaskExec { // Update our status with the driver. self.status_tx.send_modify(|status| { - assert!(status.current.is_none()); - status.current = Some(LastStart { + assert_matches!(status.current, CurrentStatus::Idle); + status.current = CurrentStatus::Running(CurrentStatusRunning { start_time, start_instant, reason, - iteration, + iteration: iteration, }); }); // Do it! - let value = self.imp.activate(&self.opctx).await; + let details = self.imp.activate(&self.opctx).await; let elapsed = start_instant.elapsed(); // Update our status with the driver. self.status_tx.send_modify(|status| { - assert!(status.current.is_some()); - let current = status.current.as_ref().unwrap(); + assert!(!status.current.is_idle()); + let current = status.current.unwrap_running(); assert_eq!(current.iteration, iteration); *status = TaskStatus { - current: None, - last: Some(LastResult { + current: CurrentStatus::Idle, + last: LastResult::Completed(LastResultCompleted { iteration, - start_time: current.start_time, + start_time, + reason, elapsed, - value, + details, }), }; }); @@ -426,59 +459,6 @@ impl TaskExec { } } -/// Describes why a background task was activated -/// -/// This is only used for debugging. This is deliberately not made available to -/// the background task itself. See "Design notes" in the module-level -/// documentation for details. -#[derive(Debug, Clone, Copy, Eq, PartialEq)] -pub enum ActivationReason { - Signaled, - Timeout, - Dependency, -} - -/// Describes the runtime status of the background task -#[derive(Clone, Debug)] -pub struct TaskStatus { - /// Describes the current activation, if any - /// - /// If `None`, then this task is not running. - pub current: Option, - - /// Describes the last activation, if any - /// - /// If `None`, then this task has never finished. - pub last: Option, -} - -/// Describes an ongoing activation of a background task -#[derive(Clone, Debug)] -pub struct LastStart { - /// wall-clock time when the activation started - pub start_time: DateTime, - /// monotonic timestamp when the activation started - pub start_instant: Instant, - /// why the activation started - pub reason: ActivationReason, - /// which iteration this was (counting from zero when the background task - /// was first registered with this `Driver`) - pub iteration: u64, -} - -#[derive(Clone, Debug)] -pub struct LastResult { - /// which iteration this was (counting from zero when the background task - /// was first registered with this `Driver`) - pub iteration: u64, - /// wall-clock time when the activation started - pub start_time: DateTime, - /// total time elapsed during the activation - pub elapsed: Duration, - /// arbitrary datum emitted by the background task - pub value: serde_json::Value, -} - /// Used to erase the specific type of a `tokio::sync::watch::Receiver` /// /// This allows the `Driver` to treat these generically, activating a task when @@ -593,6 +573,7 @@ mod test { assert_eq!(*rx1.borrow(), 0); let h1 = driver.register( "t1".to_string(), + "test task".to_string(), Duration::from_millis(100), Box::new(t1), opctx.child(std::collections::BTreeMap::new()), @@ -601,6 +582,7 @@ mod test { let h2 = driver.register( "t2".to_string(), + "test task".to_string(), Duration::from_secs(300), // should never fire in this test Box::new(t2), opctx.child(std::collections::BTreeMap::new()), @@ -609,6 +591,7 @@ mod test { let h3 = driver.register( "t3".to_string(), + "test task".to_string(), Duration::from_secs(300), // should never fire in this test Box::new(t3), opctx, @@ -630,15 +613,15 @@ mod test { ); assert!(duration.as_millis() >= 300); // Check how the last activation was reported. - let status = driver.status(&h1); - let last = status.last.expect("no record of last activation"); + let status = driver.task_status(&h1); + let last = status.last.unwrap_completion(); // It's conceivable that there's been another activation already. assert!(last.iteration == 3 || last.iteration == 4); assert!(last.start_time >= wall_start); assert!(last.start_time <= Utc::now()); assert!(last.elapsed <= duration); assert_matches!( - last.value, + last.details, serde_json::Value::Number(n) if n.as_u64().unwrap() == last.iteration ); @@ -647,11 +630,11 @@ mod test { // time, from its beginning-of-time activation. assert_eq!(*rx2.borrow(), 1); assert_eq!(*rx3.borrow(), 1); - let status = driver.status(&h2); - let last = status.last.expect("no record of last activation"); + let status = driver.task_status(&h2); + let last = status.last.unwrap_completion(); assert_eq!(last.iteration, 1); - let status = driver.status(&h3); - let last = status.last.expect("no record of last activation"); + let status = driver.task_status(&h3); + let last = status.last.unwrap_completion(); assert_eq!(last.iteration, 1); // Explicitly wake up all of our tasks by reporting that dep1 has @@ -662,11 +645,11 @@ mod test { wait_until_count(rx3.clone(), 2).await; assert_eq!(*rx2.borrow(), 2); assert_eq!(*rx3.borrow(), 2); - let status = driver.status(&h2); - let last = status.last.expect("no record of last activation"); + let status = driver.task_status(&h2); + let last = status.last.unwrap_completion(); assert_eq!(last.iteration, 2); - let status = driver.status(&h3); - let last = status.last.expect("no record of last activation"); + let status = driver.task_status(&h3); + let last = status.last.unwrap_completion(); assert_eq!(last.iteration, 2); // Explicitly wake up just "t3" by reporting that dep2 has changed. @@ -675,11 +658,11 @@ mod test { wait_until_count(rx3.clone(), 3).await; assert_eq!(*rx2.borrow(), 2); assert_eq!(*rx3.borrow(), 3); - let status = driver.status(&h2); - let last = status.last.expect("no record of last activation"); + let status = driver.task_status(&h2); + let last = status.last.unwrap_completion(); assert_eq!(last.iteration, 2); - let status = driver.status(&h3); - let last = status.last.expect("no record of last activation"); + let status = driver.task_status(&h3); + let last = status.last.unwrap_completion(); assert_eq!(last.iteration, 3); // Explicitly activate just "t3". @@ -687,11 +670,11 @@ mod test { wait_until_count(rx3.clone(), 4).await; assert_eq!(*rx2.borrow(), 2); assert_eq!(*rx3.borrow(), 4); - let status = driver.status(&h2); - let last = status.last.expect("no record of last activation"); + let status = driver.task_status(&h2); + let last = status.last.unwrap_completion(); assert_eq!(last.iteration, 2); - let status = driver.status(&h3); - let last = status.last.expect("no record of last activation"); + let status = driver.task_status(&h3); + let last = status.last.unwrap_completion(); assert_eq!(last.iteration, 4); } @@ -752,6 +735,7 @@ mod test { let before_instant = Instant::now(); let h1 = driver.register( "t1".to_string(), + "test task".to_string(), Duration::from_secs(300), // should not elapse during test Box::new(t1), opctx.child(std::collections::BTreeMap::new()), @@ -764,9 +748,9 @@ mod test { let after_wall = Utc::now(); let after_instant = Instant::now(); // Verify that it's a timeout-based activation. - let status = driver.status(&h1); - assert!(status.last.is_none()); - let current = status.current.unwrap(); + let status = driver.task_status(&h1); + assert!(!status.last.has_completed()); + let current = status.current.unwrap_running(); assert!(current.start_time >= before_wall); assert!(current.start_time <= after_wall); assert!(current.start_instant >= before_instant); @@ -784,11 +768,11 @@ mod test { assert_eq!(which, 2); assert!(after_instant.elapsed().as_millis() < 5000); // Verify that it's a dependency-caused activation. - let status = driver.status(&h1); - let last = status.last.unwrap(); + let status = driver.task_status(&h1); + let last = status.last.unwrap_completion(); assert_eq!(last.start_time, current.start_time); assert_eq!(last.iteration, current.iteration); - let current = status.current.unwrap(); + let current = status.current.unwrap_running(); assert!(current.start_time >= after_wall); assert!(current.start_instant >= after_instant); assert_eq!(current.iteration, 2); @@ -804,11 +788,11 @@ mod test { assert_eq!(which, 3); assert!(after_instant.elapsed().as_millis() < 10000); // Verify that it's a signal-caused activation. - let status = driver.status(&h1); - let last = status.last.unwrap(); + let status = driver.task_status(&h1); + let last = status.last.unwrap_completion(); assert_eq!(last.start_time, current.start_time); assert_eq!(last.iteration, current.iteration); - let current = status.current.unwrap(); + let current = status.current.unwrap_running(); assert_eq!(current.iteration, 3); assert_eq!(current.reason, ActivationReason::Signaled); // This time, queue up several explicit activations. @@ -827,9 +811,9 @@ mod test { // there's not another one coming, so we just wait long enough that we // expect to have seen it if it is coming. tokio::time::sleep(Duration::from_secs(1)).await; - let status = driver.status(&h1); - assert!(status.current.is_none()); - assert_eq!(status.last.unwrap().iteration, 4); + let status = driver.task_status(&h1); + assert!(status.current.is_idle()); + assert_eq!(status.last.unwrap_completion().iteration, 4); assert_matches!(ready_rx1.try_recv(), Err(TryRecvError::Empty)); // Now, trigger several dependency-based activations. We should see the @@ -841,9 +825,9 @@ mod test { assert_eq!(which, 5); tx1.send(()).await.unwrap(); tokio::time::sleep(Duration::from_secs(1)).await; - let status = driver.status(&h1); - assert!(status.current.is_none()); - assert_eq!(status.last.unwrap().iteration, 5); + let status = driver.task_status(&h1); + assert!(status.current.is_idle()); + assert_eq!(status.last.unwrap_completion().iteration, 5); assert_matches!(ready_rx1.try_recv(), Err(TryRecvError::Empty)); // It would be nice to also verify that multiple time-based activations diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index c8665943fa..a139152845 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -74,7 +74,12 @@ impl BackgroundTasks { external_endpoints::ExternalEndpointsWatcher::new(datastore); let watcher_channel = watcher.watcher(); let task = driver.register( - "external_endpoints".to_string(), + String::from("external_endpoints"), + String::from( + "reads config for silos and TLS certificates to determine \ + the right set of HTTP endpoints, their HTTP server names, \ + and which TLS certificates to use on each one", + ), config.external_endpoints.period_secs, Box::new(watcher), opctx.child(BTreeMap::new()), @@ -113,8 +118,10 @@ fn init_dns( let dns_config = dns_config::DnsConfigWatcher::new(Arc::clone(&datastore), dns_group); let dns_config_watcher = dns_config.watcher(); + let task_name_config = format!("dns_config_{}", dns_group); let task_config = driver.register( - format!("dns_config_{}", dns_group), + task_name_config.clone(), + format!("watches {} DNS data stored in CockroachDB", dns_group), config.period_secs_config, Box::new(dns_config), opctx.child(metadata.clone()), @@ -124,8 +131,13 @@ fn init_dns( // Background task: DNS server list watcher let dns_servers = dns_servers::DnsServersWatcher::new(datastore, dns_group); let dns_servers_watcher = dns_servers.watcher(); + let task_name_servers = format!("dns_servers_{}", dns_group); let task_servers = driver.register( - format!("dns_servers_{}", dns_group), + task_name_servers.clone(), + format!( + "watches list of {} DNS servers stored in CockroachDB", + dns_group, + ), config.period_secs_servers, Box::new(dns_servers), opctx.child(metadata.clone()), @@ -140,6 +152,12 @@ fn init_dns( ); driver.register( format!("dns_propagation_{}", dns_group), + format!( + "propagates latest {} DNS configuration (from {:?} background \ + task) to the latest list of DNS servers (from {:?} background \ + task)", + dns_group, task_name_config, task_name_servers, + ), config.period_secs_propagation, Box::new(dns_propagate), opctx.child(metadata), diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index a3bf8efb29..9ba0780246 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -10,6 +10,7 @@ mod dns_propagation; mod dns_servers; mod external_endpoints; mod init; +mod status; pub use common::Driver; pub use common::TaskHandle; diff --git a/nexus/src/app/background/status.rs b/nexus/src/app/background/status.rs new file mode 100644 index 0000000000..ae8419ae15 --- /dev/null +++ b/nexus/src/app/background/status.rs @@ -0,0 +1,54 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! View status of background tasks (for support and debugging) + +use crate::Nexus; +use nexus_db_queries::authz; +use nexus_db_queries::context::OpContext; +use nexus_types::internal_api::views::BackgroundTask; +use omicron_common::api::external::Error; +use omicron_common::api::external::LookupResult; +use omicron_common::api::external::LookupType; +use omicron_common::api::external::ResourceType; +use std::collections::BTreeMap; + +impl Nexus { + pub(crate) async fn bgtasks_list( + &self, + opctx: &OpContext, + ) -> Result, Error> { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + let driver = &self.background_tasks.driver; + Ok(driver + .tasks() + .map(|t| { + let name = t.name(); + let description = driver.task_description(t); + let status = driver.task_status(t); + ( + name.to_owned(), + BackgroundTask::new(name, description, status), + ) + }) + .collect()) + } + + pub(crate) async fn bgtask_status( + &self, + opctx: &OpContext, + name: &str, + ) -> LookupResult { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + let driver = &self.background_tasks.driver; + let task = + driver.tasks().find(|t| t.name() == name).ok_or_else(|| { + LookupType::ByName(name.to_owned()) + .into_not_found(ResourceType::BackgroundTask) + })?; + let description = driver.task_description(task); + let status = driver.task_status(task); + Ok(BackgroundTask::new(task.name(), description, status)) + } +} diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 6efc0d46e3..a99d386349 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -27,6 +27,7 @@ use hyper::Body; use nexus_types::internal_api::params::SwitchPutRequest; use nexus_types::internal_api::params::SwitchPutResponse; use nexus_types::internal_api::views::to_list; +use nexus_types::internal_api::views::BackgroundTask; use nexus_types::internal_api::views::Saga; use omicron_common::api::external::http_pagination::data_page_params_for; use omicron_common::api::external::http_pagination::PaginatedById; @@ -40,6 +41,7 @@ use oximeter::types::ProducerResults; use oximeter_producer::{collect, ProducerIdPathParams}; use schemars::JsonSchema; use serde::Deserialize; +use std::collections::BTreeMap; use std::sync::Arc; use uuid::Uuid; @@ -66,6 +68,9 @@ pub(crate) fn internal_api() -> NexusApiDescription { api.register(saga_list)?; api.register(saga_view)?; + api.register(bgtask_list)?; + api.register(bgtask_view)?; + Ok(()) } @@ -447,7 +452,7 @@ async fn saga_list( let nexus = &apictx.nexus; let query = query_params.into_inner(); let pagparams = data_page_params_for(&rqctx, &query)?; - let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; let saga_stream = nexus.sagas_list(&opctx, &pagparams).await?; let view_list = to_list(saga_stream).await; Ok(HttpResponseOk(ScanById::results_page( @@ -456,7 +461,7 @@ async fn saga_list( &|_, saga: &Saga| saga.id, )?)) }; - apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } /// Path parameters for Saga requests @@ -476,11 +481,62 @@ async fn saga_view( ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { - let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; let nexus = &apictx.nexus; let path = path_params.into_inner(); let saga = nexus.saga_get(&opctx, path.saga_id).await?; Ok(HttpResponseOk(saga)) }; - apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +// Background Tasks + +/// List background tasks +/// +/// This is a list of discrete background activities that Nexus carries out. +/// This is exposed for support and debugging. +#[endpoint { + method = GET, + path = "/bgtasks", +}] +async fn bgtask_list( + rqctx: RequestContext>, +) -> Result>, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.nexus; + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + let bgtask_list = nexus.bgtasks_list(&opctx).await?; + Ok(HttpResponseOk(bgtask_list)) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// Path parameters for Background Task requests +#[derive(Deserialize, JsonSchema)] +struct BackgroundTaskPathParam { + bgtask_name: String, +} + +/// Fetch status of one background task +/// +/// This is exposed for support and debugging. +#[endpoint { + method = GET, + path = "/bgtasks/{bgtask_name}", +}] +async fn bgtask_view( + rqctx: RequestContext>, + path_params: Path, +) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let bgtask = nexus.bgtask_status(&opctx, &path.bgtask_name).await?; + Ok(HttpResponseOk(bgtask)) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } diff --git a/nexus/types/src/internal_api/views.rs b/nexus/types/src/internal_api/views.rs index e1e8dae453..9929f2531e 100644 --- a/nexus/types/src/internal_api/views.rs +++ b/nexus/types/src/internal_api/views.rs @@ -2,11 +2,15 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use chrono::DateTime; +use chrono::Utc; use futures::future::ready; use futures::stream::StreamExt; use omicron_common::api::external::ObjectStream; use schemars::JsonSchema; use serde::Serialize; +use std::time::Duration; +use std::time::Instant; use steno::SagaResultErr; use steno::UndoActionError; use uuid::Uuid; @@ -143,3 +147,145 @@ impl From for SagaState { } } } + +/// Background tasks +/// +/// These are currently only intended for observability by developers. We will +/// eventually want to flesh this out into something more observable for end +/// users. +#[derive(Clone, Debug, Serialize, JsonSchema)] +pub struct BackgroundTask { + /// unique identifier for this background task + name: String, + /// brief summary (for developers) of what this task does + description: String, + + #[serde(flatten)] + status: TaskStatus, +} + +impl BackgroundTask { + pub fn new( + name: &str, + description: &str, + status: TaskStatus, + ) -> BackgroundTask { + BackgroundTask { + name: name.to_owned(), + description: description.to_owned(), + status, + } + } +} + +/// Describes why a background task was activated +/// +/// This is only used for debugging. This is deliberately not made available to +/// the background task itself. See "Design notes" in the module-level +/// documentation for details. +#[derive(Debug, Clone, Copy, Eq, PartialEq, JsonSchema, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ActivationReason { + Signaled, + Timeout, + Dependency, +} + +/// Describes the runtime status of the background task +#[derive(Clone, Debug, JsonSchema, Serialize)] +pub struct TaskStatus { + /// Describes the current task status + pub current: CurrentStatus, + /// Describes the last completed activation + pub last: LastResult, +} + +/// Describes the current status of a background task +#[derive(Clone, Debug, JsonSchema, Serialize)] +#[serde(rename_all = "snake_case", tag = "current_status", content = "details")] +pub enum CurrentStatus { + /// The background task is not running + /// + /// Typically, the task would be waiting for its next activation, which + /// would happen after a timeout or some other event that triggers + /// activation + Idle, + + /// The background task is currently running + /// + /// More precisely, the task has been activated and has not yet finished + /// this activation + Running(CurrentStatusRunning), +} + +impl CurrentStatus { + pub fn is_idle(&self) -> bool { + matches!(self, CurrentStatus::Idle) + } + + pub fn unwrap_running(&self) -> &CurrentStatusRunning { + match self { + CurrentStatus::Running(r) => r, + CurrentStatus::Idle => { + panic!("attempted to get running state of idle task") + } + } + } +} + +#[derive(Clone, Debug, JsonSchema, Serialize)] +pub struct CurrentStatusRunning { + /// wall-clock time when the current activation started + pub start_time: DateTime, + /// (local) monotonic timestamp when the activation started + // Currently, it's not possible to read this value except in a debugger. + // But it's still potentially useful to be able to read it in a debugger. + #[allow(dead_code)] + #[serde(skip)] + pub start_instant: Instant, + /// what kind of event triggered this activation + pub reason: ActivationReason, + /// which iteration this was (counter) + pub iteration: u64, +} + +#[derive(Clone, Debug, JsonSchema, Serialize)] +#[serde(rename_all = "snake_case", tag = "last_result", content = "details")] +pub enum LastResult { + /// The task has never completed an activation + NeverCompleted, + /// The task has completed at least one activation + Completed(LastResultCompleted), +} + +impl LastResult { + pub fn has_completed(&self) -> bool { + matches!(self, LastResult::Completed(_)) + } + + pub fn unwrap_completion(self) -> LastResultCompleted { + match self { + LastResult::Completed(r) => r, + LastResult::NeverCompleted => { + panic!( + "attempted to get completion state of a task that \ + has never completed" + ); + } + } + } +} + +#[derive(Clone, Debug, JsonSchema, Serialize)] +pub struct LastResultCompleted { + /// which iteration this was (counter) + pub iteration: u64, + /// wall-clock time when the activation started + pub start_time: DateTime, + /// what kind of event triggered this activation + pub reason: ActivationReason, + /// total time elapsed during the activation + pub elapsed: Duration, + /// arbitrary datum emitted by the background task + pub details: serde_json::Value, +} diff --git a/omdb/Cargo.toml b/omdb/Cargo.toml new file mode 100644 index 0000000000..91058fcf3d --- /dev/null +++ b/omdb/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "omicron-omdb" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[build-dependencies] +omicron-rpaths.workspace = true + +[dependencies] +anyhow.workspace = true +chrono.workspace = true +clap.workspace = true +dropshot.workspace = true +humantime.workspace = true +nexus-client.workspace = true +nexus-db-model.workspace = true +nexus-db-queries.workspace = true +omicron-common.workspace = true +# See omicron-rpaths for more about the "pq-sys" dependency. +pq-sys = "*" +serde_json.workspace = true +slog.workspace = true +strum.workspace = true +tabled.workspace = true +textwrap.workspace = true +tokio = { workspace = true, features = [ "full" ] } +uuid.workspace = true + +# Disable doc builds by default for our binaries to work around issue +# rust-lang/cargo#8373. These docs would not be very useful anyway. +[[bin]] +name = "omdb" +doc = false diff --git a/omdb/build.rs b/omdb/build.rs new file mode 100644 index 0000000000..1ba9acd41c --- /dev/null +++ b/omdb/build.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. +fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/omdb/src/bin/omdb/db.rs b/omdb/src/bin/omdb/db.rs new file mode 100644 index 0000000000..6f3b76a275 --- /dev/null +++ b/omdb/src/bin/omdb/db.rs @@ -0,0 +1,370 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! omdb commands that query or update the database + +use anyhow::anyhow; +use anyhow::Context; +use clap::Args; +use clap::Subcommand; +use nexus_db_model::Sled; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db; +use nexus_db_queries::db::identity::Asset; +use nexus_db_queries::db::model::ServiceKind; +use nexus_db_queries::db::DataStore; +use omicron_common::api::external::DataPageParams; +use omicron_common::postgres_config::PostgresConfigWithUrl; +use std::collections::BTreeMap; +use std::num::NonZeroU32; +use std::sync::Arc; +use strum::IntoEnumIterator; +use tabled::Tabled; +use uuid::Uuid; + +#[derive(Debug, Args)] +pub struct DbArgs { + /// URL of the database SQL interface + db_url: PostgresConfigWithUrl, + + /// limit to apply to queries that fetch rows + #[clap( + long = "fetch-limit", + default_value_t = NonZeroU32::new(100).unwrap() + )] + fetch_limit: NonZeroU32, + + #[command(subcommand)] + command: DbCommands, +} + +/// Subcommands that query or update the database +#[derive(Debug, Subcommand)] +enum DbCommands { + /// Print information about control plane services + Services(ServicesArgs), + /// Print information about sleds + Sleds, +} + +#[derive(Debug, Args)] +struct ServicesArgs { + #[command(subcommand)] + command: ServicesCommands, +} + +#[derive(Debug, Subcommand)] +enum ServicesCommands { + /// List service instances + ListInstances, + /// List service instances, grouped by sled + ListBySled, +} + +impl DbArgs { + /// Run a `omdb db` subcommand. + pub async fn run_cmd( + &self, + log: &slog::Logger, + ) -> Result<(), anyhow::Error> { + let db_config = db::Config { url: self.db_url.clone() }; + let pool = Arc::new(db::Pool::new(&log.clone(), &db_config)); + + // Being a dev tool, we want to try this operation even if the schema + // doesn't match what we expect. So we use `DataStore::new_unchecked()` + // here. We will then check the schema version explicitly and warn the + // user if it doesn't match. + let datastore = Arc::new( + DataStore::new_unchecked(pool) + .map_err(|e| anyhow!(e).context("creating datastore"))?, + ); + check_schema_version(&datastore).await; + + let opctx = OpContext::for_tests(log.clone(), datastore.clone()); + match &self.command { + DbCommands::Services(ServicesArgs { + command: ServicesCommands::ListInstances, + }) => { + cmd_db_services_list_instances( + &opctx, + &datastore, + self.fetch_limit, + ) + .await + } + DbCommands::Services(ServicesArgs { + command: ServicesCommands::ListBySled, + }) => { + cmd_db_services_list_by_sled( + &opctx, + &datastore, + self.fetch_limit, + ) + .await + } + DbCommands::Sleds => { + cmd_db_sleds(&opctx, &datastore, self.fetch_limit).await + } + } + } +} + +/// Check the version of the schema in the database and report whether it +/// appears to be compatible with this tool. +/// +/// This is just advisory. We will not abort if the version appears +/// incompatible because in practice it may well not matter and it's very +/// valuable for this tool to work if it possibly can. +async fn check_schema_version(datastore: &DataStore) { + let expected_version = nexus_db_model::schema::SCHEMA_VERSION; + let version_check = datastore.database_schema_version().await; + + match version_check { + Ok(found_version) => { + if found_version == expected_version { + eprintln!( + "note: schema version matches expected ({})", + expected_version + ); + return; + } + + eprintln!( + "WARN: found schema version {}, expected {}", + found_version, expected_version + ); + } + Err(error) => { + eprintln!("WARN: failed to query schema version: {:#}", error); + } + }; + + eprintln!( + "{}", + textwrap::fill( + "It's possible the database is running a version that's different \ + from what this tool understands. This may result in errors or \ + incorrect output.", + 80 + ) + ); +} + +/// Check the result of a query to see if it hit the given limit. If so, warn +/// the user that our output may be incomplete and that they might try a larger +/// one. (We don't want to bail out, though. Incomplete data is better than no +/// data.) +fn check_limit(items: &[I], limit: NonZeroU32, context: F) +where + F: FnOnce() -> String, +{ + if items.len() == usize::try_from(limit.get()).unwrap() { + eprintln!( + "WARN: {}: found {} items (the limit). There may be more items \ + that were ignored. Consider overriding with --fetch-limit.", + context(), + items.len(), + ); + } +} + +#[derive(Tabled)] +#[tabled(rename_all = "SCREAMING_SNAKE_CASE")] +struct ServiceInstanceRow { + #[tabled(rename = "SERVICE")] + kind: String, + instance_id: Uuid, + addr: String, + sled_serial: String, +} + +/// Run `omdb db services list-instances`. +async fn cmd_db_services_list_instances( + opctx: &OpContext, + datastore: &DataStore, + limit: NonZeroU32, +) -> Result<(), anyhow::Error> { + let pagparams: DataPageParams<'_, Uuid> = DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit, + }; + let sled_list = datastore + .sled_list(&opctx, &pagparams) + .await + .context("listing sleds")?; + check_limit(&sled_list, limit, || String::from("listing sleds")); + + let sleds: BTreeMap = + sled_list.into_iter().map(|s| (s.id(), s)).collect(); + + let mut rows = vec![]; + + for service_kind in ServiceKind::iter() { + let pagparams: DataPageParams<'_, Uuid> = DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit, + }; + + let context = + || format!("listing instances of kind {:?}", service_kind); + let instances = datastore + .services_list_kind(&opctx, service_kind, &pagparams) + .await + .with_context(&context)?; + check_limit(&instances, limit, &context); + + rows.extend(instances.into_iter().map(|instance| { + let addr = + std::net::SocketAddrV6::new(*instance.ip, *instance.port, 0, 0) + .to_string(); + + ServiceInstanceRow { + kind: format!("{:?}", service_kind), + instance_id: instance.id(), + addr, + sled_serial: sleds + .get(&instance.sled_id) + .map(|s| s.serial_number()) + .unwrap_or("unknown") + .to_string(), + } + })); + } + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", table); + + Ok(()) +} + +#[derive(Tabled)] +#[tabled(rename_all = "SCREAMING_SNAKE_CASE")] +struct ServiceInstanceSledRow { + #[tabled(rename = "SERVICE")] + kind: String, + instance_id: Uuid, + addr: String, +} + +/// Run `omdb db services list-by-sled`. +async fn cmd_db_services_list_by_sled( + opctx: &OpContext, + datastore: &DataStore, + limit: NonZeroU32, +) -> Result<(), anyhow::Error> { + let pagparams: DataPageParams<'_, Uuid> = DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit, + }; + let sled_list = datastore + .sled_list(&opctx, &pagparams) + .await + .context("listing sleds")?; + check_limit(&sled_list, limit, || String::from("listing sleds")); + + let sleds: BTreeMap = + sled_list.into_iter().map(|s| (s.id(), s)).collect(); + let mut services_by_sled: BTreeMap> = + BTreeMap::new(); + + for service_kind in ServiceKind::iter() { + let pagparams: DataPageParams<'_, Uuid> = DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit, + }; + + let context = + || format!("listing instances of kind {:?}", service_kind); + let instances = datastore + .services_list_kind(&opctx, service_kind, &pagparams) + .await + .with_context(&context)?; + check_limit(&instances, limit, &context); + + for i in instances { + let addr = + std::net::SocketAddrV6::new(*i.ip, *i.port, 0, 0).to_string(); + let sled_instances = + services_by_sled.entry(i.sled_id).or_insert_with(Vec::new); + sled_instances.push(ServiceInstanceSledRow { + kind: format!("{:?}", service_kind), + instance_id: i.id(), + addr, + }) + } + } + + for (sled_id, instances) in services_by_sled { + println!( + "sled: {} (id {})\n", + sleds.get(&sled_id).map(|s| s.serial_number()).unwrap_or("unknown"), + sled_id, + ); + let table = tabled::Table::new(instances) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + println!("{}", textwrap::indent(&table.to_string(), " ")); + println!(""); + } + + Ok(()) +} + +#[derive(Tabled)] +#[tabled(rename_all = "SCREAMING_SNAKE_CASE")] +struct SledRow { + serial: String, + ip: String, + role: &'static str, + id: Uuid, +} + +impl From for SledRow { + fn from(s: Sled) -> Self { + SledRow { + id: s.id(), + serial: s.serial_number().to_string(), + ip: s.address().to_string(), + role: if s.is_scrimlet() { "scrimlet" } else { "-" }, + } + } +} + +/// Run `omdb db sleds`. +async fn cmd_db_sleds( + opctx: &OpContext, + datastore: &DataStore, + limit: NonZeroU32, +) -> Result<(), anyhow::Error> { + let pagparams: DataPageParams<'_, Uuid> = DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit, + }; + + let sleds = datastore + .sled_list(&opctx, &pagparams) + .await + .context("listing sleds")?; + check_limit(&sleds, limit, || String::from("listing sleds")); + + let rows = sleds.into_iter().map(|s| SledRow::from(s)); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", table); + + Ok(()) +} diff --git a/omdb/src/bin/omdb/main.rs b/omdb/src/bin/omdb/main.rs new file mode 100644 index 0000000000..145410698b --- /dev/null +++ b/omdb/src/bin/omdb/main.rs @@ -0,0 +1,61 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! CLI for debugging Omicron internal state + +use anyhow::Context; +use clap::Parser; +use clap::Subcommand; + +mod db; +mod nexus; + +#[tokio::main] +async fn main() -> Result<(), anyhow::Error> { + let args = Omdb::parse(); + + let log = dropshot::ConfigLogging::StderrTerminal { level: args.log_level } + .to_logger("omdb") + .context("failed to create logger")?; + + match args.command { + OmdbCommands::Nexus(nexus) => nexus.run_cmd(&log).await, + OmdbCommands::Db(db) => db.run_cmd(&log).await, + } +} + +/// Omicron debugger (unstable) +/// +/// This tool provides commands for directly querying Omicron components about +/// their internal state using internal APIs. This is a prototype. The +/// commands and output are unstable and may change. +#[derive(Debug, Parser)] +struct Omdb { + /// log level filter + #[arg( + env, + long, + value_parser = parse_dropshot_log_level, + default_value = "warn", + )] + log_level: dropshot::ConfigLoggingLevel, + + #[command(subcommand)] + command: OmdbCommands, +} + +#[derive(Debug, Subcommand)] +#[allow(clippy::large_enum_variant)] +enum OmdbCommands { + /// Query the control plane database (CockroachDB) + Db(db::DbArgs), + /// Debug a specific Nexus instance + Nexus(nexus::NexusArgs), +} + +fn parse_dropshot_log_level( + s: &str, +) -> Result { + serde_json::from_str(&format!("{:?}", s)).context("parsing log level") +} diff --git a/omdb/src/bin/omdb/nexus.rs b/omdb/src/bin/omdb/nexus.rs new file mode 100644 index 0000000000..9ff9252cd2 --- /dev/null +++ b/omdb/src/bin/omdb/nexus.rs @@ -0,0 +1,246 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! omdb commands that query or update specific Nexus instances + +use anyhow::Context; +use chrono::SecondsFormat; +use clap::Args; +use clap::Subcommand; +use nexus_client::types::ActivationReason; +use nexus_client::types::BackgroundTask; +use nexus_client::types::CurrentStatus; +use nexus_client::types::LastResult; +use tabled::Tabled; + +/// Arguments to the "omdb nexus" subcommand +#[derive(Debug, Args)] +pub struct NexusArgs { + /// URL of the Nexus internal API + nexus_internal_url: String, + + #[command(subcommand)] + command: NexusCommands, +} + +/// Subcommands for the "omdb nexus" subcommand +#[derive(Debug, Subcommand)] +enum NexusCommands { + /// print information about background tasks + BackgroundTask(BackgroundTaskArgs), +} + +#[derive(Debug, Args)] +struct BackgroundTaskArgs { + #[command(subcommand)] + command: BackgroundTaskCommands, +} + +#[derive(Debug, Subcommand)] +enum BackgroundTaskCommands { + /// Show documentation about background tasks + Doc, + /// Print a summary of the status of all background tasks + List, + /// Print human-readable summary of the status of each background task + Details, +} + +impl NexusArgs { + /// Run a `omdb nexus` subcommand. + pub async fn run_cmd( + &self, + log: &slog::Logger, + ) -> Result<(), anyhow::Error> { + let client = + nexus_client::Client::new(&self.nexus_internal_url, log.clone()); + + match &self.command { + NexusCommands::BackgroundTask(BackgroundTaskArgs { + command: BackgroundTaskCommands::Doc, + }) => cmd_nexus_background_task_doc(&client).await, + NexusCommands::BackgroundTask(BackgroundTaskArgs { + command: BackgroundTaskCommands::List, + }) => cmd_nexus_background_task_list(&client).await, + NexusCommands::BackgroundTask(BackgroundTaskArgs { + command: BackgroundTaskCommands::Details, + }) => cmd_nexus_background_task_details(&client).await, + } + } +} + +/// Runs `omdb nexus background-task doc` +async fn cmd_nexus_background_task_doc( + client: &nexus_client::Client, +) -> Result<(), anyhow::Error> { + let response = + client.bgtask_list().await.context("listing background tasks")?; + let tasks = response.into_inner(); + for (_, bgtask) in &tasks { + println!("task: {:?}", bgtask.name); + println!( + "{}", + textwrap::fill( + &bgtask.description, + &textwrap::Options::new(80) + .initial_indent(" ") + .subsequent_indent(" ") + ) + ); + + println!("\n"); + } + + Ok(()) +} + +/// Runs `omdb nexus background-task list` +async fn cmd_nexus_background_task_list( + client: &nexus_client::Client, +) -> Result<(), anyhow::Error> { + let response = + client.bgtask_list().await.context("listing background tasks")?; + let tasks = response.into_inner(); + let table_rows = tasks.values().map(BackgroundTaskStatusRow::from); + let table = tabled::Table::new(table_rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + println!("{}", table); + Ok(()) +} + +/// Runs `omdb nexus background-task details` +async fn cmd_nexus_background_task_details( + client: &nexus_client::Client, +) -> Result<(), anyhow::Error> { + let response = + client.bgtask_list().await.context("listing background tasks")?; + let tasks = response.into_inner(); + for (_, bgtask) in &tasks { + print_task(bgtask); + } + Ok(()) +} + +fn print_task(bgtask: &BackgroundTask) { + println!("task: {:?}", bgtask.name); + print!(" currently executing: "); + match &bgtask.current { + CurrentStatus::Idle => println!("no"), + CurrentStatus::Running(current) => { + let elapsed = std::time::SystemTime::from(current.start_time) + .elapsed() + .map(|s| format!("{:.3}ms", s.as_millis())) + .unwrap_or_else(|error| format!("(unknown: {:#})", error)); + print!( + "iter {}, triggered by {}\n", + current.iteration, + reason_str(¤t.reason) + ); + print!( + " started at {}, running for {}\n", + humantime::format_rfc3339_millis(current.start_time.into()), + elapsed, + ); + } + }; + + print!(" last completed activation: "); + match &bgtask.last { + LastResult::NeverCompleted => print!("never\n"), + LastResult::Completed(last) => { + print!( + "iter {}, triggered by {}\n", + last.iteration, + reason_str(&last.reason) + ); + print!( + " started at {} and ran for {:.3}ms\n", + humantime::format_rfc3339_millis(last.start_time.into()), + std::time::Duration::from(last.elapsed.clone()).as_millis(), + ); + } + }; + + println!(""); +} + +/// Summarizes an `ActivationReason` +fn reason_str(reason: &ActivationReason) -> &'static str { + match reason { + ActivationReason::Signaled => "an explicit signal", + ActivationReason::Dependency => "a dependent task completing", + ActivationReason::Timeout => "a periodic timer firing", + } +} + +/// Used for printing background task status as a table +#[derive(Tabled)] +struct BackgroundTaskStatusRow { + task_name: String, + #[tabled(rename = "PGEN#")] + completed_generation: String, + #[tabled(rename = "PSTART")] + completed_start_time: String, + #[tabled(rename = "Psecs")] + completed_elapsed: String, + #[tabled(rename = "P")] + completed_reason: char, + #[tabled(rename = "CSTART")] + running_since: String, + #[tabled(rename = "C")] + running_reason: char, +} + +impl<'a> From<&'a BackgroundTask> for BackgroundTaskStatusRow { + fn from(t: &'a BackgroundTask) -> Self { + let ( + completed_generation, + completed_start_time, + completed_elapsed, + completed_reason, + ) = match &t.last { + LastResult::NeverCompleted => { + (String::from("-"), String::from("-"), String::from("-"), '-') + } + LastResult::Completed(last) => ( + last.iteration.to_string(), + last.start_time.to_rfc3339_opts(SecondsFormat::Secs, true), + format!( + "{:5.1}", + std::time::Duration::from(last.elapsed.clone()) + .as_secs_f64() + ), + reason_code(last.reason), + ), + }; + + let (running_since, running_reason) = match &t.current { + CurrentStatus::Idle => (String::from("-"), '-'), + CurrentStatus::Running(current) => ( + current.start_time.to_rfc3339_opts(SecondsFormat::Secs, true), + reason_code(current.reason), + ), + }; + + BackgroundTaskStatusRow { + task_name: t.name.clone(), + completed_generation, + completed_start_time, + completed_elapsed, + completed_reason, + running_since, + running_reason, + } + } +} + +fn reason_code(reason: ActivationReason) -> char { + match reason { + ActivationReason::Signaled => 'S', + ActivationReason::Dependency => 'D', + ActivationReason::Timeout => 'T', + } +} diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 12043d8096..cae44907eb 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -61,6 +61,70 @@ } } }, + "/bgtasks": { + "get": { + "summary": "List background tasks", + "description": "This is a list of discrete background activities that Nexus carries out. This is exposed for support and debugging.", + "operationId": "bgtask_list", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Map_of_BackgroundTask", + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/BackgroundTask" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/bgtasks/{bgtask_name}": { + "get": { + "summary": "Fetch status of one background task", + "description": "This is exposed for support and debugging.", + "operationId": "bgtask_view", + "parameters": [ + { + "in": "path", + "name": "bgtask_name", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BackgroundTask" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/disk/{disk_id}/remove-read-only-parent": { "post": { "summary": "Request removal of a read_only_parent from a disk", @@ -628,6 +692,51 @@ } }, "schemas": { + "ActivationReason": { + "description": "Describes why a background task was activated\n\nThis is only used for debugging. This is deliberately not made available to the background task itself. See \"Design notes\" in the module-level documentation for details.", + "type": "string", + "enum": [ + "signaled", + "timeout", + "dependency" + ] + }, + "BackgroundTask": { + "description": "Background tasks\n\nThese are currently only intended for observability by developers. We will eventually want to flesh this out into something more observable for end users.", + "type": "object", + "properties": { + "current": { + "description": "Describes the current task status", + "allOf": [ + { + "$ref": "#/components/schemas/CurrentStatus" + } + ] + }, + "description": { + "description": "brief summary (for developers) of what this task does", + "type": "string" + }, + "last": { + "description": "Describes the last completed activation", + "allOf": [ + { + "$ref": "#/components/schemas/LastResult" + } + ] + }, + "name": { + "description": "unique identifier for this background task", + "type": "string" + } + }, + "required": [ + "current", + "description", + "last", + "name" + ] + }, "Baseboard": { "description": "Describes properties that should uniquely identify Oxide manufactured hardware", "type": "object", @@ -894,6 +1003,74 @@ "value" ] }, + "CurrentStatus": { + "description": "Describes the current status of a background task", + "oneOf": [ + { + "description": "The background task is not running\n\nTypically, the task would be waiting for its next activation, which would happen after a timeout or some other event that triggers activation", + "type": "object", + "properties": { + "current_status": { + "type": "string", + "enum": [ + "idle" + ] + } + }, + "required": [ + "current_status" + ] + }, + { + "description": "The background task is currently running\n\nMore precisely, the task has been activated and has not yet finished this activation", + "type": "object", + "properties": { + "current_status": { + "type": "string", + "enum": [ + "running" + ] + }, + "details": { + "$ref": "#/components/schemas/CurrentStatusRunning" + } + }, + "required": [ + "current_status", + "details" + ] + } + ] + }, + "CurrentStatusRunning": { + "type": "object", + "properties": { + "iteration": { + "description": "which iteration this was (counter)", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "reason": { + "description": "what kind of event triggered this activation", + "allOf": [ + { + "$ref": "#/components/schemas/ActivationReason" + } + ] + }, + "start_time": { + "description": "wall-clock time when the current activation started", + "type": "string", + "format": "date-time" + } + }, + "required": [ + "iteration", + "reason", + "start_time" + ] + }, "DatasetCreateRequest": { "type": "object", "properties": { @@ -2081,6 +2258,86 @@ "last" ] }, + "LastResult": { + "oneOf": [ + { + "description": "The task has never completed an activation", + "type": "object", + "properties": { + "last_result": { + "type": "string", + "enum": [ + "never_completed" + ] + } + }, + "required": [ + "last_result" + ] + }, + { + "description": "The task has completed at least one activation", + "type": "object", + "properties": { + "details": { + "$ref": "#/components/schemas/LastResultCompleted" + }, + "last_result": { + "type": "string", + "enum": [ + "completed" + ] + } + }, + "required": [ + "details", + "last_result" + ] + } + ] + }, + "LastResultCompleted": { + "type": "object", + "properties": { + "details": { + "description": "arbitrary datum emitted by the background task" + }, + "elapsed": { + "description": "total time elapsed during the activation", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "iteration": { + "description": "which iteration this was (counter)", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "reason": { + "description": "what kind of event triggered this activation", + "allOf": [ + { + "$ref": "#/components/schemas/ActivationReason" + } + ] + }, + "start_time": { + "description": "wall-clock time when the activation started", + "type": "string", + "format": "date-time" + } + }, + "required": [ + "details", + "elapsed", + "iteration", + "reason", + "start_time" + ] + }, "MacAddr": { "example": "ff:ff:ff:ff:ff:ff", "title": "A MAC address", diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index 76b071c725..595d83a7ee 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -97,8 +97,8 @@ impl Server { role: NexusTypes::SledRole::Gimlet, baseboard: NexusTypes::Baseboard { serial_number: format!( - "Simulated sled {}", - config.id + "sim-{}", + &config.id.to_string()[0..8] ), part_number: String::from("Unknown"), revision: 0,