From 3366598075cf1d7478eb55231ee569bf36f5910f Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 9 Aug 2024 15:05:14 -0700 Subject: [PATCH 1/6] add demo saga --- clients/nexus-client/src/lib.rs | 1 + dev-tools/omdb/src/bin/omdb/nexus.rs | 142 +++++++++++++++++++++ dev-tools/omdb/tests/env.out | 16 +++ dev-tools/omdb/tests/successes.out | 35 +++++ dev-tools/omdb/tests/test_all_output.rs | 14 ++ dev-tools/omdb/tests/usage_errors.out | 29 +++++ nexus/internal-api/src/lib.rs | 36 +++++- nexus/src/app/mod.rs | 27 +++- nexus/src/app/saga.rs | 24 +++- nexus/src/app/sagas/demo.rs | 126 ++++++++++++++++++ nexus/src/app/sagas/mod.rs | 2 + nexus/src/internal_api/http_entrypoints.rs | 35 +++++ nexus/tests/integration_tests/demo_saga.rs | 74 +++++++++++ nexus/tests/integration_tests/mod.rs | 1 + nexus/types/src/internal_api/views.rs | 8 ++ openapi/nexus-internal.json | 74 +++++++++++ uuid-kinds/src/lib.rs | 1 + 17 files changed, 637 insertions(+), 8 deletions(-) create mode 100644 nexus/src/app/sagas/demo.rs create mode 100644 nexus/tests/integration_tests/demo_saga.rs diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 162c3f4dbf..8345978db1 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -42,6 +42,7 @@ progenitor::generate_api!( OmicronPhysicalDisksConfig = nexus_types::disk::OmicronPhysicalDisksConfig, RecoverySiloConfig = nexus_sled_agent_shared::recovery_silo::RecoverySiloConfig, TypedUuidForCollectionKind = omicron_uuid_kinds::CollectionUuid, + TypedUuidForDemoSagaKind = omicron_uuid_kinds::DemoSagaUuid, TypedUuidForDownstairsKind = omicron_uuid_kinds::TypedUuid, TypedUuidForPropolisKind = omicron_uuid_kinds::TypedUuid, TypedUuidForSledKind = omicron_uuid_kinds::TypedUuid, diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 8649d15aa6..0a809c48f0 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -25,6 +25,7 @@ 
use nexus_client::types::BackgroundTasksActivateRequest; use nexus_client::types::CurrentStatus; use nexus_client::types::LastResult; use nexus_client::types::PhysicalDiskPath; +use nexus_client::types::SagaState; use nexus_client::types::SledSelector; use nexus_client::types::UninitializedSledId; use nexus_db_queries::db::lookup::LookupPath; @@ -34,6 +35,7 @@ use nexus_types::internal_api::background::LookupRegionPortStatus; use nexus_types::internal_api::background::RegionReplacementDriverStatus; use nexus_types::inventory::BaseboardId; use omicron_uuid_kinds::CollectionUuid; +use omicron_uuid_kinds::DemoSagaUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; @@ -71,6 +73,8 @@ enum NexusCommands { BackgroundTasks(BackgroundTasksArgs), /// interact with blueprints Blueprints(BlueprintsArgs), + /// view sagas, create and complete demo sagas + Sagas(SagasArgs), /// interact with sleds Sleds(SledsArgs), } @@ -244,6 +248,36 @@ struct BlueprintImportArgs { input: Utf8PathBuf, } +#[derive(Debug, Args)] +struct SagasArgs { + #[command(subcommand)] + command: SagasCommands, +} + +#[derive(Debug, Subcommand)] +enum SagasCommands { + /// List sagas run by this Nexus + /// + /// Note that this is reporting in-memory state about sagas run by *this* + /// Nexus instance. You'll get different answers if you ask different Nexus + /// instances. + List, + + /// Create a "demo" saga + /// + /// This saga will wait until it's explicitly completed using the + /// "demo-complete" subcommand. + DemoCreate, + + /// Complete a demo saga started with "demo-create". 
+ DemoComplete(DemoSagaIdArgs), +} + +#[derive(Debug, Args)] +struct DemoSagaIdArgs { + demo_saga_id: DemoSagaUuid, +} + #[derive(Debug, Args)] struct SledsArgs { #[command(subcommand)] @@ -402,6 +436,31 @@ impl NexusArgs { cmd_nexus_blueprints_import(&client, token, args).await } + NexusCommands::Sagas(SagasArgs { command }) => { + if self.nexus_internal_url.is_none() { + eprintln!( + "{}", + textwrap::wrap( + "WARNING: A Nexus instance was selected from DNS \ + because a specific one was not specified. But \ + the `omdb nexus sagas` commands usually only make \ + sense when targeting a specific Nexus instance.", + 80 + ) + .join("\n") + ); + } + match command { + SagasCommands::List => cmd_nexus_sagas_list(&client).await, + SagasCommands::DemoCreate => { + cmd_nexus_sagas_demo_create(&client).await + } + SagasCommands::DemoComplete(args) => { + cmd_nexus_sagas_demo_complete(&client, args).await + } + } + } + NexusCommands::Sleds(SledsArgs { command: SledsCommands::ListUninitialized, }) => cmd_nexus_sleds_list_uninitialized(&client).await, @@ -1550,6 +1609,89 @@ async fn cmd_nexus_blueprints_import( Ok(()) } +/// Runs `omdb nexus sagas list` +async fn cmd_nexus_sagas_list( + client: &nexus_client::Client, +) -> Result<(), anyhow::Error> { + // We don't want users to confuse this with a general way to list all sagas. + // Such a command would read database state and it would go under "omdb db". + eprintln!( + "{}", + textwrap::wrap( + "NOTE: This command only reads in-memory state from the targeted \ + Nexus instance. 
Sagas may be missing if they were run by a \ + different Nexus instance or if they finished before this Nexus \ + instance last started up.", + 80 + ) + .join("\n") + ); + + let saga_stream = client.saga_list_stream(None, None); + let sagas = + saga_stream.try_collect::<Vec<_>>().await.context("listing sagas")?; + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct SagaRow { + saga_id: Uuid, + state: &'static str, + } + let rows = sagas.into_iter().map(|saga| SagaRow { + saga_id: saga.id, + state: match saga.state { + SagaState::Running => "running", + SagaState::Succeeded => "succeeded", + SagaState::Failed { .. } => "failed", + SagaState::Stuck { .. } => "stuck", + }, + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + println!("{}", table); + Ok(()) +} + +/// Runs `omdb nexus sagas demo-create` +async fn cmd_nexus_sagas_demo_create( + client: &nexus_client::Client, +) -> Result<(), anyhow::Error> { + let demo_saga = + client.saga_demo_create().await.context("creating demo saga")?; + println!("saga id: {}", demo_saga.saga_id); + println!( + "demo saga id: {} (use this with `demo-complete`)", + demo_saga.demo_saga_id.to_string(), + ); + Ok(()) +} + +/// Runs `omdb nexus sagas demo-complete` +async fn cmd_nexus_sagas_demo_complete( + client: &nexus_client::Client, + args: &DemoSagaIdArgs, +) -> Result<(), anyhow::Error> { + if let Err(error) = client + .saga_demo_complete(&args.demo_saga_id) + .await + .context("completing demo saga") + { + eprintln!("error: {:#}", error); + eprintln!( + "note: `demo-complete` must be run against the same Nexus \ + instance that is currently running that saga." + ); + eprintln!( + "note: Be sure that you're using the demo_saga_id, not the saga_id." 
+ ); + Err(error) + } else { + Ok(()) + } +} + /// Runs `omdb nexus sleds list-uninitialized` async fn cmd_nexus_sleds_list_uninitialized( client: &nexus_client::Client, diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index a6bf4d4667..899f12bb18 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -460,6 +460,22 @@ note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=d note: database schema version matches expected () note: listing all commissioned sleds (use -F to filter, e.g. -F in-service) ============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas", "list"] +termination: Exited(0) +--------------------------------------------- +stdout: +SAGA_ID STATE +--------------------------------------------- +stderr: +note: Nexus URL not specified. Will pick one from DNS. +note: using Nexus URL http://[::ffff:127.0.0.1]:REDACTED_PORT +WARNING: A Nexus instance was selected from DNS because a specific one was not +specified. But the `omdb nexus sagas` commands usually only make sense when +targeting a specific Nexus instance. +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. 
+============================================= EXECUTING COMMAND: omdb ["oximeter", "--oximeter-url", "junk", "list-producers"] termination: Exited(1) --------------------------------------------- diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index cec3fa3052..c5525a490a 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -609,6 +609,41 @@ warning: unknown background task: "vpc_route_manager" (don't know how to interpr stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ ============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas", "list"] +termination: Exited(0) +--------------------------------------------- +stdout: +SAGA_ID STATE +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas", "demo-create"] +termination: Exited(0) +--------------------------------------------- +stdout: +saga id: ..................... +demo saga id: ..................... (use this with `demo-complete`) +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas", "list"] +termination: Exited(0) +--------------------------------------------- +stdout: +SAGA_ID STATE +..................... running +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +NOTE: This command only reads in-memory state from the targeted Nexus instance. 
+Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +============================================= EXECUTING COMMAND: omdb ["--destructive", "nexus", "background-tasks", "activate", "inventory_collection"] termination: Exited(0) --------------------------------------------- diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 6a959d726a..c78b59f752 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -81,6 +81,7 @@ async fn test_omdb_usage_errors() { &["nexus"], &["nexus", "background-tasks"], &["nexus", "blueprints"], + &["nexus", "sagas"], &["nexus", "sleds"], &["sled-agent"], &["sled-agent", "zones"], @@ -134,6 +135,9 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { &["mgs", "inventory"], &["nexus", "background-tasks", "doc"], &["nexus", "background-tasks", "show"], + &["nexus", "sagas", "list"], + &["nexus", "sagas", "demo-create"], + &["nexus", "sagas", "list"], &[ "--destructive", "nexus", @@ -326,6 +330,16 @@ async fn test_omdb_env_settings(cptestctx: &ControlPlaneTestContext) { let args = &["--dns-server", &dns_sockaddr.to_string(), "db", "sleds"]; do_run(&mut output, move |exec| exec, &cmd_path, args).await; + // That said, the "sagas" command prints an extra warning in this case. 
+ let args = &["nexus", "sagas", "list"]; + do_run( + &mut output, + move |exec| exec.env("OMDB_DNS_SERVER", &dns_sockaddr.to_string()), + &cmd_path, + args, + ) + .await; + // Case: specified in multiple places (command-line argument wins) let args = &["oximeter", "--oximeter-url", "junk", "list-producers"]; let ox = ox_url.clone(); diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index d4ea5450a6..7eecbe5676 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -442,6 +442,7 @@ Usage: omdb nexus [OPTIONS] Commands: background-tasks print information about background tasks blueprints interact with blueprints + sagas view sagas, create and complete demo sagas sleds interact with sleds help Print this message or the help of the given subcommand(s) @@ -515,6 +516,34 @@ Connection Options: OMDB_NEXUS_URL=] --dns-server [env: OMDB_DNS_SERVER=] +Safety Options: + -w, --destructive Allow potentially-destructive subcommands +============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas"] +termination: Exited(2) +--------------------------------------------- +stdout: +--------------------------------------------- +stderr: +view sagas, create and complete demo sagas + +Usage: omdb nexus sagas [OPTIONS] + +Commands: + list List sagas run by this Nexus + demo-create Create a "demo" saga + demo-complete Complete a demo saga started with "demo-create" + help Print this message or the help of the given subcommand(s) + +Options: + --log-level log level filter [env: LOG_LEVEL=] [default: warn] + -h, --help Print help + +Connection Options: + --nexus-internal-url URL of the Nexus internal API [env: + OMDB_NEXUS_URL=] + --dns-server [env: OMDB_DNS_SERVER=] + Safety Options: -w, --destructive Allow potentially-destructive subcommands ============================================= diff --git a/nexus/internal-api/src/lib.rs b/nexus/internal-api/src/lib.rs index 
c6ade3b1a2..6a98c44614 100644 --- a/nexus/internal-api/src/lib.rs +++ b/nexus/internal-api/src/lib.rs @@ -23,7 +23,7 @@ use nexus_types::{ OximeterInfo, RackInitializationRequest, SledAgentInfo, SwitchPutRequest, SwitchPutResponse, }, - views::{BackgroundTask, Ipv4NatEntryView, Saga}, + views::{BackgroundTask, DemoSaga, Ipv4NatEntryView, Saga}, }, }; use omicron_common::{ @@ -39,7 +39,8 @@ use omicron_common::{ update::ArtifactId, }; use omicron_uuid_kinds::{ - DownstairsKind, SledUuid, TypedUuid, UpstairsKind, UpstairsRepairKind, + DemoSagaUuid, DownstairsKind, SledUuid, TypedUuid, UpstairsKind, + UpstairsRepairKind, }; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -282,6 +283,31 @@ pub trait NexusInternalApi { path_params: Path, ) -> Result, HttpError>; + /// Kick off an instance of the "demo" saga + /// + /// This saga is used for demo and testing. The saga just waits until you + /// complete using the `saga_demo_complete` API. + #[endpoint { + method = POST, + path = "/demo-saga", + }] + async fn saga_demo_create( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Complete a waiting demo saga + /// + /// Note that the id used here is not the same as the id of the saga. It's + /// the one returned by the `saga_demo_create` API. 
+ #[endpoint { + method = POST, + path = "/demo-saga/{demo_saga_id}/complete", + }] + async fn saga_demo_complete( + rqctx: RequestContext, + path_params: Path, + ) -> Result; + // Background Tasks /// List background tasks @@ -565,6 +591,12 @@ pub struct SagaPathParam { pub saga_id: Uuid, } +/// Path parameters for DemoSaga requests +#[derive(Deserialize, JsonSchema)] +pub struct DemoSagaPathParam { + pub demo_saga_id: DemoSagaUuid, +} + /// Path parameters for Background Task requests #[derive(Deserialize, JsonSchema)] pub struct BackgroundTaskPathParam { diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 60ed611bd7..5cfacd0c9c 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -7,6 +7,7 @@ use self::external_endpoints::NexusCertResolver; use self::saga::SagaExecutor; use crate::app::background::BackgroundTasksData; +use crate::app::background::SagaRecoveryHelpers; use crate::app::oximeter::LazyTimeseriesClient; use crate::populate::populate_start; use crate::populate::PopulateArgs; @@ -19,6 +20,7 @@ use nexus_config::NexusConfig; use nexus_config::RegionAllocationStrategy; use nexus_config::Tunables; use nexus_config::UpdatesConfig; +use nexus_db_model::AllSchemaVersions; use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; @@ -35,6 +37,7 @@ use std::net::SocketAddrV6; use std::net::{IpAddr, Ipv6Addr}; use std::sync::Arc; use std::sync::OnceLock; +use tokio::sync::mpsc; use uuid::Uuid; // The implementation of Nexus is large, and split into a number of submodules @@ -89,12 +92,9 @@ pub(crate) mod sagas; // TODO: When referring to API types, we should try to include // the prefix unless it is unambiguous. 
-pub(crate) use nexus_db_queries::db::queries::disk::MAX_DISKS_PER_INSTANCE; - -use crate::app::background::SagaRecoveryHelpers; -use nexus_db_model::AllSchemaVersions; pub(crate) use nexus_db_model::MAX_NICS_PER_INSTANCE; -use tokio::sync::mpsc; +pub(crate) use nexus_db_queries::db::queries::disk::MAX_DISKS_PER_INSTANCE; +use sagas::demo::CompletingDemoSagas; // XXX: Might want to recast as max *floating* IPs, we have at most one // ephemeral (so bounded in saga by design). @@ -204,6 +204,9 @@ pub struct Nexus { /// Default Crucible region allocation strategy default_region_allocation_strategy: RegionAllocationStrategy, + + /// List of demo sagas awaiting a request to complete them + demo_sagas: Arc>, } impl Nexus { @@ -480,6 +483,9 @@ impl Nexus { .pkg .default_region_allocation_strategy .clone(), + demo_sagas: Arc::new(std::sync::Mutex::new( + CompletingDemoSagas::new(), + )), }; // TODO-cleanup all the extra Arcs here seems wrong @@ -955,6 +961,17 @@ impl Nexus { } Ok(clients.into_iter().collect::>()) } + + pub(crate) fn demo_sagas( + &self, + ) -> Result, Error> { + self.demo_sagas.lock().map_err(|error| { + Error::internal_error(&format!( + "failed to acquire demo_sagas lock: {:#}", + error + )) + }) + } } /// For unimplemented endpoints, indicates whether the resource identified diff --git a/nexus/src/app/saga.rs b/nexus/src/app/saga.rs index 2b510a0f12..c35b8e1c3e 100644 --- a/nexus/src/app/saga.rs +++ b/nexus/src/app/saga.rs @@ -58,12 +58,14 @@ use futures::FutureExt; use futures::StreamExt; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; +use nexus_types::internal_api::views::DemoSaga; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::ListResult; use omicron_common::api::external::LookupResult; use omicron_common::api::external::ResourceType; use omicron_common::bail_unless; +use omicron_uuid_kinds::DemoSagaUuid; use std::sync::Arc; use 
std::sync::OnceLock; use steno::SagaDag; @@ -296,7 +298,6 @@ pub(crate) struct RunnableSaga { } impl RunnableSaga { - #[cfg(test)] pub(crate) fn id(&self) -> SagaId { self.id } @@ -463,4 +464,25 @@ impl super::Nexus { pub(crate) fn sec(&self) -> &steno::SecClient { &self.sagas.sec_client } + + pub(crate) async fn saga_demo_create(&self) -> Result<DemoSaga, Error> { + use crate::app::sagas::demo; + let demo_saga_id = DemoSagaUuid::new_v4(); + let saga_params = demo::Params { id: demo_saga_id }; + let saga_dag = create_saga_dag::<demo::SagaDemo>(saga_params)?; + let runnable_saga = self.sagas.saga_prepare(saga_dag).await?; + let saga_id = runnable_saga.id().0; + // We don't need the handle that runnable_saga.start() returns because + // we're not going to wait for the saga to finish here. + let _ = runnable_saga.start().await?; + Ok(DemoSaga { saga_id, demo_saga_id }) + } + + pub(crate) fn saga_demo_complete( + &self, + demo_saga_id: DemoSagaUuid, + ) -> Result<(), Error> { + let mut demo_sagas = self.demo_sagas()?; + demo_sagas.complete(demo_saga_id) + } } diff --git a/nexus/src/app/sagas/demo.rs b/nexus/src/app/sagas/demo.rs new file mode 100644 index 0000000000..f1817542d8 --- /dev/null +++ b/nexus/src/app/sagas/demo.rs @@ -0,0 +1,126 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Demo saga, used for testing and interactive debugging +//! +//! The "Demo" saga exists so that developers and automated tests can create a +//! saga that will not complete until they take some action to complete it. The +//! saga just waits until it gets the message that it should finish. Users +//! create demo sagas and complete them using requests to the internal API. +//! +//! The implementation is entirely in-memory, which means you have to send the +//! completion message to the Nexus that's running the saga. However, it does +//! 
work across Nexus restarts, so this can be used to exercise the saga +//! recovery path. +//! +//! It's tempting to build this only for development and not official releases, +//! but that'd be more work, there's little downside to always including it, and +//! it's conceivable that it'd be useful for production systems, too. + +use super::NexusActionContext; +use super::{ActionRegistry, NexusSaga, SagaInitError}; +use crate::app::sagas::declare_saga_actions; +use omicron_common::api::external::Error; +use omicron_uuid_kinds::DemoSagaUuid; +use serde::Deserialize; +use serde::Serialize; +use slog::info; +use std::collections::BTreeMap; +use steno::ActionError; +use tokio::sync::oneshot; + +/// Set of demo sagas that have been marked completed +/// +/// Nexus maintains one of these at the top level. Individual demo sagas wait +/// until their id shows up here, then remove it and proceed. +pub struct CompletingDemoSagas { + ids: BTreeMap<DemoSagaUuid, oneshot::Sender<()>>, +} + +impl CompletingDemoSagas { + pub fn new() -> CompletingDemoSagas { + CompletingDemoSagas { ids: BTreeMap::new() } + } + + pub fn complete(&mut self, id: DemoSagaUuid) -> Result<(), Error> { + self.ids + .remove(&id) + .ok_or_else(|| { + Error::non_resourcetype_not_found(format!( + "demo saga with id {:?}", + id + )) + })? + .send(()) + .map_err(|_| { + Error::internal_error( + "saga stopped listening (Nexus shutting down?)", + ) + }) + } + + pub fn subscribe(&mut self, id: DemoSagaUuid) -> oneshot::Receiver<()> { + let (tx, rx) = oneshot::channel(); + assert!( + self.ids.insert(id, tx).is_none(), + "multiple subscriptions for the same demo saga" + ); + rx + } +} + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub id: DemoSagaUuid, +} + +declare_saga_actions! 
{ + demo; + DEMO_WAIT -> "demo_wait" { + + demo_wait + } +} + +#[derive(Debug)] +pub(crate) struct SagaDemo; +impl NexusSaga for SagaDemo { + const NAME: &'static str = "demo"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + demo_register_actions(registry); + } + + fn make_saga_dag( + _params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(demo_wait_action()); + Ok(builder.build()?) + } +} + +async fn demo_wait(sagactx: NexusActionContext) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let demo_id = sagactx.saga_params::()?.id; + let log = osagactx.log(); + info!(log, "demo saga: begin wait"; "id" => %demo_id); + let rx = { + let mut demo_sagas = osagactx + .nexus() + .demo_sagas() + .map_err(ActionError::action_failed)?; + demo_sagas.subscribe(demo_id) + }; + match rx.await { + Ok(_) => { + info!(log, "demo saga: completing"; "id" => %demo_id); + } + Err(_) => { + info!(log, "demo saga: waiting failed (Nexus shutting down?)"; + "id" => %demo_id); + } + } + Ok(()) +} diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 17f43b4950..8e910a8d70 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -22,6 +22,7 @@ use steno::SagaType; use thiserror::Error; use uuid::Uuid; +pub mod demo; pub mod disk_create; pub mod disk_delete; pub mod finalize_disk; @@ -133,6 +134,7 @@ fn make_action_registry() -> ActionRegistry { let mut registry = steno::ActionRegistry::new(); registry.register(Arc::clone(&*ACTION_GENERATE_ID)); + ::register_actions(&mut registry); ::register_actions(&mut registry); ::register_actions(&mut registry); ::register_actions( diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 28ff712c24..2ac769b3f2 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -35,6 +35,7 @@ use 
nexus_types::internal_api::params::SwitchPutRequest; use nexus_types::internal_api::params::SwitchPutResponse; use nexus_types::internal_api::views::to_list; use nexus_types::internal_api::views::BackgroundTask; +use nexus_types::internal_api::views::DemoSaga; use nexus_types::internal_api::views::Ipv4NatEntryView; use nexus_types::internal_api::views::Saga; use omicron_common::api::external::http_pagination::data_page_params_for; @@ -530,6 +531,40 @@ impl NexusInternalApi for NexusInternalApiImpl { .await } + async fn saga_demo_create( + rqctx: RequestContext, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let nexus = &apictx.nexus; + let demo_saga = nexus.saga_demo_create().await?; + Ok(HttpResponseOk(demo_saga)) + }; + + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn saga_demo_complete( + rqctx: RequestContext, + path_params: Path, + ) -> Result { + let apictx = &rqctx.context().context; + let handler = async { + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + nexus.saga_demo_complete(path.demo_saga_id)?; + Ok(HttpResponseUpdatedNoContent()) + }; + + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + // Background Tasks async fn bgtask_list( diff --git a/nexus/tests/integration_tests/demo_saga.rs b/nexus/tests/integration_tests/demo_saga.rs new file mode 100644 index 0000000000..e81c7f6df0 --- /dev/null +++ b/nexus/tests/integration_tests/demo_saga.rs @@ -0,0 +1,74 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Smoke test for the demo saga + +use futures::TryStreamExt; +use nexus_client::types::Saga; +use nexus_client::types::SagaState; +use nexus_test_interface::NexusServer; +use nexus_test_utils_macros::nexus_test; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_test_utils::dev::poll::CondCheckError; +use std::time::Duration; + +type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + +// Tests that we can create a demo saga, then mark it completed, and the actual +// saga's state matches what we expect along the way. +#[nexus_test] +async fn test_demo_saga(cptestctx: &ControlPlaneTestContext) { + let log = &cptestctx.logctx.log; + let nexus_internal_url = format!( + "http://{}", + cptestctx.server.get_http_server_internal_address().await + ); + let nexus_client = + nexus_client::Client::new(&nexus_internal_url, log.clone()); + + let sagas_before = list_sagas(&nexus_client).await; + eprintln!("found sagas (before): {:?}", sagas_before); + let demo_saga = nexus_client.saga_demo_create().await.unwrap(); + let saga_id = demo_saga.saga_id; + assert!(sagas_before.into_iter().find(|s| s.id == saga_id).is_none()); + + let sagas_after = list_sagas(&nexus_client).await; + eprintln!("found sagas (after): {:?}", sagas_after); + let found = sagas_after.into_iter().find(|s| s.id == saga_id).unwrap(); + assert!(matches!(found.state, SagaState::Running)); + + // It is hard to verify that the saga is not going to complete by itself. + // No matter how long we wait and make sure it didn't complete, it might + // have completed after that. And then we've made the test suite take that + // much longer. But we can at least make sure that completing the saga + // does cause it to finish. + nexus_client.saga_demo_complete(&demo_saga.demo_saga_id).await.unwrap(); + + // Completion is not synchronous -- that just unblocked the saga. So we + // need to poll a bit to wait for it to actually finish. 
+ let found = wait_for_condition( + || async { + let sagas = list_sagas(&nexus_client).await; + eprintln!("found sagas (last): {:?}", sagas); + let found = sagas.into_iter().find(|s| s.id == saga_id).unwrap(); + if matches!(found.state, SagaState::Succeeded) { + Ok(found) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &Duration::from_millis(50), + &Duration::from_secs(30), + ) + .await + .unwrap(); + + assert_eq!(found.id, saga_id); + assert!(matches!(found.state, SagaState::Succeeded)); +} + +async fn list_sagas(client: &nexus_client::Client) -> Vec { + client.saga_list_stream(None, None).try_collect::>().await.unwrap() +} diff --git a/nexus/tests/integration_tests/mod.rs b/nexus/tests/integration_tests/mod.rs index 5054527c63..fdf14dbd07 100644 --- a/nexus/tests/integration_tests/mod.rs +++ b/nexus/tests/integration_tests/mod.rs @@ -11,6 +11,7 @@ mod basic; mod certificates; mod commands; mod console_api; +mod demo_saga; mod device_auth; mod disks; mod external_ips; diff --git a/nexus/types/src/internal_api/views.rs b/nexus/types/src/internal_api/views.rs index b71fd04779..a4557ffd31 100644 --- a/nexus/types/src/internal_api/views.rs +++ b/nexus/types/src/internal_api/views.rs @@ -9,6 +9,7 @@ use futures::stream::StreamExt; use omicron_common::api::external::MacAddr; use omicron_common::api::external::ObjectStream; use omicron_common::api::external::Vni; +use omicron_uuid_kinds::DemoSagaUuid; use schemars::JsonSchema; use serde::Serialize; use std::net::Ipv4Addr; @@ -152,6 +153,13 @@ impl From for SagaState { } } +/// Identifies an instance of the demo saga +#[derive(Clone, Debug, Serialize, JsonSchema)] +pub struct DemoSaga { + pub saga_id: Uuid, + pub demo_saga_id: DemoSagaUuid, +} + /// Background tasks /// /// These are currently only intended for observability by developers. 
We will diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 912ccbcf00..0eaa6dd1d2 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -364,6 +364,59 @@ } } }, + "/demo-saga": { + "post": { + "summary": "Kick off an instance of the \"demo\" saga", + "description": "This saga is used for demo and testing. The saga just waits until you complete using the `saga_demo_complete` API.", + "operationId": "saga_demo_create", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DemoSaga" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/demo-saga/{demo_saga_id}/complete": { + "post": { + "summary": "Complete a waiting demo saga", + "description": "Note that the id used here is not the same as the id of the saga. It's the one returned by the `saga_demo_create` API.", + "operationId": "saga_demo_complete", + "parameters": [ + { + "in": "path", + "name": "demo_saga_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForDemoSagaKind" + } + } + ], + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/deployment/blueprints/all": { "get": { "summary": "Lists blueprints", @@ -2624,6 +2677,23 @@ "kind" ] }, + "DemoSaga": { + "description": "Identifies an instance of the demo saga", + "type": "object", + "properties": { + "demo_saga_id": { + "$ref": "#/components/schemas/TypedUuidForDemoSagaKind" + }, + "saga_id": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "demo_saga_id", + "saga_id" + ] + }, "DiskIdentity": { "description": "Uniquely identifies a disk.", "type": "object", @@ -4966,6 +5036,10 @@ "SwitchPutResponse": { "type": "object" }, + 
"TypedUuidForDemoSagaKind": { + "type": "string", + "format": "uuid" + }, "TypedUuidForDownstairsRegionKind": { "type": "string", "format": "uuid" diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 8f695d2399..ba586c03a5 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -51,6 +51,7 @@ macro_rules! impl_typed_uuid_kind { impl_typed_uuid_kind! { Collection => "collection", Dataset => "dataset", + DemoSaga => "demo_saga", Downstairs => "downstairs", DownstairsRegion => "downstairs_region", ExternalIp => "external_ip", From c2ef4175e0638eab9302ad693927fa8942db03eb Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 9 Aug 2024 17:15:51 -0700 Subject: [PATCH 2/6] mark the demo saga commands destructive; fix lints --- dev-tools/omdb/src/bin/omdb/nexus.rs | 11 ++++++++--- dev-tools/omdb/tests/successes.out | 2 +- dev-tools/omdb/tests/test_all_output.rs | 11 ++++++++++- dev-tools/omdb/tests/usage_errors.out | 9 +++++++++ nexus/tests/integration_tests/demo_saga.rs | 2 +- 5 files changed, 29 insertions(+), 6 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 0a809c48f0..cacac49ee9 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -453,10 +453,13 @@ impl NexusArgs { match command { SagasCommands::List => cmd_nexus_sagas_list(&client).await, SagasCommands::DemoCreate => { - cmd_nexus_sagas_demo_create(&client).await + let token = omdb.check_allow_destructive()?; + cmd_nexus_sagas_demo_create(&client, token).await } SagasCommands::DemoComplete(args) => { - cmd_nexus_sagas_demo_complete(&client, args).await + let token = omdb.check_allow_destructive()?; + cmd_nexus_sagas_demo_complete(&client, args, token) + .await } } } @@ -1657,13 +1660,14 @@ async fn cmd_nexus_sagas_list( /// Runs `omdb nexus sagas demo-create` async fn cmd_nexus_sagas_demo_create( client: &nexus_client::Client, + _destruction_token: DestructiveOperationToken, ) -> Result<(), 
anyhow::Error> { let demo_saga = client.saga_demo_create().await.context("creating demo saga")?; println!("saga id: {}", demo_saga.saga_id); println!( "demo saga id: {} (use this with `demo-complete`)", - demo_saga.demo_saga_id.to_string(), + demo_saga.demo_saga_id, ); Ok(()) } @@ -1672,6 +1676,7 @@ async fn cmd_nexus_sagas_demo_create( async fn cmd_nexus_sagas_demo_complete( client: &nexus_client::Client, args: &DemoSagaIdArgs, + _destruction_token: DestructiveOperationToken, ) -> Result<(), anyhow::Error> { if let Err(error) = client .saga_demo_complete(&args.demo_saga_id) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index c5525a490a..c67871993e 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -621,7 +621,7 @@ NOTE: This command only reads in-memory state from the targeted Nexus instance. Sagas may be missing if they were run by a different Nexus instance or if they finished before this Nexus instance last started up. ============================================= -EXECUTING COMMAND: omdb ["nexus", "sagas", "demo-create"] +EXECUTING COMMAND: omdb ["--destructive", "nexus", "sagas", "demo-create"] termination: Exited(0) --------------------------------------------- stdout: diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index c78b59f752..d0258aeaed 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -82,6 +82,15 @@ async fn test_omdb_usage_errors() { &["nexus", "background-tasks"], &["nexus", "blueprints"], &["nexus", "sagas"], + // Missing "--destructive" flag. The URL is bogus but just ensures that + // we get far enough to hit the error we care about. 
+ &[ + "nexus", + "--nexus-internal-url", + "http://[::1]:111", + "sagas", + "demo-create", + ], &["nexus", "sleds"], &["sled-agent"], &["sled-agent", "zones"], @@ -136,7 +145,7 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { &["nexus", "background-tasks", "doc"], &["nexus", "background-tasks", "show"], &["nexus", "sagas", "list"], - &["nexus", "sagas", "demo-create"], + &["--destructive", "nexus", "sagas", "demo-create"], &["nexus", "sagas", "list"], &[ "--destructive", diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 7eecbe5676..10795b35b7 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -547,6 +547,15 @@ Connection Options: Safety Options: -w, --destructive Allow potentially-destructive subcommands ============================================= +EXECUTING COMMAND: omdb ["nexus", "--nexus-internal-url", "http://[::1]:111", "sagas", "demo-create"] +termination: Exited(1) +--------------------------------------------- +stdout: +--------------------------------------------- +stderr: +note: using Nexus URL http://[::1]:111 +Error: This command is potentially destructive. Pass the `-w` / `--destructive` flag to allow it. 
+============================================= EXECUTING COMMAND: omdb ["nexus", "sleds"] termination: Exited(2) --------------------------------------------- diff --git a/nexus/tests/integration_tests/demo_saga.rs b/nexus/tests/integration_tests/demo_saga.rs index e81c7f6df0..888fa35965 100644 --- a/nexus/tests/integration_tests/demo_saga.rs +++ b/nexus/tests/integration_tests/demo_saga.rs @@ -32,7 +32,7 @@ async fn test_demo_saga(cptestctx: &ControlPlaneTestContext) { eprintln!("found sagas (before): {:?}", sagas_before); let demo_saga = nexus_client.saga_demo_create().await.unwrap(); let saga_id = demo_saga.saga_id; - assert!(sagas_before.into_iter().find(|s| s.id == saga_id).is_none()); + assert!(!sagas_before.into_iter().any(|s| s.id == saga_id)); let sagas_after = list_sagas(&nexus_client).await; eprintln!("found sagas (after): {:?}", sagas_after); From 41cc6a62c550187071d0fba3743cd22af53619d2 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 12 Aug 2024 12:56:12 -0700 Subject: [PATCH 3/6] dont panic --- nexus/src/app/sagas/demo.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/sagas/demo.rs b/nexus/src/app/sagas/demo.rs index f1817542d8..4a8eda8b80 100644 --- a/nexus/src/app/sagas/demo.rs +++ b/nexus/src/app/sagas/demo.rs @@ -21,6 +21,7 @@ use super::NexusActionContext; use super::{ActionRegistry, NexusSaga, SagaInitError}; use crate::app::sagas::declare_saga_actions; +use anyhow::ensure; use omicron_common::api::external::Error; use omicron_uuid_kinds::DemoSagaUuid; use serde::Deserialize; @@ -60,13 +61,16 @@ impl CompletingDemoSagas { }) } - pub fn subscribe(&mut self, id: DemoSagaUuid) -> oneshot::Receiver<()> { + pub fn subscribe( + &mut self, + id: DemoSagaUuid, + ) -> Result<oneshot::Receiver<()>, anyhow::Error> { let (tx, rx) = oneshot::channel(); - assert!( + ensure!( self.ids.insert(id, tx).is_none(), "multiple subscriptions for the same demo saga" ); - rx + Ok(rx) } } @@ -111,7 +115,12 @@ async fn
demo_wait(sagactx: NexusActionContext) -> Result<(), ActionError> { .nexus() .demo_sagas() .map_err(ActionError::action_failed)?; - demo_sagas.subscribe(demo_id) + demo_sagas.subscribe(demo_id).map_err(|e| { + ActionError::action_failed(Error::internal_error(&format!( + "demo saga subscribe failed: {:#}", + e + ))) + })? }; match rx.await { Ok(_) => { From 9ee9d04fbd7b4b7ab3ebe2f9456c9801cbea5db5 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 12 Aug 2024 13:23:09 -0700 Subject: [PATCH 4/6] add doc --- docs/demo-saga.adoc | 202 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 docs/demo-saga.adoc diff --git a/docs/demo-saga.adoc b/docs/demo-saga.adoc new file mode 100644 index 0000000000..36521b0888 --- /dev/null +++ b/docs/demo-saga.adoc @@ -0,0 +1,202 @@ +:showtitle: +:numbered: +:toc: left + += Demo saga + +Nexus ships with a "demo" saga that can be used to interactively experiment with sagas, saga recovery, and saga transfer (after Nexus zone expungement). The demo saga consists of a single action that blocks until it's instructed to proceed. You instruct it to proceed using a request to the Nexus _internal_ API. + +In the example below, we'll: + +. Use `omicron-dev run-all` to run a simulated control plane stack +. Start a second Nexus whose execution we can control precisely +. Use the `omdb nexus sagas demo-create` command to kick off a demo saga +. Use the `omdb nexus sagas demo-complete` command to instruct that saga to finish + +For steps 1-2, we're just following the https://github.com/oxidecomputer/omicron/blob/main/docs/how-to-run-simulated.adoc#using-both-omicron-dev-run-all-and-running-nexus-manually[docs for running a simulated stack and a second Nexus]. 
First, run `omicron-dev run-all`: + +``` +$ cargo xtask omicron-dev run-all + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.62s + Running `target/debug/xtask omicron-dev run-all` + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.98s + Running `target/debug/omicron-dev run-all` +omicron-dev: setting up all services ... +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.0.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.0.log" +DB URL: postgresql://root@[::1]:43428/omicron?sslmode=disable +DB address: [::1]:43428 +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.2.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.2.log" +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.3.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.3.log" +omicron-dev: services are running. +omicron-dev: nexus external API: 127.0.0.1:12220 +omicron-dev: nexus internal API: [::1]:12221 +omicron-dev: cockroachdb pid: 7166 +omicron-dev: cockroachdb URL: postgresql://root@[::1]:43428/omicron?sslmode=disable +omicron-dev: cockroachdb directory: /dangerzone/omicron_tmp/.tmpkzPi6h +omicron-dev: internal DNS HTTP: http://[::1]:55952 +omicron-dev: internal DNS: [::1]:36474 +omicron-dev: external DNS name: oxide-dev.test +omicron-dev: external DNS HTTP: http://[::1]:64396 +omicron-dev: external DNS: [::1]:35977 +omicron-dev: e.g. `dig @::1 -p 35977 test-suite-silo.sys.oxide-dev.test` +omicron-dev: management gateway: http://[::1]:33325 (switch0) +omicron-dev: management gateway: http://[::1]:61144 (switch1) +omicron-dev: silo name: test-suite-silo +omicron-dev: privileged user name: test-privileged +``` + +Then follow those docs to configure and start a second Nexus: + +``` +$ cargo run --bin=nexus -- config-second.toml +... 
+Aug 12 20:16:25.405 INFO listening, local_addr: [::1]:12223, component: dropshot_internal, name: a4ef738a-1fb0-47b1-9da2-4919c7ec7c7f, file: /home/dap/.cargo/git/checkouts/dropshot-a4a923d29dccc492/52d900a/dropshot/src/server.rs:205 +... +``` + +The rest of these instructions will use `omdb` pointed at the second Nexus instance, so we'll set OMDB_NEXUS_URL in the environment: + +``` +$ export OMDB_NEXUS_URL=http://[::1]:12223 +``` + +Now we can use `omdb nexus sagas list` to list the sagas that have run _in that second Nexus process_ only: + +``` +$ cargo run --bin=omdb -- nexus sagas list + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.95s + Running `target/debug/omdb nexus sagas list` +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +``` + +Now we can create a demo saga: + +``` +$ cargo run --bin=omdb -- --destructive nexus sagas demo-create + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.98s + Running `target/debug/omdb --destructive nexus sagas demo-create` +note: using Nexus URL http://[::1]:12223 +saga id: f7765d6a-6e45-4c13-8904-2677b79a97eb +demo saga id: 88eddf09-dda3-4d70-8d99-1d3b441c57da (use this with `demo-complete`) +``` + +We have to use the `--destructive` option because this command by nature changes state in Nexus and `omdb` won't allow commands that change state by default. + +We can see the new saga in the list of sagas now. It's running: + +``` +$ cargo run --bin=omdb -- --destructive nexus sagas list + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.98s + Running `target/debug/omdb --destructive nexus sagas list` +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. 
+Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +f7765d6a-6e45-4c13-8904-2677b79a97eb running +``` + +and it will stay running indefinitely until we run `demo-complete`. Let's do that: + +``` +$ cargo run --bin=omdb -- --destructive nexus sagas demo-complete 88eddf09-dda3-4d70-8d99-1d3b441c57da + Finished `dev` profile [unoptimized + debuginfo] target(s) in 1.04s + Running `target/debug/omdb --destructive nexus sagas demo-complete 88eddf09-dda3-4d70-8d99-1d3b441c57da` +note: using Nexus URL http://[::1]:12223 +``` + +and then list sagas again: + +``` +$ cargo run --bin=omdb -- nexus sagas list + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.97s + Running `target/debug/omdb nexus sagas list` +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +f7765d6a-6e45-4c13-8904-2677b79a97eb succeeded +``` + +It works across recovery, too. You can go through the same loop again, but this time kill Nexus and start it again: + +``` +77b79a97eb succeeded +dap@ivanova omicron-merge $ cargo run --bin=omdb -- --destructive nexus sagas demo-create + Finished `dev` profile [unoptimized + debuginfo] target(s) in 1.00s + Running `target/debug/omdb --destructive nexus sagas demo-create` +note: using Nexus URL http://[::1]:12223 +saga id: 65253cb6-4428-4aa7-9afc-bf9b42166cb5 +demo saga id: 208ebc89-acc6-42d3-9f40-7f5567c8a39b (use this with `demo-complete`) +``` + +Now restart Nexus (^C the second invocation and run it again). Now if we use `omdb we don't see the earlier saga because it was finished when this new Nexus process started. 
But we see the one we created later because it was recovered: + +``` +dap@ivanova omicron-merge $ cargo run --bin=omdb -- nexus sagas list + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.97s + Running `target/debug/omdb nexus sagas list` +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +65253cb6-4428-4aa7-9afc-bf9b42166cb5 running +``` + +Side note: we can see it was recovered: + +``` +$ cargo run --bin=omdb -- nexus background-tasks show +... +task: "saga_recovery" + configured period: every 10m + currently executing: no + last completed activation: iter 1, triggered by a periodic timer firing + started at 2024-08-12T20:20:41.714Z (44s ago) and ran for 79ms + since Nexus started: + sagas recovered: 1 + sagas recovery errors: 0 + sagas observed started: 0 + sagas inferred finished: 0 + missing from SEC: 0 + bad state in SEC: 0 + last pass: + found sagas: 1 (in-progress, assigned to this Nexus) + recovered: 1 (successfully) + failed: 0 + skipped: 0 (already running) + removed: 0 (newly finished) + recently recovered sagas (1): + TIME SAGA_ID + 2024-08-12T20:20:41Z 65253cb6-4428-4aa7-9afc-bf9b42166cb5 + no saga recovery failures +... 
+``` + +Now we can complete that saga: + +``` +$ cargo run --bin=omdb -- --destructive nexus sagas demo-complete 208ebc89-acc6-42d3-9f40-7f5567c8a39b + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.98s + Running `target/debug/omdb --destructive nexus sagas demo-complete 208ebc89-acc6-42d3-9f40-7f5567c8a39b` +note: using Nexus URL http://[::1]:12223 + +$ cargo run --bin=omdb -- nexus sagas list + Finished `dev` profile [unoptimized + debuginfo] target(s) in 1.00s + Running `target/debug/omdb nexus sagas list` +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +65253cb6-4428-4aa7-9afc-bf9b42166cb5 succeeded +``` From e199de753fdca3075e242fa9008c8c43f335105f Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 12 Aug 2024 13:27:54 -0700 Subject: [PATCH 5/6] docs nits --- docs/demo-saga.adoc | 63 ++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/docs/demo-saga.adoc b/docs/demo-saga.adoc index 36521b0888..b71d23d8fd 100644 --- a/docs/demo-saga.adoc +++ b/docs/demo-saga.adoc @@ -15,12 +15,9 @@ In the example below, we'll: For steps 1-2, we're just following the https://github.com/oxidecomputer/omicron/blob/main/docs/how-to-run-simulated.adoc#using-both-omicron-dev-run-all-and-running-nexus-manually[docs for running a simulated stack and a second Nexus]. First, run `omicron-dev run-all`: -``` +```terminal $ cargo xtask omicron-dev run-all - Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.62s - Running `target/debug/xtask omicron-dev run-all` - Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.98s - Running `target/debug/omicron-dev run-all` +... omicron-dev: setting up all services ... 
log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.0.log note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.0.log" @@ -50,7 +47,7 @@ omicron-dev: privileged user name: test-privileged Then follow those docs to configure and start a second Nexus: -``` +```terminal $ cargo run --bin=nexus -- config-second.toml ... Aug 12 20:16:25.405 INFO listening, local_addr: [::1]:12223, component: dropshot_internal, name: a4ef738a-1fb0-47b1-9da2-4919c7ec7c7f, file: /home/dap/.cargo/git/checkouts/dropshot-a4a923d29dccc492/52d900a/dropshot/src/server.rs:205 @@ -59,16 +56,15 @@ Aug 12 20:16:25.405 INFO listening, local_addr: [::1]:12223, component: dropshot The rest of these instructions will use `omdb` pointed at the second Nexus instance, so we'll set OMDB_NEXUS_URL in the environment: -``` +```terminal $ export OMDB_NEXUS_URL=http://[::1]:12223 ``` Now we can use `omdb nexus sagas list` to list the sagas that have run _in that second Nexus process_ only: -``` +```terminal $ cargo run --bin=omdb -- nexus sagas list - Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.95s - Running `target/debug/omdb nexus sagas list` +... note: using Nexus URL http://[::1]:12223 NOTE: This command only reads in-memory state from the targeted Nexus instance. Sagas may be missing if they were run by a different Nexus instance or if they @@ -78,10 +74,9 @@ SAGA_ID STATE Now we can create a demo saga: -``` +```terminal $ cargo run --bin=omdb -- --destructive nexus sagas demo-create - Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.98s - Running `target/debug/omdb --destructive nexus sagas demo-create` +... note: using Nexus URL http://[::1]:12223 saga id: f7765d6a-6e45-4c13-8904-2677b79a97eb demo saga id: 88eddf09-dda3-4d70-8d99-1d3b441c57da (use this with `demo-complete`) @@ -91,10 +86,9 @@ We have to use the `--destructive` option because this command by nature changes We can see the new saga in the list of sagas now. 
It's running: -``` -$ cargo run --bin=omdb -- --destructive nexus sagas list - Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.98s - Running `target/debug/omdb --destructive nexus sagas list` +```terminal +$ cargo run --bin=omdb -- nexus sagas list +... note: using Nexus URL http://[::1]:12223 NOTE: This command only reads in-memory state from the targeted Nexus instance. Sagas may be missing if they were run by a different Nexus instance or if they @@ -105,19 +99,17 @@ f7765d6a-6e45-4c13-8904-2677b79a97eb running and it will stay running indefinitely until we run `demo-complete`. Let's do that: -``` +```terminal $ cargo run --bin=omdb -- --destructive nexus sagas demo-complete 88eddf09-dda3-4d70-8d99-1d3b441c57da - Finished `dev` profile [unoptimized + debuginfo] target(s) in 1.04s - Running `target/debug/omdb --destructive nexus sagas demo-complete 88eddf09-dda3-4d70-8d99-1d3b441c57da` +... note: using Nexus URL http://[::1]:12223 ``` and then list sagas again: -``` +```terminal $ cargo run --bin=omdb -- nexus sagas list - Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.97s - Running `target/debug/omdb nexus sagas list` +... note: using Nexus URL http://[::1]:12223 NOTE: This command only reads in-memory state from the targeted Nexus instance. Sagas may be missing if they were run by a different Nexus instance or if they @@ -128,11 +120,9 @@ f7765d6a-6e45-4c13-8904-2677b79a97eb succeeded It works across recovery, too. You can go through the same loop again, but this time kill Nexus and start it again: -``` -77b79a97eb succeeded -dap@ivanova omicron-merge $ cargo run --bin=omdb -- --destructive nexus sagas demo-create - Finished `dev` profile [unoptimized + debuginfo] target(s) in 1.00s - Running `target/debug/omdb --destructive nexus sagas demo-create` +```terminal +$ cargo run --bin=omdb -- --destructive nexus sagas demo-create +... 
note: using Nexus URL http://[::1]:12223 saga id: 65253cb6-4428-4aa7-9afc-bf9b42166cb5 demo saga id: 208ebc89-acc6-42d3-9f40-7f5567c8a39b (use this with `demo-complete`) @@ -140,10 +130,9 @@ demo saga id: 208ebc89-acc6-42d3-9f40-7f5567c8a39b (use this with `demo-complete Now restart Nexus (^C the second invocation and run it again). Now if we use `omdb we don't see the earlier saga because it was finished when this new Nexus process started. But we see the one we created later because it was recovered: -``` -dap@ivanova omicron-merge $ cargo run --bin=omdb -- nexus sagas list - Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.97s - Running `target/debug/omdb nexus sagas list` +```terminal +$ cargo run --bin=omdb -- nexus sagas list +... note: using Nexus URL http://[::1]:12223 NOTE: This command only reads in-memory state from the targeted Nexus instance. Sagas may be missing if they were run by a different Nexus instance or if they @@ -154,7 +143,7 @@ SAGA_ID STATE Side note: we can see it was recovered: -``` +```terminal $ cargo run --bin=omdb -- nexus background-tasks show ... task: "saga_recovery" @@ -184,15 +173,13 @@ task: "saga_recovery" Now we can complete that saga: -``` +```terminal $ cargo run --bin=omdb -- --destructive nexus sagas demo-complete 208ebc89-acc6-42d3-9f40-7f5567c8a39b - Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.98s - Running `target/debug/omdb --destructive nexus sagas demo-complete 208ebc89-acc6-42d3-9f40-7f5567c8a39b` +... note: using Nexus URL http://[::1]:12223 $ cargo run --bin=omdb -- nexus sagas list - Finished `dev` profile [unoptimized + debuginfo] target(s) in 1.00s - Running `target/debug/omdb nexus sagas list` +... note: using Nexus URL http://[::1]:12223 NOTE: This command only reads in-memory state from the targeted Nexus instance. 
Sagas may be missing if they were run by a different Nexus instance or if they From de786b11949be660e03a5d56f2c01dfd7005c455 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 12 Aug 2024 13:31:48 -0700 Subject: [PATCH 6/6] more doc nits --- docs/demo-saga.adoc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/demo-saga.adoc b/docs/demo-saga.adoc index b71d23d8fd..316050fc23 100644 --- a/docs/demo-saga.adoc +++ b/docs/demo-saga.adoc @@ -128,7 +128,7 @@ saga id: 65253cb6-4428-4aa7-9afc-bf9b42166cb5 demo saga id: 208ebc89-acc6-42d3-9f40-7f5567c8a39b (use this with `demo-complete`) ``` -Now restart Nexus (^C the second invocation and run it again). Now if we use `omdb we don't see the earlier saga because it was finished when this new Nexus process started. But we see the one we created later because it was recovered: +Now restart Nexus (^C the second invocation and run it again). Now if we use `omdb` we don't see the earlier saga because it was finished when this new Nexus process started. But we see the one we created later because it was recovered: ```terminal $ cargo run --bin=omdb -- nexus sagas list @@ -177,7 +177,11 @@ Now we can complete that saga: $ cargo run --bin=omdb -- --destructive nexus sagas demo-complete 208ebc89-acc6-42d3-9f40-7f5567c8a39b ... note: using Nexus URL http://[::1]:12223 +``` + +and see it finish: +```terminal $ cargo run --bin=omdb -- nexus sagas list ... note: using Nexus URL http://[::1]:12223 @@ -187,3 +191,5 @@ finished before this Nexus instance last started up. SAGA_ID STATE 65253cb6-4428-4aa7-9afc-bf9b42166cb5 succeeded ``` + +Note too that the completion is not synchronous with the `demo-complete` command, though it usually _is_ pretty quick. It's possible you'll catch it `running` if you run `nexus sagas list` right after running `nexus sagas demo-complete`, but you should quickly see it `succeeded` if you keep running `nexus sagas list`.