From bb9d3dfefc80c5d01fd09fe0415187d70a219142 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 6 Mar 2024 17:06:49 -0800 Subject: [PATCH 01/34] WIP: initial changes to add external DNS generation --- nexus/db-model/src/deployment.rs | 3 ++ nexus/db-model/src/schema.rs | 1 + .../db-queries/src/db/datastore/deployment.rs | 18 ++++++++++-- nexus/reconfigurator/execution/src/dns.rs | 2 ++ .../execution/src/omicron_zones.rs | 1 + .../planning/src/blueprint_builder.rs | 29 +++++++++++++++++-- nexus/reconfigurator/planning/src/planner.rs | 20 +++++++++++-- .../src/app/background/blueprint_execution.rs | 5 ++-- nexus/src/app/background/blueprint_load.rs | 1 + nexus/src/app/deployment.rs | 16 ++++++++-- nexus/types/src/deployment.rs | 8 ++++- openapi/nexus-internal.json | 18 ++++++++++++ schema/crdb/dbinit.sql | 5 +++- 13 files changed, 113 insertions(+), 14 deletions(-) diff --git a/nexus/db-model/src/deployment.rs b/nexus/db-model/src/deployment.rs index a1f285fbef..2a0df611b5 100644 --- a/nexus/db-model/src/deployment.rs +++ b/nexus/db-model/src/deployment.rs @@ -24,6 +24,7 @@ pub struct Blueprint { pub id: Uuid, pub parent_blueprint_id: Option, pub internal_dns_version: Generation, + pub external_dns_version: Generation, pub time_created: DateTime, pub creator: String, pub comment: String, @@ -35,6 +36,7 @@ impl From<&'_ nexus_types::deployment::Blueprint> for Blueprint { id: bp.id, parent_blueprint_id: bp.parent_blueprint_id, internal_dns_version: Generation(bp.internal_dns_version), + external_dns_version: Generation(bp.external_dns_version), time_created: bp.time_created, creator: bp.creator.clone(), comment: bp.comment.clone(), @@ -48,6 +50,7 @@ impl From for nexus_types::deployment::BlueprintMetadata { id: value.id, parent_blueprint_id: value.parent_blueprint_id, internal_dns_version: *value.internal_dns_version, + external_dns_version: *value.external_dns_version, time_created: value.time_created, creator: value.creator, comment: value.comment, diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 55d3e9b43f..6b1d8d422e 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1425,6 +1425,7 @@ table! { comment -> Text, internal_dns_version -> Int8, + external_dns_version -> Int8, } } diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 020916928d..7d7552584e 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -236,6 +236,7 @@ impl DataStore { let ( parent_blueprint_id, internal_dns_version, + external_dns_version, time_created, creator, comment, @@ -258,6 +259,7 @@ impl DataStore { ( blueprint.parent_blueprint_id, *blueprint.internal_dns_version, + *blueprint.external_dns_version, blueprint.time_created, blueprint.creator, blueprint.comment, @@ -487,6 +489,7 @@ impl DataStore { zones_in_service, parent_blueprint_id, internal_dns_version, + external_dns_version, time_created, creator, comment, @@ -1186,6 +1189,7 @@ mod tests { let blueprint = BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &policy, "test", ) @@ -1220,6 +1224,7 @@ mod tests { let blueprint1 = BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &EMPTY_POLICY, "test", ) @@ -1347,10 +1352,12 @@ mod tests { // Create a builder for a child blueprint. While we're at it, use a // different DNS version to test that that works. 
- let new_dns_version = blueprint1.internal_dns_version.next(); + let new_internal_dns_version = blueprint1.internal_dns_version.next(); + let new_external_dns_version = new_internal_dns_version.next(); let mut builder = BlueprintBuilder::new_based_on( &blueprint1, - new_dns_version, + new_internal_dns_version, + new_external_dns_version, &policy, "test", ) @@ -1402,7 +1409,8 @@ mod tests { .expect("failed to read collection back"); println!("diff: {}", blueprint2.diff_sleds(&blueprint_read)); assert_eq!(blueprint2, blueprint_read); - assert_eq!(blueprint2.internal_dns_version, new_dns_version); + assert_eq!(blueprint2.internal_dns_version, new_internal_dns_version); + assert_eq!(blueprint2.external_dns_version, new_external_dns_version); { let mut expected_ids = [blueprint1.id, blueprint2.id]; expected_ids.sort(); @@ -1495,6 +1503,7 @@ mod tests { let blueprint1 = BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &EMPTY_POLICY, "test1", ) @@ -1502,6 +1511,7 @@ mod tests { let blueprint2 = BlueprintBuilder::new_based_on( &blueprint1, Generation::new(), + Generation::new(), &EMPTY_POLICY, "test2", ) @@ -1510,6 +1520,7 @@ mod tests { let blueprint3 = BlueprintBuilder::new_based_on( &blueprint1, Generation::new(), + Generation::new(), &EMPTY_POLICY, "test3", ) @@ -1606,6 +1617,7 @@ mod tests { let blueprint4 = BlueprintBuilder::new_based_on( &blueprint3, Generation::new(), + Generation::new(), &EMPTY_POLICY, "test3", ) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 0fa8eb1c10..8ef04c0f0c 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -360,6 +360,7 @@ mod test { BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &policy, "test-suite", ) @@ -434,6 +435,7 @@ mod test { let mut blueprint = BlueprintBuilder::build_initial_from_collection( &collection, initial_dns_generation, + Generation::new(), &policy, "test-suite", ) diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index 1d5c4444b1..d453f0eb23 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -124,6 +124,7 @@ mod test { zones_in_service: BTreeSet::new(), parent_blueprint_id: None, internal_dns_version: Generation::new(), + external_dns_version: Generation::new(), time_created: chrono::Utc::now(), creator: "test".to_string(), comment: "test blueprint".to_string(), diff --git a/nexus/reconfigurator/planning/src/blueprint_builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder.rs index d58d798770..9a82a3720a 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder.rs @@ -101,6 +101,7 @@ pub struct BlueprintBuilder<'a> { /// previous blueprint, on which this one will be based parent_blueprint: &'a Blueprint, internal_dns_version: Generation, + external_dns_version: Generation, // These fields are used to allocate resources from sleds. 
policy: &'a Policy, @@ -130,6 +131,7 @@ impl<'a> BlueprintBuilder<'a> { pub fn build_initial_from_collection( collection: &'a Collection, internal_dns_version: Generation, + external_dns_version: Generation, policy: &'a Policy, creator: &str, ) -> Result { @@ -177,6 +179,7 @@ impl<'a> BlueprintBuilder<'a> { zones_in_service, parent_blueprint_id: None, internal_dns_version, + external_dns_version, time_created: now_db_precision(), creator: creator.to_owned(), comment: format!("from collection {}", collection.id), @@ -188,6 +191,7 @@ impl<'a> BlueprintBuilder<'a> { pub fn new_based_on( parent_blueprint: &'a Blueprint, internal_dns_version: Generation, + external_dns_version: Generation, policy: &'a Policy, creator: &str, ) -> anyhow::Result> { @@ -289,6 +293,7 @@ impl<'a> BlueprintBuilder<'a> { Ok(BlueprintBuilder { parent_blueprint, internal_dns_version, + external_dns_version, policy, sled_ip_allocators: BTreeMap::new(), zones: BlueprintZones::new(parent_blueprint), @@ -313,6 +318,7 @@ impl<'a> BlueprintBuilder<'a> { zones_in_service: self.zones_in_service, parent_blueprint_id: Some(self.parent_blueprint.id), internal_dns_version: self.internal_dns_version, + external_dns_version: self.external_dns_version, time_created: now_db_precision(), creator: self.creator, comment: self.comments.join(", "), @@ -950,6 +956,7 @@ pub mod test { BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &policy, "the_test", ) @@ -973,6 +980,7 @@ pub mod test { let builder = BlueprintBuilder::new_based_on( &blueprint_initial, Generation::new(), + Generation::new(), &policy, "test_basic", ) @@ -995,6 +1003,7 @@ pub mod test { let blueprint1 = BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &policy, "the_test", ) @@ -1004,6 +1013,7 @@ pub mod test { let mut builder = BlueprintBuilder::new_based_on( &blueprint1, Generation::new(), + Generation::new(), &policy, "test_basic", ) @@ -1038,6 +1048,7 @@ pub mod test { let mut builder = BlueprintBuilder::new_based_on( &blueprint2, Generation::new(), + Generation::new(), &policy, "test_basic", ) @@ -1112,8 +1123,9 @@ pub mod test { fn test_add_nexus_with_no_existing_nexus_zones() { let (mut collection, policy) = example(DEFAULT_N_SLEDS); - // We don't care about the internal DNS version here. + // We don't care about the DNS versions here. let internal_dns_version = Generation::new(); + let external_dns_version = Generation::new(); // Adding a new Nexus zone currently requires copying settings from an // existing Nexus zone. If we remove all Nexus zones from the @@ -1128,6 +1140,7 @@ pub mod test { let parent = BlueprintBuilder::build_initial_from_collection( &collection, internal_dns_version, + external_dns_version, &policy, "test", ) @@ -1136,6 +1149,7 @@ pub mod test { let mut builder = BlueprintBuilder::new_based_on( &parent, internal_dns_version, + external_dns_version, &policy, "test", ) @@ -1163,8 +1177,9 @@ pub mod test { fn test_add_nexus_error_cases() { let (mut collection, policy) = example(DEFAULT_N_SLEDS); - // We don't care about the internal DNS version here. + // We don't care about the DNS versions here. 
let internal_dns_version = Generation::new(); + let external_dns_version = Generation::new(); // Remove the Nexus zone from one of the sleds so that // `sled_ensure_zone_nexus` can attempt to add a Nexus zone to @@ -1187,6 +1202,7 @@ pub mod test { let parent = BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &policy, "test", ) @@ -1198,6 +1214,7 @@ pub mod test { let mut builder = BlueprintBuilder::new_based_on( &parent, internal_dns_version, + external_dns_version, &policy, "test", ) @@ -1216,6 +1233,7 @@ pub mod test { let mut builder = BlueprintBuilder::new_based_on( &parent, internal_dns_version, + external_dns_version, &policy, "test", ) @@ -1248,6 +1266,7 @@ pub mod test { let mut builder = BlueprintBuilder::new_based_on( &parent, internal_dns_version, + external_dns_version, &policy, "test", ) @@ -1303,6 +1322,7 @@ pub mod test { let parent = BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &policy, "test", ) @@ -1311,6 +1331,7 @@ pub mod test { match BlueprintBuilder::new_based_on( &parent, Generation::new(), + Generation::new(), &policy, "test", ) { @@ -1352,6 +1373,7 @@ pub mod test { let parent = BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &policy, "test", ) @@ -1360,6 +1382,7 @@ pub mod test { match BlueprintBuilder::new_based_on( &parent, Generation::new(), + Generation::new(), &policy, "test", ) { @@ -1401,6 +1424,7 @@ pub mod test { let parent = BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &policy, "test", ) @@ -1409,6 +1433,7 @@ pub mod test { match BlueprintBuilder::new_based_on( &parent, Generation::new(), + Generation::new(), &policy, "test", ) { diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 0773fec2bf..bd48acedc1 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -41,6 +41,7 @@ impl<'a> Planner<'a> { log: Logger, parent_blueprint: &'a Blueprint, internal_dns_version: Generation, + external_dns_version: Generation, policy: &'a Policy, creator: &str, // NOTE: Right now, we just assume that this is the latest inventory @@ -50,6 +51,7 @@ impl<'a> Planner<'a> { let blueprint = BlueprintBuilder::new_based_on( parent_blueprint, internal_dns_version, + external_dns_version, policy, creator, )?; @@ -336,8 +338,9 @@ mod test { fn test_basic_add_sled() { let logctx = test_setup_log("planner_basic_add_sled"); - // For our purposes, we don't care about the internal DNS generation. + // For our purposes, we don't care about the DNS generations. let internal_dns_version = Generation::new(); + let external_dns_version = Generation::new(); // Use our example inventory collection. 
let (mut collection, mut policy) = example(DEFAULT_N_SLEDS); @@ -347,6 +350,7 @@ mod test { let blueprint1 = BlueprintBuilder::build_initial_from_collection( &collection, internal_dns_version, + external_dns_version, &policy, "the_test", ) @@ -360,6 +364,7 @@ mod test { logctx.log.clone(), &blueprint1, internal_dns_version, + external_dns_version, &policy, "no-op?", &collection, @@ -385,6 +390,7 @@ mod test { logctx.log.clone(), &blueprint2, internal_dns_version, + external_dns_version, &policy, "test: add NTP?", &collection, @@ -418,6 +424,7 @@ mod test { logctx.log.clone(), &blueprint3, internal_dns_version, + external_dns_version, &policy, "test: add nothing more", &collection, @@ -455,6 +462,7 @@ mod test { logctx.log.clone(), &blueprint3, internal_dns_version, + external_dns_version, &policy, "test: add Crucible zones?", &collection, @@ -491,6 +499,7 @@ mod test { logctx.log.clone(), &blueprint5, internal_dns_version, + external_dns_version, &policy, "test: no-op?", &collection, @@ -515,8 +524,9 @@ mod test { fn test_add_multiple_nexus_to_one_sled() { let logctx = test_setup_log("planner_add_multiple_nexus_to_one_sled"); - // For our purposes, we don't care about the internal DNS generation. + // For our purposes, we don't care about the DNS generations. let internal_dns_version = Generation::new(); + let external_dns_version = Generation::new(); // Use our example inventory collection as a starting point, but strip // it down to just one sled. @@ -540,6 +550,7 @@ mod test { let blueprint1 = BlueprintBuilder::build_initial_from_collection( &collection, internal_dns_version, + external_dns_version, &policy, "the_test", ) @@ -567,6 +578,7 @@ mod test { logctx.log.clone(), &blueprint1, internal_dns_version, + external_dns_version, &policy, "add more Nexus", &collection, @@ -611,6 +623,7 @@ mod test { let blueprint1 = BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &policy, "the_test", ) @@ -635,6 +648,7 @@ mod test { logctx.log.clone(), &blueprint1, Generation::new(), + Generation::new(), &policy, "add more Nexus", &collection, @@ -697,6 +711,7 @@ mod test { let blueprint1 = BlueprintBuilder::build_initial_from_collection( &collection, Generation::new(), + Generation::new(), &policy, "the_test", ) @@ -752,6 +767,7 @@ mod test { logctx.log.clone(), &blueprint1, Generation::new(), + Generation::new(), &policy, "add more Nexus", &collection, diff --git a/nexus/src/app/background/blueprint_execution.rs b/nexus/src/app/background/blueprint_execution.rs index 3c2530a3d3..3cfc6d4e7f 100644 --- a/nexus/src/app/background/blueprint_execution.rs +++ b/nexus/src/app/background/blueprint_execution.rs @@ -131,7 +131,7 @@ mod test { fn create_blueprint( omicron_zones: BTreeMap, - internal_dns_version: Generation, + dns_version: Generation, ) -> (BlueprintTarget, Blueprint) { let id = Uuid::new_v4(); ( @@ -145,7 +145,8 @@ mod test { omicron_zones, zones_in_service: BTreeSet::new(), parent_blueprint_id: None, - internal_dns_version, + internal_dns_version: dns_version, + external_dns_version: dns_version, time_created: chrono::Utc::now(), creator: "test".to_string(), comment: "test blueprint".to_string(), diff --git a/nexus/src/app/background/blueprint_load.rs b/nexus/src/app/background/blueprint_load.rs index 8886df81cd..6b4ac4e640 100644 --- a/nexus/src/app/background/blueprint_load.rs +++ b/nexus/src/app/background/blueprint_load.rs @@ -208,6 +208,7 @@ mod test { zones_in_service: BTreeSet::new(), parent_blueprint_id, internal_dns_version: 
Generation::new(),
+            external_dns_version: Generation::new(),
             time_created: now_db_precision(),
             creator: "test".to_string(),
             comment: "test blueprint".to_string(),
diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs
index 31ba9fe065..ea5bf2f060 100644
--- a/nexus/src/app/deployment.rs
+++ b/nexus/src/app/deployment.rs
@@ -45,6 +45,7 @@ struct PlanningContext {
     creator: String,
     inventory: Option<Collection>,
     internal_dns_version: Generation,
+    external_dns_version: Generation,
 }
 
 impl super::Nexus {
@@ -211,15 +212,21 @@ impl super::Nexus {
             "fetching latest inventory collection for blueprint planner",
         )?;
 
-        // Fetch the current internal DNS version. This could be made part of
+        // Fetch the current DNS versions. This could be made part of
         // inventory, but it's enough of a one-off that there's no particular
        // advantage to doing that work now.
-        let dns_version = datastore
+        let internal_dns_version = datastore
             .dns_group_latest_version(opctx, DnsGroup::Internal)
             .await
             .internal_context(
                 "fetching internal DNS version for blueprint planning",
             )?;
+        let external_dns_version = datastore
+            .dns_group_latest_version(opctx, DnsGroup::External)
+            .await
+            .internal_context(
+                "fetching external DNS version for blueprint planning",
+            )?;
 
         Ok(PlanningContext {
             creator,
@@ -229,7 +236,8 @@ impl super::Nexus {
                 target_nexus_zone_count: NEXUS_REDUNDANCY,
             },
             inventory,
-            internal_dns_version: *dns_version.version,
+            internal_dns_version: *internal_dns_version.version,
+            external_dns_version: *external_dns_version.version,
         })
     }
 
@@ -254,6 +262,7 @@ impl super::Nexus {
         let blueprint = BlueprintBuilder::build_initial_from_collection(
             &collection,
             planning_context.internal_dns_version,
+            planning_context.external_dns_version,
             &planning_context.policy,
             &planning_context.creator,
         )
@@ -289,6 +298,7 @@ impl super::Nexus {
             opctx.log.clone(),
             &parent_blueprint,
             planning_context.internal_dns_version,
+            planning_context.external_dns_version,
             &planning_context.policy,
             &planning_context.creator,
             &inventory,
diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs
index ef3c03a302..8862303c62 100644
--- a/nexus/types/src/deployment.rs
+++ b/nexus/types/src/deployment.rs
@@ -152,9 +152,13 @@ pub struct Blueprint {
     pub parent_blueprint_id: Option<Uuid>,
 
     /// internal DNS version when this blueprint was created
-    // See blueprint generation for more on this.
+    // See blueprint execution for more on this.
     pub internal_dns_version: Generation,
 
+    /// external DNS version when this blueprint was created
+    // See blueprint execution for more on this.
+    pub external_dns_version: Generation,
+
     /// when this blueprint was generated (for debugging)
     pub time_created: chrono::DateTime<chrono::Utc>,
     /// identity of the component that generated the blueprint (for debugging)
@@ -238,6 +242,8 @@ pub struct BlueprintMetadata {
     pub parent_blueprint_id: Option<Uuid>,
     /// internal DNS version when this blueprint was created
     pub internal_dns_version: Generation,
+    /// external DNS version when this blueprint was created
+    pub external_dns_version: Generation,
 
     /// when this blueprint was generated (for debugging)
     pub time_created: chrono::DateTime<chrono::Utc>,
diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json
index 53a53fb219..dda2cbae42 100644
--- a/openapi/nexus-internal.json
+++ b/openapi/nexus-internal.json
@@ -2119,6 +2119,14 @@
           "description": "identity of the component that generated the blueprint (for debugging) This would generally be the Uuid of a Nexus instance.",
           "type": "string"
         },
+        "external_dns_version": {
+          "description": "external DNS version when this blueprint was created",
+          "allOf": [
+            {
+              "$ref": "#/components/schemas/Generation"
+            }
+          ]
+        },
         "id": {
           "description": "unique identifier for this blueprint",
           "type": "string",
           "format": "uuid"
         },
@@ -2163,6 +2171,7 @@
       "required": [
         "comment",
         "creator",
+        "external_dns_version",
         "id",
         "internal_dns_version",
         "omicron_zones",
@@ -2182,6 +2191,14 @@
           "description": "identity of the component that generated the blueprint (for debugging) This would generally be the Uuid of a Nexus instance.",
           "type": "string"
         },
+        "external_dns_version": {
+          "description": "external DNS version when this blueprint was created",
+          "allOf": [
+            {
+              "$ref": "#/components/schemas/Generation"
+            }
+          ]
+        },
         "id": {
           "description": "unique identifier for this blueprint",
           "type": "string",
           "format": "uuid"
         },
@@ -2210,6 +2227,7 @@
       "required": [
         "comment",
         "creator",
+        "external_dns_version",
         "id",
         "internal_dns_version",
         "time_created"
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index 40a6fd463f..72592d1baf 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -3132,7 +3132,10 @@ CREATE TABLE IF NOT EXISTS omicron.public.blueprint (
     comment TEXT NOT NULL,
 
     -- identifies the latest internal DNS version when blueprint planning began
-    internal_dns_version INT8 NOT NULL
+    internal_dns_version INT8 NOT NULL,
+    -- identifies the latest external DNS version when blueprint planning began
+    -- XXX-dap migration code must set the value for existing blueprints
+    external_dns_version INT8 NOT NULL
 );
 
 -- table describing both the current and historical target blueprints of the
From c48aa07ee2409dcedf2a825415136e0406f28152 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Wed, 6 Mar 2024 17:07:00 -0800
Subject: [PATCH 02/34] WIP: initial changes to add external DNS to execution

---
 nexus/reconfigurator/execution/src/dns.rs | 124 ++++++++++++++++------
 1 file changed, 90 insertions(+), 34 deletions(-)

diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index 8ef04c0f0c..dd23822502 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -9,6 +9,7 @@ use dns_service_client::DnsDiff;
 use internal_dns::DnsConfigBuilder;
 use internal_dns::ServiceName;
 use nexus_db_model::DnsGroup;
+use nexus_db_model::Silo;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder;
 use nexus_db_queries::db::DataStore;
@@ -42,11 +43,15 @@ pub(crate) async fn deploy_dns(
     blueprint: &Blueprint,
     sleds_by_id: &BTreeMap<Uuid, Sled>,
 ) -> Result<(),
Error> { - // First, fetch the current DNS config. - let dns_config_current = datastore + // First, fetch the current DNS configs. + let internal_dns_config_current = datastore .dns_config_read(opctx, DnsGroup::Internal) .await - .internal_context("reading current DNS")?; + .internal_context("reading current DNS (internal)")?; + let external_dns_config_current = datastore + .dns_config_read(opctx, DnsGroup::External) + .await + .internal_context("reading current DNS (external)")?; // We could check here that the DNS version we found isn't newer than when // the blueprint was generated. But we have to check later when we try to @@ -55,18 +60,59 @@ pub(crate) async fn deploy_dns( // we know it's being hit when we exercise this condition. // Next, construct the DNS config represented by the blueprint. - let dns_config_blueprint = blueprint_dns_config(blueprint, sleds_by_id); + let internal_dns_config_blueprint = + blueprint_internal_dns_config(blueprint, sleds_by_id); + let silos = todo!(); // XXX-dap + let external_dns_config_blueprint = + blueprint_external_dns_config(blueprint, silos); + + // Deploy the changes. + deploy_dns_one( + opctx, + datastore, + creator, + blueprint, + &internal_dns_config_current, + &internal_dns_config_blueprint, + DnsGroup::Internal, + ) + .await?; + deploy_dns_one( + opctx, + datastore, + creator, + blueprint, + &external_dns_config_current, + &external_dns_config_blueprint, + DnsGroup::External, + ) + .await?; + Ok(()) +} + +pub(crate) async fn deploy_dns_one( + opctx: &OpContext, + datastore: &DataStore, + creator: String, + blueprint: &Blueprint, + dns_config_current: &DnsConfigParams, + dns_config_blueprint: &DnsConfigParams, + dns_group: DnsGroup, +) -> Result<(), Error> { + let log = opctx + .log + .new(o!("blueprint_execution" => format!("dns {:?}", dns_group))); // Looking at the current contents of DNS, prepare an update that will make // it match what it should be. - let log = opctx.log.new(o!("blueprint_execution" => "DNS")); let comment = format!("blueprint {} ({})", blueprint.id, blueprint.comment); let maybe_update = dns_compute_update( &log, + dns_group, comment, creator, - &dns_config_current, - &dns_config_blueprint, + dns_config_current, + dns_config_blueprint, )?; let Some(update) = maybe_update else { // Nothing to do. @@ -81,12 +127,11 @@ pub(crate) async fn deploy_dns( // executing a newer target blueprint. // // To avoid this problem, before generating a blueprint, Nexus fetches the - // current internal DNS generation and stores that into the blueprint - // itself. Here, when we execute the blueprint, we make our database update - // conditional on that still being the current internal DNS generation. - // If some other instance has already come along and updated the database, - // whether for this same blueprint or a newer one, our attempt to update the - // database will fail. + // current DNS generation and stores that into the blueprint itself. Here, + // when we execute the blueprint, we make our database update conditional on + // that still being the current DNS generation. If some other instance has + // already come along and updated the database, whether for this same + // blueprint or a newer one, our attempt to update the database will fail. // // Let's look at a tricky example. Suppose: // @@ -100,7 +145,7 @@ pub(crate) async fn deploy_dns( // that's still the current version in DNS. B3 is made the current // target. 
     //
-    // Assume B2 and B3 specify different internal DNS contents (e.g., have a
+    // Assume B2 and B3 specify different DNS contents (e.g., have a
     // different set of Omicron zones in them).
     //
     // 4. Nexus instance N1 finds B2 to be the current target and starts
     //    executing it. (Assume B1 was never executed.) Execution of B2 will
     //    attempt a conditional update (transaction) updating DNS from version
     //    3 to version 4.
     //
     // 5. Nexus instance N2 finds B3 to be the current target and starts
     //    executing it. It will also attempt a conditional update (transaction)
     //    updating DNS from version 3 to version 4.
     //
     // Now, one of two things could happen:
     //
     // 1. N1 wins. Its database update applies successfully. In the database,
-    //    the internal DNS version becomes version 4. In this case, N2 loses.
-    //    Its database operation fails altogether. At this point, any
-    //    subsequent attempt to execute blueprint B3 will fail because any DNS
-    //    update will be conditional on the database having version 3. The only
-    //    way out of this is for the planner to generate a new blueprint B4
-    //    that's exactly equivalent to B3 except that the stored internal DNS
-    //    version is 4. Then we'll be able to execute that.
+    //    the DNS version becomes version 4. In this case, N2 loses. Its
+    //    database operation fails altogether. At this point, any subsequent
+    //    attempt to execute blueprint B3 will fail because any DNS update will
+    //    be conditional on the database having version 3. The only way out of
+    //    this is for the planner to generate a new blueprint B4 that's exactly
+    //    equivalent to B3 except that the stored DNS version is 4. Then we'll
+    //    be able to execute that.
     //
     // 2. N2 wins. Its database update applies successfully. In the database,
-    //    the internal DNS version becomes version 4. In this case, N1 loses.
-    //    Its database operation fails altogether. At this point, any
-    //    subsequent attempt to execute blueprint B3 will fail because any DNS
-    //    update will be conditional on the databae having version 3. No
-    //    further action is needed, though, because we've successfully executed
-    //    the latest target blueprint.
+    //    the DNS version becomes version 4. In this case, N1 loses. Its
+    //    database operation fails altogether. At this point, any subsequent
+    //    attempt to execute blueprint B3 will fail because any DNS update will
+    //    be conditional on the database having version 3. No further action is
+    //    needed, though, because we've successfully executed the latest target
+    //    blueprint.
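+    //
+    // (Note the mechanism here: the update assembled by `dns_compute_update()`
+    // is an optimistic concurrency check -- roughly, "set the DNS version to 4
+    // only if it is still version 3" -- which is why exactly one of N1 and N2
+    // can succeed.)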
// // In both cases, the system will (1) converge to having successfully // executed the target blueprint, and (2) never have rolled any changes back @@ -149,7 +194,7 @@ pub(crate) async fn deploy_dns( let generation_u32 = u32::try_from(dns_config_current.generation).map_err(|e| { Error::internal_error(&format!( - "internal DNS generation got too large: {}", + "DNS generation got too large: {}", e, )) })?; @@ -159,7 +204,7 @@ pub(crate) async fn deploy_dns( } /// Returns the expected contents of internal DNS based on the given blueprint -pub fn blueprint_dns_config( +pub fn blueprint_internal_dns_config( blueprint: &Blueprint, sleds_by_id: &BTreeMap, ) -> DnsConfigParams { @@ -255,15 +300,22 @@ pub fn blueprint_dns_config( dns_builder.build() } +pub fn blueprint_external_dns_config( + blueprint: &Blueprint, + silos: Vec, +) -> DnsConfigParams { + todo!(); // XXX-dap +} + fn dns_compute_update( log: &slog::Logger, + dns_group: DnsGroup, comment: String, creator: String, current_config: &DnsConfigParams, new_config: &DnsConfigParams, ) -> Result, Error> { - let mut update = - DnsVersionUpdateBuilder::new(DnsGroup::Internal, comment, creator); + let mut update = DnsVersionUpdateBuilder::new(dns_group, comment, creator); let diff = DnsDiff::new(¤t_config, &new_config) .map_err(|e| Error::internal_error(&format!("{:#}", e)))?; @@ -315,11 +367,12 @@ fn dns_compute_update( #[cfg(test)] mod test { - use super::blueprint_dns_config; + use super::blueprint_internal_dns_config; use super::dns_compute_update; use crate::Sled; use internal_dns::ServiceName; use internal_dns::DNS_ZONE; + use nexus_db_model::DnsGroup; use nexus_inventory::CollectionBuilder; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_types::deployment::Blueprint; @@ -382,7 +435,8 @@ mod test { #[test] fn test_blueprint_dns_empty() { let blueprint = blueprint_empty(); - let blueprint_dns = blueprint_dns_config(&blueprint, &BTreeMap::new()); + let blueprint_dns = + blueprint_internal_dns_config(&blueprint, &BTreeMap::new()); assert!(blueprint_dns.sole_zone().unwrap().records.is_empty()); } @@ -480,7 +534,7 @@ mod test { .collect(); let dns_config_blueprint = - blueprint_dns_config(&blueprint, &sleds_by_id); + blueprint_internal_dns_config(&blueprint, &sleds_by_id); assert_eq!( dns_config_blueprint.generation, u64::from(initial_dns_generation.next()) @@ -666,6 +720,7 @@ mod test { let dns_empty = dns_config_empty(); match dns_compute_update( &logctx.log, + DnsGroup::Internal, "test-suite".to_string(), "test-suite".to_string(), &dns_empty, @@ -719,6 +774,7 @@ mod test { let update = dns_compute_update( &logctx.log, + DnsGroup::Internal, "test-suite".to_string(), "test-suite".to_string(), &dns_config1, From c7fad86b2a7da6296ddefd82722798fce9581740 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 6 Mar 2024 17:37:01 -0800 Subject: [PATCH 03/34] abandoned WIP: move nexus-external-endpoints into a crate --- Cargo.lock | 34 + Cargo.toml | 3 + nexus/Cargo.toml | 1 + nexus/external-endpoints/Cargo.toml | 39 + nexus/external-endpoints/build.rs | 10 + nexus/external-endpoints/src/lib.rs | 1540 +++++++++++++++++ nexus/reconfigurator/execution/Cargo.toml | 1 + nexus/reconfigurator/execution/src/dns.rs | 16 +- .../src/app/background/external_endpoints.rs | 4 +- nexus/src/app/external_endpoints.rs | 1525 +--------------- nexus/src/app/rack.rs | 2 +- nexus/src/app/silo.rs | 14 +- nexus/src/external_api/device_auth.rs | 2 +- 13 files changed, 1652 insertions(+), 1539 deletions(-) create mode 100644 
nexus/external-endpoints/Cargo.toml create mode 100644 nexus/external-endpoints/build.rs create mode 100644 nexus/external-endpoints/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 18c783037f..08c93bb798 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4433,6 +4433,38 @@ dependencies = [ "serde_json", ] +[[package]] +name = "nexus-external-endpoints" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "dropshot", + "hex", + "http 0.2.11", + "hyper 0.14.27", + "nexus-db-model", + "nexus-db-queries", + "nexus-types", + "omicron-common", + "omicron-rpaths", + "omicron-test-utils", + "omicron-workspace-hack", + "openssl", + "pq-sys", + "rcgen", + "reqwest", + "rustls 0.22.2", + "rustls-pemfile 2.1.0", + "schemars", + "serde", + "serde_with", + "slog", + "thiserror", + "tokio", + "uuid", +] + [[package]] name = "nexus-inventory" version = "0.1.0" @@ -4485,6 +4517,7 @@ dependencies = [ "nexus-config", "nexus-db-model", "nexus-db-queries", + "nexus-external-endpoints", "nexus-inventory", "nexus-reconfigurator-planning", "nexus-test-utils", @@ -5060,6 +5093,7 @@ dependencies = [ "nexus-db-model", "nexus-db-queries", "nexus-defaults", + "nexus-external-endpoints", "nexus-inventory", "nexus-reconfigurator-execution", "nexus-reconfigurator-planning", diff --git a/Cargo.toml b/Cargo.toml index 474739a932..fcb939cca2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,7 @@ members = [ "nexus/db-model", "nexus/db-queries", "nexus/defaults", + "nexus/external-endpoints", "nexus/inventory", "nexus/macros-common", "nexus/reconfigurator/execution", @@ -121,6 +122,7 @@ default-members = [ "nexus/db-model", "nexus/db-queries", "nexus/defaults", + "nexus/external-endpoints", "nexus/inventory", "nexus/reconfigurator/execution", "nexus/reconfigurator/planning", @@ -260,6 +262,7 @@ nexus-config = { path = "nexus-config" } nexus-db-model = { path = "nexus/db-model" } nexus-db-queries = { path = "nexus/db-queries" } nexus-defaults = { path = "nexus/defaults" } +nexus-external-endpoints = { path = "nexus/external-endpoints" } nexus-inventory = { path = "nexus/inventory" } nexus-macros-common = { path = "nexus/macros-common" } nexus-reconfigurator-execution = { path = "nexus/reconfigurator/execution" } diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index de79f3429d..271fd0866b 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -81,6 +81,7 @@ uuid.workspace = true nexus-defaults.workspace = true nexus-db-model.workspace = true nexus-db-queries.workspace = true +nexus-external-endpoints.workspace = true nexus-inventory.workspace = true nexus-reconfigurator-execution.workspace = true nexus-reconfigurator-planning.workspace = true diff --git a/nexus/external-endpoints/Cargo.toml b/nexus/external-endpoints/Cargo.toml new file mode 100644 index 0000000000..e356223e0b --- /dev/null +++ b/nexus/external-endpoints/Cargo.toml @@ -0,0 +1,39 @@ +[package] +name = "nexus-external-endpoints" +version = "0.1.0" +edition = "2021" + +[build-dependencies] +omicron-rpaths.workspace = true + +[dependencies] +anyhow.workspace = true +dropshot.workspace = true +hex.workspace = true +http.workspace = true +hyper.workspace = true +nexus-db-model.workspace = true +nexus-db-queries.workspace = true +nexus-types.workspace = true +omicron-common.workspace = true +openssl.workspace = true +# See omicron-rpaths for more about the "pq-sys" dependency. 
+pq-sys = "*"
+rcgen.workspace = true
+reqwest.workspace = true
+rustls.workspace = true
+rustls-pemfile.workspace = true
+serde_with.workspace = true
+serde.workspace = true
+slog.workspace = true
+thiserror.workspace = true
+tokio.workspace = true
+uuid.workspace = true
+
+omicron-workspace-hack.workspace = true
+
+[dev-dependencies]
+chrono.workspace = true
+http.workspace = true
+omicron-test-utils.workspace = true
+schemars.workspace = true
diff --git a/nexus/external-endpoints/build.rs b/nexus/external-endpoints/build.rs
new file mode 100644
index 0000000000..1ba9acd41c
--- /dev/null
+++ b/nexus/external-endpoints/build.rs
@@ -0,0 +1,10 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+// See omicron-rpaths for documentation.
+// NOTE: This file MUST be kept in sync with the other build.rs files in this
+// repository.
+fn main() {
+    omicron_rpaths::configure_default_omicron_rpaths();
+}
diff --git a/nexus/external-endpoints/src/lib.rs b/nexus/external-endpoints/src/lib.rs
new file mode 100644
index 0000000000..5136375cb8
--- /dev/null
+++ b/nexus/external-endpoints/src/lib.rs
@@ -0,0 +1,1540 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Management of external HTTPS endpoints
+//!
+//! Whenever a client connects to one of our external endpoints and attempts to
+//! establish a TLS session, we must provide a TLS certificate to authenticate
+//! ourselves to the client. But each Silo has a separate external DNS name and
+//! may have its own TLS certificate for that DNS name. These all resolve to
+//! the same set of IPs, so we cannot tell from the IP address alone which
+//! Silo's endpoint the client is trying to reach nor which certificate to
+//! present. TLS provides a mechanism called Server Name Indication (SNI) for
+//! clients to specify the name of the server they're trying to reach _before_
+//! the TLS session is established. We use this to determine which Silo
+//! endpoint the client is trying to reach and so which TLS certificate to
+//! present.
+//!
+//! To achieve this, we first need to know what DNS names, Silos, and TLS
+//! certificates are available at any given time. This is summarized in
+//! [`ExternalEndpoints`]. A background task is responsible for maintaining
+//! this, providing the latest version to whoever needs it via a `watch`
+//! channel. How do we tell the TLS stack what certificate to use? When
+//! setting up the Dropshot server in the first place, we provide a
+//! [`rustls::ServerConfig`] that describes various TLS settings, including a
+//! "certificate resolver" object that impls
+//! [`rustls::server::ResolvesServerCert`]. See [`NexusCertResolver`].
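+//!
+//! For illustration only: wiring the resolver into rustls looks roughly like
+//! this (a sketch -- the real setup lives in Nexus's HTTP server
+//! initialization, and the variable names here are made up):
+//!
+//! ```ignore
+//! let resolver = Arc::new(NexusCertResolver::new(log, config_rx));
+//! let tls_config = rustls::ServerConfig::builder()
+//!     .with_no_client_auth()
+//!     .with_cert_resolver(resolver);
+//! ```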
+ +use anyhow::anyhow; +use anyhow::bail; +use anyhow::Context; +use nexus_db_model::AuthenticationMode; +use nexus_db_model::Certificate; +use nexus_db_model::DnsGroup; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::datastore::Discoverability; +use nexus_db_queries::db::fixed_data::silo::SILO_ID; +use nexus_db_queries::db::model::ServiceKind; +use nexus_db_queries::db::DataStore; +use nexus_types::identity::Resource; +use omicron_common::api::external::http_pagination::PaginatedBy; +use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::Error; +use omicron_common::bail_unless; +use openssl::pkey::PKey; +use openssl::x509::X509; +use rustls::sign::CertifiedKey; +use serde::Serialize; +use serde_with::SerializeDisplay; +use slog::{debug, error, o, trace, warn}; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::fmt; +use std::num::NonZeroU32; +use std::sync::Arc; +use thiserror::Error; +use tokio::sync::watch; +use uuid::Uuid; + +/// Returns the (relative) DNS name for this Silo's API and console endpoints +/// _within_ the external DNS zone (i.e., without that zone's suffix) +/// +/// This specific naming scheme is determined under RFD 357. +pub fn silo_dns_name(name: &omicron_common::api::external::Name) -> String { + // RFD 4 constrains resource names (including Silo names) to DNS-safe + // strings, which is why it's safe to directly put the name of the + // resource into the DNS name rather than doing any kind of escaping. + format!("{}.sys", name) +} + +/// Describes the set of external endpoints, organized by DNS name +/// +/// This data structure provides a quick way to determine which Silo and TLS +/// certificate(s) make sense for an incoming request, based on the TLS +/// session's SNI (DNS name). See module-level docs for details. +/// +/// This object provides no interfaces outside this module. It's only used by +/// the `NexusCertResolver` that's also in this module. +/// +/// This structure impls `Serialize` only so that background tasks can easily +/// present the latest configuration that they've found (e.g., via a debug API) +#[derive(Clone, Debug, Eq, PartialEq, Serialize)] +pub struct ExternalEndpoints { + by_dns_name: BTreeMap>, + warnings: Vec, + default_endpoint: Option>, +} + +impl ExternalEndpoints { + /// Assemble a list of Silos, TLS certificates, and external DNS zones into + /// a structure that we can use for quickly figuring out which Silo and TLS + /// certificates are associated with each incoming DNS name + pub fn new( + silos: Vec, + certs: Vec, + external_dns_zones: Vec, + ) -> ExternalEndpoints { + // We want to avoid failing this operation even if we encounter problems + // because we want to serve as many DNS certificates as we can (so that + // an operator has a chance of fixing any problems that do exist). + // Instead of returning any errors, keep track of any issues as + // warnings. + let mut warnings = vec![]; + + // Compute a mapping from external DNS name to Silo id. Detect any + // duplicates and leave them out (but report them). There should not + // be any duplicates since the DNS names are constructed from the + // (unique) Silo names. Even if we support aliases in the future, they + // will presumably need to be unique, too. 
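+        //
+        // (Example: a Silo named "eng" in an external DNS zone named
+        // "oxide.example" yields the DNS name "eng.sys.oxide.example";
+        // the `silo_dns_name()` helper above supplies the "eng.sys" part.)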
+ let silos_by_id: BTreeMap> = silos + .into_iter() + .map(|db_silo| (db_silo.id(), Arc::new(db_silo))) + .collect(); + let mut dns_names: BTreeMap = BTreeMap::new(); + for z in external_dns_zones { + for (_, db_silo) in &silos_by_id { + let dns_name = format!( + "{}.{}", + silo_dns_name(db_silo.name()), + z.zone_name + ); + match dns_names.entry(dns_name.clone()) { + Entry::Vacant(vac) => { + vac.insert(db_silo.id()); + } + Entry::Occupied(occ) => { + let first_silo_id = *occ.get(); + let first_silo_name = silos_by_id + .get(&first_silo_id) + .unwrap() + .name() + .to_string(); + warnings.push(ExternalEndpointError::DupDnsName { + dup_silo_id: db_silo.id(), + dup_silo_name: db_silo.name().to_string(), + first_silo_id, + first_silo_name, + dns_name, + }) + } + }; + } + } + + // Compute a mapping from silo id to a list of usable TLS certificates + // for the Silo. By "usable" here, we just mean that we are capable of + // providing it to the client. This basically means that we can parse + // it. A certificate might be invalid for some other reason (e.g., does + // not match the right DNS name or it's expired). We may later choose + // to prefer some certificates over others, but that'll be decided later + // (see best_certificate()). And in the end it'll be better to provide + // an expired certificate than none at all. + let parsed_certificates = certs.into_iter().map(|db_cert| { + let silo_id = db_cert.silo_id; + let tls_cert = TlsCertificate::try_from(db_cert).map_err(|e| { + ExternalEndpointError::BadCert { silo_id, reason: Arc::new(e) } + })?; + let db_silo = silos_by_id + .get(&silo_id) + .ok_or_else(|| ExternalEndpointError::BadCert { + silo_id, + reason: Arc::new(anyhow!("silo not found")), + })? + .clone(); + Ok((silo_id, db_silo, tls_cert)) + }); + + let mut certs_by_silo_id = BTreeMap::new(); + for parsed_cert in parsed_certificates { + match parsed_cert { + Err(error) => { + warnings.push(error); + } + Ok((silo_id, db_silo, tls_cert)) => { + let silo_entry = certs_by_silo_id + .entry(silo_id) + .or_insert_with(|| ExternalEndpoint { + silo_id, + db_silo, + tls_certs: Vec::new(), + }); + silo_entry.tls_certs.push(tls_cert) + } + }; + } + + let certs_by_silo_id: BTreeMap<_, _> = certs_by_silo_id + .into_iter() + .map(|(k, v)| (k, Arc::new(v))) + .collect(); + + let by_dns_name: BTreeMap<_, _> = dns_names + .into_iter() + .map(|(dns_name, silo_id)| { + let silo_info = certs_by_silo_id + .get(&silo_id) + .cloned() + .unwrap_or_else(|| { + // For something to appear in `dns_names`, we must have + // found it in `silos`, and so it must be in + // `silos_by_id`. + let db_silo = + silos_by_id.get(&silo_id).unwrap().clone(); + Arc::new(ExternalEndpoint { + silo_id, + db_silo, + tls_certs: vec![], + }) + }); + + if silo_info.tls_certs.is_empty() { + warnings.push(ExternalEndpointError::NoSiloCerts { + silo_id, + dns_name: dns_name.clone(), + }) + } + + (dns_name, silo_info) + }) + .collect(); + + if by_dns_name.is_empty() { + warnings.push(ExternalEndpointError::NoEndpoints); + } + + // Pick a default endpoint. This will be used if a request arrives + // without specifying an endpoint via the HTTP/1.1 Host header or the + // HTTP2 URL. This is only intended for development, where external DNS + // may not be set up. + // + // We somewhat arbitrarily choose the first Silo we find that's not JIT. + // This would usually be the recovery Silo. + let default_endpoint = silos_by_id + .values() + .filter(|s| { + // Ignore the built-in Silo, which people are not supposed to + // log into. 
+ s.id() != *SILO_ID + }) + .find(|s| s.authentication_mode == AuthenticationMode::Local) + .and_then(|s| { + by_dns_name + .iter() + .find(|(_, endpoint)| endpoint.silo_id == s.id()) + .map(|(_, endpoint)| endpoint.clone()) + }); + + ExternalEndpoints { by_dns_name, warnings, default_endpoint } + } + + pub fn dns_names(&self) -> impl Iterator { + self.by_dns_name.keys() + } + + pub fn has_domain(&self, dns_name: &str) -> bool { + self.by_dns_name.contains_key(dns_name) + } + + pub fn ndomains(&self) -> usize { + self.by_dns_name.len() + } + + pub fn nwarnings(&self) -> usize { + self.warnings.len() + } +} + +/// Describes a single external "endpoint", by which we mean an external DNS +/// name that's associated with a particular Silo +#[derive(Debug, PartialEq, Eq, Serialize)] +pub struct ExternalEndpoint { + /// the id of the Silo associated with this endpoint + // This is redundant with `db_silo`, but it's convenient to put it here and + // it shows up in the serialized form this way. + silo_id: Uuid, + /// the silo associated with this endpoint + #[serde(skip)] + db_silo: Arc, + /// the set of TLS certificate chains that could be appropriate for this + /// endpoint + tls_certs: Vec, +} + +impl ExternalEndpoint { + pub fn silo(&self) -> &nexus_db_model::Silo { + &self.db_silo + } + + /// Chooses a TLS certificate (chain) to use when handling connections to + /// this endpoint + fn best_certificate(&self) -> Result<&TlsCertificate, anyhow::Error> { + // We expect the most common case to be that there's only one + // certificate chain here. The next most common case is that there are + // two because the administrator is in the process of rotating + // certificates, usually due to upcoming expiration. In principle, it + // would be useful to allow operators to control which certificate chain + // gets used, and maybe even do something like a canary to mitigate the + // risk of a botched certificate update. Absent that, we're going to do + // our best to pick the best chain automatically. + // + // This could be a lot more sophisticated than it is. We could try to + // avoid using certificates that are clearly not valid based on the + // "not_after" and "not_before" bounds. We could check each certificate + // in the chain, not just the last one. We could use a margin of error + // when doing this to account for small variations in the wall clock + // between us and the client. We could try to avoid using a certificate + // that doesn't appear to be compatible with the SNI value (DNS domain) + // that this request came in on. + // + // IMPORTANT: If we ever decide to do those things, they should only be + // used to decide which of several certificates is preferred. We should + // always pick a certificate if we possibly can, even if it seems to be + // invalid. A client can always choose not to trust it. But in the + // unfortunate case where there are no good certificates, a customer's + // only option may be to instruct their client to trust an invalid + // certificate _so that they can log in and fix the certificate + // problem_. If we provide no certificate at all here, a customer may + // have no way to fix the problem. + // + // Anyway, we don't yet do anything of these things. For now, pick the + // certificate chain whose leaf certificate has the latest expiration + // time. + + // This would be cleaner if Asn1Time impl'd Ord or even just a way to + // convert it to a Unix timestamp or any other comparable timestamp. 
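+        //
+        // (Concretely, the loop below keeps a candidate unless its
+        // not_after() compares strictly less than the best seen so far;
+        // incomparable results also keep the candidate, so at least one
+        // certificate is always chosen when any exist.)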
+ let mut latest_expiration: Option<&TlsCertificate> = None; + for t in &self.tls_certs { + // We'll choose this certificate (so far) if we find that it's + // anything other than "earlier" than the best we've seen so far. + // That includes the case where we haven't seen any so far, where + // this one is greater than or equal to the best so far, as well as + // the case where they're incomparable for whatever reason. (This + // ensures that we always pick at least one.) + if latest_expiration.is_none() + || !matches!( + t.parsed.not_after().partial_cmp( + latest_expiration.unwrap().parsed.not_after() + ), + Some(std::cmp::Ordering::Less) + ) + { + latest_expiration = Some(t); + } + } + + latest_expiration.ok_or_else(|| { + anyhow!("silo {} has no usable certificates", self.silo_id) + }) + } +} + +/// Describes a problem encountered while assembling an [`ExternalEndpoints`] +/// object +#[derive(Clone, Debug, Error, SerializeDisplay)] +pub enum ExternalEndpointError { + #[error( + "ignoring silo {dup_silo_id} ({dup_silo_name:?}): has the same DNS \ + name ({dns_name:?}) as previously-found silo {first_silo_id} \ + ({first_silo_name:?})" + )] + DupDnsName { + dup_silo_id: Uuid, + dup_silo_name: String, + first_silo_id: Uuid, + first_silo_name: String, + dns_name: String, + }, + + #[error("ignoring certificate for silo {silo_id}: {reason:#}")] + BadCert { + silo_id: Uuid, + #[source] + reason: Arc, + }, + + #[error( + "silo {silo_id} with DNS name {dns_name:?} has no usable certificates" + )] + NoSiloCerts { silo_id: Uuid, dns_name: String }, + + #[error("no external endpoints were found")] + NoEndpoints, +} + +impl Eq for ExternalEndpointError {} +impl PartialEq for ExternalEndpointError { + fn eq(&self, other: &Self) -> bool { + self.to_string() == other.to_string() + } +} + +/// A parsed, validated TLS certificate ready to use with an external TLS server +#[derive(Serialize)] +#[serde(transparent)] +struct TlsCertificate { + /// This is what we need to provide to the TLS stack when we decide to use + /// this certificate for an incoming TLS connection + // NOTE: It's important that we do not serialize the private key! + #[serde(skip)] + certified_key: Arc, + + /// Parsed representation of the whole certificate chain + /// + /// This is used to extract metadata like the expiration time. + // NOTE: It's important that we do not serialize the private key! + #[serde(skip)] + parsed: X509, + + /// certificate digest (historically sometimes called a "fingerprint") + // This is the only field that appears in the serialized output or debug + // output. + digest: String, +} + +impl fmt::Debug for TlsCertificate { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // It's important that only the digest appear in the debug output. We + // definitely don't want to leak the private key this way. Really, + // we don't want even the public parts adding noise to debug output. + f.debug_struct("TlsCertificate").field("digest", &self.digest).finish() + } +} + +impl Eq for TlsCertificate {} +impl PartialEq for TlsCertificate { + fn eq(&self, other: &Self) -> bool { + self.digest == other.digest + } +} + +impl TryFrom for TlsCertificate { + type Error = anyhow::Error; + + fn try_from(db_cert: Certificate) -> Result { + // Parse and validate what we've got. 
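+        // (The stored `cert` blob is a PEM stack expected in leaf-first
+        // order: the end-entity certificate first, then any intermediates.
+        // The `end_cert` extraction below relies on that ordering.)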
+ let certs_pem = openssl::x509::X509::stack_from_pem(&db_cert.cert) + .context("parsing PEM stack")?; + let private_key = PKey::private_key_from_pem(&db_cert.key) + .context("parsing private key PEM")?; + + // Assemble a rustls CertifiedKey with both the certificate and the key. + let certified_key = { + let mut cursor = std::io::Cursor::new(db_cert.key.clone()); + let rustls_private_key = rustls_pemfile::private_key(&mut cursor) + .expect("parsing private key PEM") + .expect("no private keys found"); + let rustls_signing_key = + rustls::crypto::ring::sign::any_supported_type( + &rustls_private_key, + ) + .context("parsing DER private key")?; + let rustls_certs = certs_pem + .iter() + .map(|x509| { + x509.to_der() + .context("serializing cert to DER") + .map(rustls::pki_types::CertificateDer::from) + }) + .collect::>()?; + Arc::new(CertifiedKey::new(rustls_certs, rustls_signing_key)) + }; + + let end_cert = certs_pem + .into_iter() + .next() + .ok_or_else(|| anyhow!("no certificates in PEM stack"))?; + anyhow::ensure!( + end_cert + .public_key() + .context("certificate publickey")? + .public_eq(&private_key), + "certificate public key does not match stored private key" + ); + + // Compute a digest (fingerprint) that we can use for debugging. + let digest = { + let digest_bytes = end_cert + .digest(openssl::hash::MessageDigest::sha256()) + .context("computing fingerprint")?; + hex::encode(&digest_bytes) + }; + + Ok(TlsCertificate { certified_key, digest, parsed: end_cert }) + } +} + +/// Read the lists of all Silos, external DNS zones, and external TLS +/// certificates from the database and assemble an `ExternalEndpoints` structure +/// that describes what DNS names exist, which Silos they correspond to, and +/// what TLS certificates can be used for them +// This structure is used to determine what TLS certificates are used for +// incoming connections to the external console/API endpoints. As such, it's +// critical that we produce a usable result if at all possible, even if it's +// incomplete. Otherwise, we won't be able to serve _any_ incoming connections +// to _any_ of our external endpoints! If data from the database is invalid or +// inconsistent, that data is discarded and a warning is produced, but we'll +// still return a usable object. +pub async fn read_all_endpoints( + datastore: &DataStore, + opctx: &OpContext, +) -> Result { + // We will not look for more than this number of external DNS zones, Silos, + // or certificates. We do not expect very many of any of these objects. + const MAX: u32 = 200; + let pagparams_id = DataPageParams { + marker: None, + limit: NonZeroU32::new(MAX).unwrap(), + direction: dropshot::PaginationOrder::Ascending, + }; + let pagbyid = PaginatedBy::Id(pagparams_id); + let pagparams_name = DataPageParams { + marker: None, + limit: NonZeroU32::new(MAX).unwrap(), + direction: dropshot::PaginationOrder::Ascending, + }; + + let silos = + datastore.silos_list(opctx, &pagbyid, Discoverability::All).await?; + let external_dns_zones = datastore + .dns_zones_list(opctx, DnsGroup::External, &pagparams_name) + .await?; + bail_unless!( + !external_dns_zones.is_empty(), + "expected at least one external DNS zone" + ); + let certs = datastore + .certificate_list_for(opctx, Some(ServiceKind::Nexus), &pagbyid, false) + .await?; + + // If we found too many of any of these things, complain as loudly as we + // can. Our results will be wrong. But we still don't want to fail if we + // can avoid it because we want to be able to serve as many endpoints as we + // can. 
+ // TODO-reliability we should prevent people from creating more than this + // maximum number of Silos and certificates. + let max = usize::try_from(MAX).unwrap(); + if silos.len() >= max { + error!( + &opctx.log, + "reading endpoints: expected at most {} silos, but found at \ + least {}. TLS may not work on some Silos' external endpoints.", + MAX, + silos.len(), + ); + } + if external_dns_zones.len() >= max { + error!( + &opctx.log, + "reading endpoints: expected at most {} external DNS zones, but \ + found at least {}. TLS may not work on some Silos' external \ + endpoints.", + MAX, + external_dns_zones.len(), + ); + } + if certs.len() >= max { + error!( + &opctx.log, + "reading endpoints: expected at most {} certificates, but \ + found at least {}. TLS may not work on some Silos' external \ + endpoints.", + MAX, + certs.len(), + ); + } + + Ok(ExternalEndpoints::new(silos, certs, external_dns_zones)) +} + +/// TLS SNI certificate resolver for use with rustls/Dropshot +/// +/// This object exists to impl `rustls::server::ResolvesServerCert`. This +/// object looks at an incoming TLS session's SNI field, matches it against the +/// latest `ExternalEndpoints` configuration (available via a watch channel), +/// and then determines which certificate (if any) to provide for the new +/// session. +/// +/// See the module-level comment for more details. +#[derive(Debug)] +pub struct NexusCertResolver { + log: slog::Logger, + config_rx: watch::Receiver>, +} + +impl NexusCertResolver { + pub fn new( + log: slog::Logger, + config_rx: watch::Receiver>, + ) -> NexusCertResolver { + NexusCertResolver { log, config_rx } + } + + fn do_resolve_endpoint( + &self, + server_name: Option<&str>, + ) -> Result, anyhow::Error> { + let Some(server_name) = server_name else { + bail!("TLS session had no server name") + }; + + let config_ref = self.config_rx.borrow(); + let config = match &*config_ref { + Some(c) => c, + None => bail!("no TLS config found"), + }; + + config + .by_dns_name + .get(server_name) + .ok_or_else(|| anyhow!("unrecognized server name: {}", server_name)) + .cloned() + } + + fn do_resolve( + &self, + server_name: Option<&str>, + ) -> Option> { + let log = + self.log.new(o!("server_name" => server_name.map(String::from))); + + trace!(&log, "resolving TLS certificate"); + let resolved = self.do_resolve_endpoint(server_name); + let result = match resolved { + Ok(ref endpoint) => match endpoint.best_certificate() { + Ok(certificate) => Ok((endpoint.silo_id, certificate)), + Err(error) => Err(error), + }, + Err(error) => Err(error), + }; + match result { + Ok((silo_id, certificate)) => { + debug!(log, "resolved TLS certificate"; + "silo_id" => silo_id.to_string(), + "certificate" => ?certificate + ); + Some(certificate.certified_key.clone()) + } + Err(error) => { + // TODO-security There is a (limited) DoS risk here, in that the + // client controls the request made to this endpoint and we're + // going to emit something to the log every time this happens. + // But at this stage it's pretty valuable to be able to debug + // this problem. 
+                warn!(
+                    log,
+                    "failed to resolve TLS certificate";
+                    "error" => format!("{:#}", error),
+                );
+                None
+            }
+        }
+    }
+}
+
+impl rustls::server::ResolvesServerCert for NexusCertResolver {
+    fn resolve(
+        &self,
+        client_hello: rustls::server::ClientHello,
+    ) -> Option<Arc<CertifiedKey>> {
+        let server_name = client_hello.server_name();
+        self.do_resolve(server_name)
+    }
+}
+
+/// Returns the host and port of the server that the client is trying to
+/// reach
+///
+/// Recall that Nexus serves many external endpoints on the same set of IP
+/// addresses, each corresponding to a particular Silo. We use the standard
+/// HTTP 1.1 "host" header or HTTP2 URI authority to determine which
+/// Silo's endpoint the client is trying to reach.
+pub fn authority_for_request(
+    rqinfo: &dropshot::RequestInfo,
+) -> Result<http::uri::Authority, String> {
+    if rqinfo.version() > hyper::Version::HTTP_11 {
+        // For HTTP2, the server name is specified in the URL's "authority".
+        rqinfo
+            .uri()
+            .authority()
+            .cloned()
+            .ok_or_else(|| String::from("request URL missing authority"))
+    } else {
+        // For earlier versions of HTTP, the server name is specified by the
+        // "Host" header.
+        rqinfo
+            .headers()
+            .get(http::header::HOST)
+            .ok_or_else(|| String::from("request missing \"host\" header"))?
+            .to_str()
+            .map_err(|e| format!("failed to decode \"host\" header: {:#}", e))
+            .and_then(|hostport| {
+                hostport.parse().map_err(|e| {
+                    format!("unsupported \"host\" header: {:#}", e)
+                })
+            })
+    }
+}
+
+// See `Nexus::endpoint_for_request()`. This is factored out to be able to test
+// it without a whole server.
+pub fn endpoint_for_authority(
+    log: &slog::Logger,
+    requested_authority: &http::uri::Authority,
+    config_rx: &tokio::sync::watch::Receiver<Option<ExternalEndpoints>>,
+) -> Result<Arc<ExternalEndpoint>, Error> {
+    let requested_host = requested_authority.host();
+    let log = log.new(o!("server_name" => requested_host.to_string()));
+    trace!(&log, "determining endpoint");
+
+    // If we have not successfully loaded the endpoint configuration yet,
+    // there's nothing we can do here. We could try to do better (e.g., use
+    // the recovery Silo?). But if we failed to load endpoints, it's likely
+    // the database is down, and we're not going to get much further anyway.
+    let endpoint_config = config_rx.borrow();
+    let endpoints = endpoint_config.as_ref().ok_or_else(|| {
+        error!(&log, "received request with no endpoints loaded");
+        Error::unavail("endpoints not loaded")
+    })?;
+
+    // See if there's an endpoint for the requested name. If so, use it.
+    if let Some(endpoint) = endpoints.by_dns_name.get(requested_host) {
+        trace!(
+            &log,
+            "received request for endpoint";
+            "silo_name" => ?endpoint.db_silo.name(),
+            "silo_id" => ?endpoint.silo_id,
+        );
+
+        return Ok(endpoint.clone());
+    }
+
+    // There was no endpoint for the requested name. This should generally
+    // not happen in deployed systems where we expect people to have set up
+    // DNS to find the external endpoints. But in development, we don't
+    // always have DNS set up. People may use an IP address to get here.
+    // To accommodate this use case, we make a best-effort to pick a default
+    // endpoint when we can't find one for the name we were given.
+    //
+    // If this ever does happen in a production system, this might be
+    // confusing. The best thing to do in a production system is probably
+    // to return an error saying that the requested server name was unknown.
+    // Instead, we'll wind up choosing some Silo here. This has no impact
+    // on authenticated requests because for those we use the authenticated
+    // identity's Silo. (That's as of this writing. Again, we may want to
+    // disallow this and produce an error instead.) If the request is not
+    // authenticated, we may wind up sending them to a login page for this
+    // Silo that may not be the Silo they meant.
+    endpoints
+        .default_endpoint
+        .as_ref()
+        .ok_or_else(|| {
+            error!(
+                &log,
+                "received request for unknown host and no default \
+                endpoint is available",
+            );
+            Error::invalid_request(&format!(
+                "HTTP request for unknown server name {:?}",
+                requested_host,
+            ))
+        })
+        .map(|c| c.clone())
+}
+
+#[cfg(test)]
+mod test {
+    use super::authority_for_request;
+    use super::endpoint_for_authority;
+    use super::ExternalEndpointError;
+    use super::ExternalEndpoints;
+    use super::NexusCertResolver;
+    use super::TlsCertificate;
+    use chrono::Utc;
+    use dropshot::endpoint;
+    use dropshot::test_util::LogContext;
+    use dropshot::ConfigLogging;
+    use dropshot::ConfigLoggingIfExists;
+    use dropshot::ConfigLoggingLevel;
+    use http::uri::Authority;
+    use nexus_db_model::Certificate;
+    use nexus_db_model::DnsGroup;
+    use nexus_db_model::DnsZone;
+    use nexus_db_model::ServiceKind;
+    use nexus_db_model::Silo;
+    use nexus_types::external_api::params;
+    use nexus_types::external_api::shared;
+    use nexus_types::identity::Resource;
+    use omicron_common::api::external::Error;
+    use omicron_common::api::external::IdentityMetadataCreateParams;
+    use schemars::JsonSchema;
+    use serde::Deserialize;
+    use serde::Serialize;
+    use std::net::SocketAddr;
+    use uuid::Uuid;
+
+    fn create_silo(silo_id: Option<Uuid>, name: &str, saml: bool) -> Silo {
+        let identity_mode = if saml {
+            shared::SiloIdentityMode::SamlJit
+        } else {
+            shared::SiloIdentityMode::LocalOnly
+        };
+        let params = params::SiloCreate {
+            identity: IdentityMetadataCreateParams {
+                name: name.parse().unwrap(),
+                description: String::new(),
+            },
+            quotas: params::SiloQuotasCreate::empty(),
+            discoverable: false,
+            identity_mode,
+            admin_group_name: None,
+            tls_certificates: vec![],
+            mapped_fleet_roles: Default::default(),
+        };
+
+        if let Some(silo_id) = silo_id {
+            Silo::new_with_id(silo_id, params)
+        } else {
+            Silo::new(params)
+        }
+        .unwrap()
+    }
+
+    fn create_certificate(
+        domain: &str,
+        expired: bool,
+    ) -> params::CertificateCreate {
+        let mut cert_params =
+            rcgen::CertificateParams::new(vec![domain.to_string()]);
+        if expired {
+            cert_params.not_after = std::time::UNIX_EPOCH.into();
+        }
+        let cert = rcgen::Certificate::from_params(cert_params).unwrap();
+        let cert_pem =
+            cert.serialize_pem().expect("serializing certificate as PEM");
+        let key_pem = cert.serialize_private_key_pem();
+        let namestr = format!("cert-for-{}", domain.replace('.', "-"));
+        params::CertificateCreate {
+            identity: IdentityMetadataCreateParams {
+                name: namestr.parse().unwrap(),
+                description: String::new(),
+            },
+            cert: cert_pem,
+            key: key_pem,
+            service: shared::ServiceUsingCertificate::ExternalApi,
+        }
+    }
+
+    fn create_dns_zone(domain: &str) -> DnsZone {
+        DnsZone {
+            id: Uuid::new_v4(),
+            time_created: Utc::now(),
+            dns_group: DnsGroup::External,
+            zone_name: format!("{}.test", domain),
+        }
+    }
+
+    fn cert_matches(tls_cert: &TlsCertificate, cert: &Certificate) -> bool {
+        let parse_right = openssl::x509::X509::from_pem(&cert.cert).unwrap();
+        tls_cert.parsed == parse_right
+    }
+
+    #[test]
+    fn test_external_endpoints_empty() {
+        // Truly trivial case: no endpoints at all.
+ let ee1 = ExternalEndpoints::new(vec![], vec![], vec![]); + assert_eq!(ee1.ndomains(), 0); + assert_eq!(ee1.nwarnings(), 1); + assert_eq!( + ee1.warnings[0].to_string(), + "no external endpoints were found" + ); + assert!(ee1.default_endpoint.is_none()); + + // There are also no endpoints if there's a Silo but no external DNS + // zones. + let silo_id: Uuid = + "6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3".parse().unwrap(); + let silo = create_silo(Some(silo_id), "dummy", false); + let ee2 = ExternalEndpoints::new(vec![silo], vec![], vec![]); + assert_eq!(ee2.ndomains(), 0); + assert_eq!(ee2.nwarnings(), 1); + assert_eq!( + ee2.warnings[0].to_string(), + "no external endpoints were found" + ); + assert!(ee2.default_endpoint.is_none()); + // Test PartialEq impl. + assert_eq!(ee1, ee2); + + // There are also no endpoints if there's an external DNS zone but no + // Silo. + let dns_zone1 = create_dns_zone("oxide1"); + let ee2 = ExternalEndpoints::new(vec![], vec![], vec![dns_zone1]); + assert_eq!(ee2.ndomains(), 0); + assert_eq!(ee2.nwarnings(), 1); + assert_eq!( + ee2.warnings[0].to_string(), + "no external endpoints were found" + ); + assert!(ee2.default_endpoint.is_none()); + // Test PartialEq impl. + assert_eq!(ee1, ee2); + + // Finally, there are no endpoints if there's a certificate and nothing + // else. This isn't really valid. But it's useful to verify that we + // won't crash or otherwise fail if we get a certificate with an invalid + // silo_id. + let cert_create = create_certificate("dummy.sys.oxide1.test", false); + let cert = Certificate::new( + silo_id, + Uuid::new_v4(), + ServiceKind::Nexus, + cert_create, + &["dummy.sys.oxide1.test".to_string()], + ) + .unwrap(); + let ee2 = ExternalEndpoints::new(vec![], vec![cert], vec![]); + assert_eq!(ee2.ndomains(), 0); + assert_eq!(ee2.nwarnings(), 2); + assert!(ee2.warnings[0].to_string().contains("silo not found"),); + assert_eq!( + ee2.warnings[1].to_string(), + "no external endpoints were found" + ); + assert!(ee2.default_endpoint.is_none()); + } + + #[test] + fn test_external_endpoints_basic() { + // Empty case for comparison. + let ee1 = ExternalEndpoints::new(vec![], vec![], vec![]); + + // Sample data + let silo_id: Uuid = + "6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3".parse().unwrap(); + let silo = create_silo(Some(silo_id), "dummy", false); + let dns_zone1 = create_dns_zone("oxide1"); + let cert_create = create_certificate("dummy.sys.oxide1.test", false); + let cert = Certificate::new( + silo_id, + Uuid::new_v4(), + ServiceKind::Nexus, + cert_create, + &["dummy.sys.oxide1.test".to_string()], + ) + .unwrap(); + + // Simple case: one silo, one DNS zone. We should see an endpoint for + // the Silo. Since it has no certificates, we'll get a warning. + let ee3 = ExternalEndpoints::new( + vec![silo.clone()], + vec![], + vec![dns_zone1.clone()], + ); + // Test PartialEq impl. + assert_ne!(ee1, ee3); + assert_eq!(ee3.ndomains(), 1); + assert!(ee3.has_domain("dummy.sys.oxide1.test")); + assert_eq!(ee3.nwarnings(), 1); + assert_eq!( + ee3.warnings[0].to_string(), + "silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 with DNS name \ + \"dummy.sys.oxide1.test\" has no usable certificates" + ); + // This also exercises best_certificate() with zero certificates. + assert_eq!( + ee3.by_dns_name["dummy.sys.oxide1.test"] + .best_certificate() + .unwrap_err() + .to_string(), + "silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 has no usable \ + certificates" + ); + assert_eq!(ee3.default_endpoint.as_ref().unwrap().silo_id, silo_id); + + // Now try with a certificate. 
+    let ee4 = ExternalEndpoints::new(
+        vec![silo.clone()],
+        vec![cert.clone()],
+        vec![dns_zone1.clone()],
+    );
+    assert_ne!(ee3, ee4);
+    assert_eq!(ee4.ndomains(), 1);
+    assert!(ee4.has_domain("dummy.sys.oxide1.test"));
+    assert_eq!(ee4.nwarnings(), 0);
+    let endpoint = &ee4.by_dns_name["dummy.sys.oxide1.test"];
+    assert_eq!(endpoint.silo_id, silo_id);
+    assert_eq!(endpoint.tls_certs.len(), 1);
+    assert!(cert_matches(&endpoint.tls_certs[0], &cert));
+    // This also exercises best_certificate() with one certificate.
+    assert_eq!(
+        *endpoint.best_certificate().unwrap(),
+        endpoint.tls_certs[0]
+    );
+    assert_eq!(ee4.default_endpoint.as_ref().unwrap().silo_id, silo_id);
+
+    // Add a second external DNS zone. There should now be two endpoints,
+    // both pointing to the same Silo.
+    let dns_zone2 = DnsZone {
+        id: Uuid::new_v4(),
+        time_created: Utc::now(),
+        dns_group: DnsGroup::External,
+        zone_name: String::from("oxide2.test"),
+    };
+    let ee5 = ExternalEndpoints::new(
+        vec![silo.clone()],
+        vec![cert.clone()],
+        vec![dns_zone1.clone(), dns_zone2],
+    );
+    assert_ne!(ee4, ee5);
+    assert_eq!(ee5.ndomains(), 2);
+    assert!(ee5.has_domain("dummy.sys.oxide1.test"));
+    assert!(ee5.has_domain("dummy.sys.oxide2.test"));
+    assert_eq!(ee5.nwarnings(), 0);
+    assert_eq!(ee5.default_endpoint.as_ref().unwrap().silo_id, silo_id);
+    let endpoint1 = &ee5.by_dns_name["dummy.sys.oxide1.test"];
+    let endpoint2 = &ee5.by_dns_name["dummy.sys.oxide2.test"];
+    assert_eq!(endpoint1, endpoint2);
+    assert_eq!(endpoint1.silo_id, silo_id);
+    assert_eq!(endpoint1.tls_certs.len(), 1);
+    assert_eq!(endpoint2.silo_id, silo_id);
+    assert_eq!(endpoint2.tls_certs.len(), 1);
+
+    // Add a second Silo with the same name as the first one. This should
+    // not be possible in practice. In the future, we expect other features
+    // (e.g., DNS aliases) to make it possible for silos' DNS names to
+    // overlap like this.
+    let silo2_same_name_id =
+        "e3f36f20-56c3-c545-8320-c19d98b82c1d".parse().unwrap();
+    let silo2_same_name =
+        create_silo(Some(silo2_same_name_id), "dummy", false);
+    let ee6 = ExternalEndpoints::new(
+        vec![silo, silo2_same_name],
+        vec![cert],
+        vec![dns_zone1],
+    );
+    assert_ne!(ee5, ee6);
+    assert_eq!(ee6.ndomains(), 1);
+    assert!(ee6.has_domain("dummy.sys.oxide1.test"));
+    assert_eq!(ee6.default_endpoint.as_ref().unwrap().silo_id, silo_id);
+    let endpoint = &ee6.by_dns_name["dummy.sys.oxide1.test"];
+    assert_eq!(endpoint.silo_id, silo_id);
+    assert_eq!(endpoint.tls_certs.len(), 1);
+    assert_eq!(ee6.nwarnings(), 1);
+    assert_eq!(
+        ee6.warnings[0].to_string(),
+        "ignoring silo e3f36f20-56c3-c545-8320-c19d98b82c1d (\"dummy\"): \
+        has the same DNS name (\"dummy.sys.oxide1.test\") as \
+        previously-found silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 \
+        (\"dummy\")"
+    );
+    }
+
+    #[test]
+    fn test_external_endpoints_complex() {
+        // Set up a somewhat complex scenario:
+        //
+        // - four Silos
+        // - silo1: two certificates, one of which is expired
+        // - silo2: two certificates, one of which is expired
+        //   (in the other order to make sure it's not working by accident)
+        // - silo3: one certificate that is invalid
+        // - silo4: one certificate that is expired
+        // - two DNS zones
+        //
+        // We should wind up with eight endpoints and three warnings.
+ let silo1 = create_silo(None, "silo1", true); + let silo2 = create_silo(None, "silo2", true); + let silo3 = create_silo(None, "silo3", false); + let silo4 = create_silo(None, "silo4", true); + let silo1_cert1_params = + create_certificate("silo1.sys.oxide1.test", false); + let silo1_cert1 = Certificate::new( + silo1.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo1_cert1_params, + &["silo1.sys.oxide1.test".to_string()], + ) + .unwrap(); + let silo1_cert2_params = + create_certificate("silo1.sys.oxide1.test", true); + let silo1_cert2 = Certificate::new_unvalidated( + silo1.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo1_cert2_params, + ); + let silo2_cert1_params = + create_certificate("silo2.sys.oxide1.test", true); + let silo2_cert1 = Certificate::new_unvalidated( + silo2.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo2_cert1_params, + ); + let silo2_cert2_params = + create_certificate("silo2.sys.oxide1.test", false); + let silo2_cert2 = Certificate::new( + silo2.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo2_cert2_params, + &["silo2.sys.oxide1.test".to_string()], + ) + .unwrap(); + let silo3_cert_params = + create_certificate("silo3.sys.oxide1.test", false); + let mut silo3_cert = Certificate::new( + silo3.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo3_cert_params, + &["silo3.sys.oxide1.test".to_string()], + ) + .unwrap(); + // Corrupt a byte of this last certificate. (This has to be done after + // constructing it or we would fail validation.) + silo3_cert.cert[0] ^= 1; + let silo4_cert_params = + create_certificate("silo4.sys.oxide1.test", true); + let silo4_cert = Certificate::new_unvalidated( + silo4.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo4_cert_params, + ); + let dns_zone1 = create_dns_zone("oxide1"); + let dns_zone2 = create_dns_zone("oxide2"); + + let ee = ExternalEndpoints::new( + vec![silo1.clone(), silo2.clone(), silo3.clone(), silo4.clone()], + vec![ + silo1_cert1.clone(), + silo1_cert2.clone(), + silo2_cert1, + silo2_cert2.clone(), + silo3_cert.clone(), + silo4_cert.clone(), + ], + vec![dns_zone1, dns_zone2], + ); + println!("{:?}", ee); + assert_eq!(ee.ndomains(), 8); + assert_eq!(ee.nwarnings(), 3); + assert_eq!( + 2, + ee.warnings + .iter() + .filter(|warning| matches!(warning, + ExternalEndpointError::NoSiloCerts { silo_id, .. } + if *silo_id == silo3.id() + )) + .count() + ); + assert_eq!( + 1, + ee.warnings + .iter() + .filter(|warning| matches!(warning, + ExternalEndpointError::BadCert { silo_id, .. } + if *silo_id == silo3.id() + )) + .count() + ); + + assert_eq!( + ee.by_dns_name["silo1.sys.oxide1.test"], + ee.by_dns_name["silo1.sys.oxide2.test"] + ); + assert_eq!( + ee.by_dns_name["silo2.sys.oxide1.test"], + ee.by_dns_name["silo2.sys.oxide2.test"] + ); + assert_eq!( + ee.by_dns_name["silo3.sys.oxide1.test"], + ee.by_dns_name["silo3.sys.oxide2.test"] + ); + assert_eq!( + ee.by_dns_name["silo4.sys.oxide1.test"], + ee.by_dns_name["silo4.sys.oxide2.test"] + ); + assert_eq!( + ee.default_endpoint.as_ref().unwrap().silo_id, + silo3.identity().id + ); + + let e1 = &ee.by_dns_name["silo1.sys.oxide1.test"]; + assert_eq!(e1.silo_id, silo1.id()); + let c1 = e1.best_certificate().unwrap(); + // It must be cert1 because cert2 is expired. + assert!(cert_matches(c1, &silo1_cert1)); + + let e2 = &ee.by_dns_name["silo2.sys.oxide1.test"]; + assert_eq!(e2.silo_id, silo2.id()); + let c2 = e2.best_certificate().unwrap(); + // It must be cert2 because cert1 is expired. 
+ assert!(cert_matches(c2, &silo2_cert2)); + assert!(!cert_matches(c2, &silo1_cert1)); + assert!(!cert_matches(c2, &silo1_cert2)); + + let e3 = &ee.by_dns_name["silo3.sys.oxide1.test"]; + assert_eq!(e3.silo_id, silo3.id()); + assert!(e3.best_certificate().is_err()); + + // We should get an expired cert if it's the only option. + let e4 = &ee.by_dns_name["silo4.sys.oxide1.test"]; + assert_eq!(e4.silo_id, silo4.id()); + let c4 = e4.best_certificate().unwrap(); + assert!(cert_matches(c4, &silo4_cert)); + + // + // Test endpoint lookup by authority. + // + let logctx = LogContext::new( + "test_external_endpoints_complex", + &ConfigLogging::File { + level: ConfigLoggingLevel::Trace, + path: "UNUSED".into(), + if_exists: ConfigLoggingIfExists::Append, + }, + ); + let log = &logctx.log; + let (_, watch_rx) = tokio::sync::watch::channel(Some(ee.clone())); + + // Basic cases: look up a few Silos by name. + let authority = Authority::from_static("silo1.sys.oxide1.test"); + let ae1 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); + assert_eq!(ae1, *e1); + let authority = Authority::from_static("silo1.sys.oxide2.test"); + let ae1 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); + assert_eq!(ae1, *e1); + let authority = Authority::from_static("silo2.sys.oxide1.test"); + let ae2 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); + assert_eq!(ae2, *e2); + // The port number in the authority should be ignored. + let authority = Authority::from_static("silo3.sys.oxide1.test:456"); + let ae3 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); + assert_eq!(ae3, *e3); + // We should get back a default endpoint if we use a server name that's + // not known. That includes any IPv4 or IPv6 address, too. The default + // endpoint should always be silo3 because it's the only one we've + // created LocalOnly. + for name in [ + "springfield.sys.oxide1.test", + "springfield.sys.oxide1.test:123", + "10.1.2.3:456", + "[fe80::1]:789", + ] { + let authority = Authority::from_static(name); + let ae = + endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); + assert_eq!(ae, *e3); + } + + // + // Now test the NexusCertResolver. + // + let (watch_tx, watch_rx) = tokio::sync::watch::channel(None); + let cert_resolver = + NexusCertResolver::new(logctx.log.clone(), watch_rx); + + // At this point we haven't filled in the configuration so any attempt + // to resolve anything should fail. + assert!(cert_resolver + .do_resolve(Some("silo1.sys.oxide1.test")) + .is_none()); + + // Now pass along the configuration and try again. + watch_tx.send(Some(ee.clone())).unwrap(); + let resolved_c1 = + cert_resolver.do_resolve(Some("silo1.sys.oxide1.test")).unwrap(); + assert_eq!(resolved_c1.cert, c1.certified_key.cert); + let resolved_c2 = + cert_resolver.do_resolve(Some("silo2.sys.oxide1.test")).unwrap(); + assert_eq!(resolved_c2.cert, c2.certified_key.cert); + assert!(cert_resolver + .do_resolve(Some("silo3.sys.oxide1.test")) + .is_none()); + // We should get an expired cert if it's the only option. + let resolved_c4 = + cert_resolver.do_resolve(Some("silo4.sys.oxide1.test")).unwrap(); + assert_eq!(resolved_c4.cert, c4.certified_key.cert); + + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_authority() { + // Tests for authority_for_request(). The function itself is pretty + // simple. That makes it easy to test fairly exhaustively. 
It's also
+    // useful to verify that we're doing what we think we're doing
+    // (identifying the name that the client thinks they're connecting to).
+
+    // First, set up a Dropshot server that just echoes back whatever
+    // authority_for_request() returns for a given request.
+    let logctx = omicron_test_utils::dev::test_setup_log("test_authority");
+    let mut api = dropshot::ApiDescription::new();
+    api.register(echo_server_name).unwrap();
+    let server = dropshot::HttpServerStarter::new(
+        &dropshot::ConfigDropshot::default(),
+        api,
+        (),
+        &logctx.log,
+    )
+    .expect("failed to create dropshot server")
+    .start();
+    let local_addr = server.local_addr();
+    let port = local_addr.port();
+
+    #[derive(Debug, PartialEq, Eq, JsonSchema, Serialize, Deserialize)]
+    struct AuthorityResponse {
+        host: String,
+        port: Option<u16>,
+    }
+
+    #[endpoint(method = GET, path = "/server_name")]
+    async fn echo_server_name(
+        rqctx: dropshot::RequestContext<()>,
+    ) -> Result<
+        dropshot::HttpResponseOk<Result<AuthorityResponse, String>>,
+        dropshot::HttpError,
+    > {
+        Ok(dropshot::HttpResponseOk(
+            authority_for_request(&rqctx.request).map(|authority| {
+                AuthorityResponse {
+                    host: authority.host().to_string(),
+                    port: authority.port_u16(),
+                }
+            }),
+        ))
+    }
+
+    // Generally, the "authority" for a request is determined by the URL
+    // provided to the client. We can test basically two cases this way: an
+    // authority with a host and port and an authority with an IP address
+    // and port. We can't test any cases that require the client to connect
+    // to a different host/port than what's in the URL. So we can't test
+    // the case of an authority with no port number in it (since our server
+    // doesn't run on port 80).
+    //
+    // With HTTP 1.1, you can generally override the authority by specifying
+    // your own "host" header. That lets us exercise the case of an
+    // authority that has no port number, even though the client would be
+    // connecting to a URL with a port number in it. It might also let us
+    // test other cases, like an authority with an invalid DNS name.
+    // However, it's not clear any of this is possible with HTTP 2 or later.
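
Everything in this test leans on `http::uri::Authority` to split a server name into host and port. As a quick, self-contained illustration of that parsing (this is stock `http` crate behavior, not something this patch adds), consider:

    use http::uri::Authority;

    fn main() {
        // Authority parses "host", "host:port", and "[v6addr]:port" forms;
        // host() and port_u16() are what the assertions below check against.
        for s in ["foo.example.com:123", "10.1.2.3:456", "[fe80::1]:789"] {
            let a: Authority = s.parse().unwrap();
            println!("{} -> host={:?} port={:?}", s, a.host(), a.port_u16());
        }
    }
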
+
+        async fn test_v2_host(
+            hostname: &str,
+            addr: SocketAddr,
+        ) -> AuthorityResponse {
+            let v2_client = reqwest::ClientBuilder::new()
+                .http2_prior_knowledge()
+                .resolve(hostname, addr)
+                .build()
+                .unwrap();
+            test_request(&v2_client, &format!("{}:{}", hostname, addr.port()))
+                .await
+        }
+
+        async fn test_v2_ip(addr: SocketAddr) -> AuthorityResponse {
+            let v2_client = reqwest::ClientBuilder::new()
+                .http2_prior_knowledge()
+                .build()
+                .unwrap();
+            test_request(&v2_client, &addr.to_string()).await
+        }
+
+        async fn test_v1_host(
+            hostname: &str,
+            addr: SocketAddr,
+            override_host: Option<&str>,
+        ) -> AuthorityResponse {
+            let mut v1_builder = reqwest::ClientBuilder::new()
+                .http1_only()
+                .resolve(hostname, addr);
+            if let Some(host) = override_host {
+                let mut headers = http::header::HeaderMap::new();
+                headers.insert(http::header::HOST, host.try_into().unwrap());
+                v1_builder = v1_builder.default_headers(headers);
+            }
+            let v1_client = v1_builder.build().unwrap();
+            test_request(&v1_client, &format!("{}:{}", hostname, addr.port()))
+                .await
+        }
+
+        async fn test_v1_ip(
+            addr: SocketAddr,
+            override_host: Option<&str>,
+        ) -> AuthorityResponse {
+            let mut v1_builder = reqwest::ClientBuilder::new().http1_only();
+            if let Some(host) = override_host {
+                let mut headers = http::header::HeaderMap::new();
+                headers.append(http::header::HOST, host.try_into().unwrap());
+                v1_builder = v1_builder.default_headers(headers);
+            }
+            let v1_client = v1_builder.build().unwrap();
+            test_request(&v1_client, &addr.to_string()).await
+        }
+
+        async fn test_request(
+            client: &reqwest::Client,
+            connect_host: &str,
+        ) -> AuthorityResponse {
+            let url = format!("http://{}/server_name", connect_host);
+
+            let result = client
+                .get(&url)
+                .send()
+                .await
+                .unwrap_or_else(|e| panic!("GET {:?}: {:#}", url, e));
+            let status = result.status();
+            println!("status: {:?}", status);
+            if status != http::StatusCode::OK {
+                panic!("GET {:?}: unexpected status: {:?}", url, status);
+            }
+
+            let body: Result<AuthorityResponse, String> =
+                result.json().await.unwrap_or_else(|e| {
+                    panic!("GET {:?}: parse json: {:#}", url, e);
+                });
+            println!("body: {:?}", body);
+            body.unwrap()
+        }
+
+        // HTTP 2: regular hostname (with port)
+        let authority = test_v2_host("foo.example.com", local_addr).await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, Some(port));
+
+        // HTTP 2: IP address (with port)
+        let authority = test_v2_ip(local_addr).await;
+        assert_eq!(authority.host, local_addr.ip().to_string());
+        assert_eq!(authority.port, Some(port));
+
+        // HTTP 1.1: regular hostname, no overridden "host" header.
+        let authority = test_v1_host("foo.example.com", local_addr, None).await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, Some(port));
+
+        // HTTP 1.1: regular hostname, override "host" header with port.
+        let authority = test_v1_host(
+            "foo.example.com",
+            local_addr,
+            Some("foo.example.com:123"),
+        )
+        .await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, Some(123));
+
+        // HTTP 1.1: regular hostname, override "host" header with no port.
+        let authority = test_v1_host(
+            "foo.example.com",
+            local_addr,
+            Some("foo.example.com"),
+        )
+        .await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, None);
+
+        // HTTP 1.1: IP address, no overridden "host" header.
+        let authority = test_v1_ip(local_addr, None).await;
+        assert_eq!(authority.host, local_addr.ip().to_string());
+        assert_eq!(authority.port, Some(port));
+
+        // HTTP 1.1: IP address, override "host" header with port.
+        let authority =
+            test_v1_ip(local_addr, Some("foo.example.com:123")).await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, Some(123));
+
+        // HTTP 1.1: IP address, override "host" header with no port.
+        let authority = test_v1_ip(local_addr, Some("foo.example.com")).await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, None);
+
+        server.close().await.expect("failed to shut down dropshot server");
+        logctx.cleanup_successful();
+    }
+
+    #[tokio::test]
+    async fn test_no_endpoint() {
+        let logctx =
+            omicron_test_utils::dev::test_setup_log("test_no_endpoint");
+        let log = &logctx.log;
+
+        // We'll test two configurations at the same time: one where there's no
+        // configuration at all, and one where there's a configuration but no
+        // default endpoint. These should always produce errors, no matter what
+        // endpoint we're looking up.
+        let ee = ExternalEndpoints::new(vec![], vec![], vec![]);
+        let (_, none_rx) =
+            tokio::sync::watch::channel::<Option<ExternalEndpoints>>(None);
+        let (_, empty_rx) =
+            tokio::sync::watch::channel::<Option<ExternalEndpoints>>(Some(ee));
+
+        for name in [
+            "dummy",
+            "dummy.example",
+            "dummy.example:123",
+            "10.1.2.3:456",
+            "[fe80::1]:789",
+        ] {
+            let authority = Authority::from_static(name);
+            for (rx_label, rx_channel) in
+                [("empty", &empty_rx), ("none", &none_rx)]
+            {
+                println!("config {:?} endpoint {:?}", rx_label, name);
+                let result =
+                    endpoint_for_authority(&log, &authority, rx_channel);
+                match result {
+                    Err(Error::ServiceUnavailable { internal_message }) => {
+                        assert_eq!(rx_label, "none");
+                        assert_eq!(internal_message, "endpoints not loaded");
+                    }
+                    Err(Error::InvalidRequest { message }) => {
+                        assert_eq!(rx_label, "empty");
+                        assert_eq!(
+                            message.external_message(),
+                            format!(
+                                "HTTP request for unknown server name {:?}",
+                                authority.host()
+                            )
+                        );
+                    }
+                    result => {
+                        panic!(
+                            "unexpected result looking up endpoint for \
+                            {:?} with config {:?}: {:?}",
+                            name, rx_label, result
+                        );
+                    }
+                }
+            }
+        }
+
+        logctx.cleanup_successful();
+    }
+}
diff --git a/nexus/reconfigurator/execution/Cargo.toml b/nexus/reconfigurator/execution/Cargo.toml
index 62155d9783..07b97457d7 100644
--- a/nexus/reconfigurator/execution/Cargo.toml
+++ b/nexus/reconfigurator/execution/Cargo.toml
@@ -15,6 +15,7 @@ internal-dns.workspace = true
 nexus-config.workspace = true
 nexus-db-model.workspace = true
 nexus-db-queries.workspace = true
+nexus-external-endpoints.workspace = true
 nexus-types.workspace = true
 omicron-common.workspace = true
 reqwest.workspace = true
diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index dd23822502..fcf9b1961c 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -13,6 +13,7 @@ use nexus_db_model::Silo;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder;
 use nexus_db_queries::db::DataStore;
+use nexus_external_endpoints::{read_all_endpoints, ExternalEndpoints};
 use nexus_types::deployment::Blueprint;
 use nexus_types::deployment::OmicronZoneType;
 use nexus_types::internal_api::params::DnsConfigParams;
@@ -62,9 +63,11 @@ pub(crate) async fn deploy_dns(
 
     // Next, construct the DNS config represented by the blueprint.
     let internal_dns_config_blueprint =
         blueprint_internal_dns_config(blueprint, sleds_by_id);
-    let silos = todo!(); // XXX-dap
+    let external_endpoints = read_all_endpoints(datastore, opctx)
+        .await
+        .internal_context("reading external endpoints to deploy DNS")?;
     let external_dns_config_blueprint =
-        blueprint_external_dns_config(blueprint, silos);
+        blueprint_external_dns_config(blueprint, &external_endpoints);
 
     // Deploy the changes.
     deploy_dns_one(
@@ -302,8 +305,15 @@ pub fn blueprint_internal_dns_config(
 
 pub fn blueprint_external_dns_config(
     blueprint: &Blueprint,
-    silos: Vec<Silo>,
+    endpoints: &ExternalEndpoints,
 ) -> DnsConfigParams {
+    let dns_params = DnsConfigParams {
+        generation: blueprint.external_dns_version.next(),
+        time_created: Utc::now(),
+        zones: vec![DnsConfigZone {
+
+        }]
+    };
     todo!(); // XXX-dap
 }
 
diff --git a/nexus/src/app/background/external_endpoints.rs b/nexus/src/app/background/external_endpoints.rs
index ed530e0775..1bad865e2a 100644
--- a/nexus/src/app/background/external_endpoints.rs
+++ b/nexus/src/app/background/external_endpoints.rs
@@ -7,12 +7,12 @@
 //! associated with those names
 
 use super::common::BackgroundTask;
-use crate::app::external_endpoints::read_all_endpoints;
-pub use crate::app::external_endpoints::ExternalEndpoints;
 use futures::future::BoxFuture;
 use futures::FutureExt;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
+use nexus_external_endpoints::read_all_endpoints;
+pub use nexus_external_endpoints::ExternalEndpoints;
 use serde_json::json;
 use std::sync::Arc;
 use tokio::sync::watch;
diff --git a/nexus/src/app/external_endpoints.rs b/nexus/src/app/external_endpoints.rs
index bcfec667ce..66d31e2ead 100644
--- a/nexus/src/app/external_endpoints.rs
+++ b/nexus/src/app/external_endpoints.rs
@@ -4,652 +4,16 @@
 //! Management of external HTTPS endpoints
 //!
-//! Whenever a client connects to one of our external endpoints and attempts to
-//! establish a TLS session, we must provide a TLS certificate to authenticate
-//! ourselves to the client. But each Silo has a separate external DNS name and
-//! may have its own TLS certificate for that DNS name. These all resolve to
-//! the same set of IPs, so we cannot tell from the IP address alone which
-//! Silo's endpoint the client is trying to reach nor which certificate to
-//! present. TLS provides a mechanism called Server Name Indication (SNI) for
-//! clients to specify the name of the server they're trying to reach _before_
-//! the TLS session is established. We use this to determine which Silo
-//! endpoint the client is trying to reach and so which TLS certificate to
-//! present.
-//!
-//! To achieve this, we first need to know what DNS names, Silos, and TLS
-//! certificates are available at any given time. This is summarized in
-//! [`ExternalEndpoints`]. A background task is responsible for maintaining
-//! this, providing the latest version to whoever needs it via a `watch`
-//! channel. How do we tell the TLS stack what certificate to use? When
-//! setting up the Dropshot server in the first place, we provide a
-//! [`rustls::ServerConfig`] that describes various TLS settings, including an
-//! "certificate resolver" object that impls
-//! [`rustls::server::ResolvesServerCert`]. See [`NexusCertResolver`].
+//! The guts of this subsystem are in the separate `nexus-external-endpoints`
+//! crate.
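
Note that `blueprint_external_dns_config` above is still a stub: the `DnsConfigZone` literal has no fields yet and the function ends in `todo!()`, so it does not compile as written. As a rough sketch of where this is headed -- with simplified stand-in types instead of the real `nexus_types::internal_api::params` ones, and with the Silo DNS names and Nexus external IPs passed in rather than derived from the blueprint and `ExternalEndpoints` (all assumptions, not this patch's final code) -- one zone's records might be assembled like this:

    use std::collections::HashMap;
    use std::net::Ipv4Addr;

    // Simplified stand-ins; the real DnsConfigParams also carries the
    // generation (blueprint.external_dns_version.next(), as the stub above
    // already does) and a creation timestamp.
    struct DnsConfigZone {
        zone_name: String,
        records: HashMap<String, Vec<DnsRecord>>,
    }

    enum DnsRecord {
        A(Ipv4Addr),
    }

    // One external DNS zone: each Silo's DNS name maps to every Nexus
    // external IP, since all external endpoints share the same addresses.
    fn external_dns_zone(
        zone_name: &str,
        silo_dns_names: &[String],
        nexus_external_ips: &[Ipv4Addr],
    ) -> DnsConfigZone {
        let records = silo_dns_names
            .iter()
            .map(|name| {
                let ips = nexus_external_ips
                    .iter()
                    .copied()
                    .map(DnsRecord::A)
                    .collect();
                (name.clone(), ips)
            })
            .collect();
        DnsConfigZone { zone_name: zone_name.to_string(), records }
    }

    fn main() {
        let zone = external_dns_zone(
            "oxide1.test",
            &["dummy.sys".to_string()],
            &[Ipv4Addr::new(192, 0, 2, 1)],
        );
        assert_eq!(zone.records["dummy.sys"].len(), 1);
        println!("zone {}: {} names", zone.zone_name, zone.records.len());
    }
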
-use super::silo::silo_dns_name; use crate::ServerContext; -use anyhow::anyhow; -use anyhow::bail; -use anyhow::Context; -use nexus_db_model::AuthenticationMode; -use nexus_db_model::Certificate; -use nexus_db_model::DnsGroup; -use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::datastore::Discoverability; -use nexus_db_queries::db::fixed_data::silo::SILO_ID; -use nexus_db_queries::db::model::ServiceKind; -use nexus_db_queries::db::DataStore; -use nexus_types::identity::Resource; -use omicron_common::api::external::http_pagination::PaginatedBy; -use omicron_common::api::external::DataPageParams; +use nexus_external_endpoints::authority_for_request; +use nexus_external_endpoints::endpoint_for_authority; +use nexus_external_endpoints::ExternalEndpoint; +pub use nexus_external_endpoints::NexusCertResolver; use omicron_common::api::external::Error; -use omicron_common::bail_unless; -use openssl::pkey::PKey; -use openssl::x509::X509; -use rustls::sign::CertifiedKey; -use serde::Serialize; -use serde_with::SerializeDisplay; -use std::collections::btree_map::Entry; -use std::collections::BTreeMap; -use std::fmt; -use std::num::NonZeroU32; use std::sync::Arc; -use thiserror::Error; -use tokio::sync::watch; -use uuid::Uuid; - -/// Describes the set of external endpoints, organized by DNS name -/// -/// This data structure provides a quick way to determine which Silo and TLS -/// certificate(s) make sense for an incoming request, based on the TLS -/// session's SNI (DNS name). See module-level docs for details. -/// -/// This object provides no interfaces outside this module. It's only used by -/// the `NexusCertResolver` that's also in this module. -/// -/// This structure impls `Serialize` only so that background tasks can easily -/// present the latest configuration that they've found (e.g., via a debug API) -#[derive(Clone, Debug, Eq, PartialEq, Serialize)] -pub struct ExternalEndpoints { - by_dns_name: BTreeMap>, - warnings: Vec, - default_endpoint: Option>, -} - -impl ExternalEndpoints { - /// Assemble a list of Silos, TLS certificates, and external DNS zones into - /// a structure that we can use for quickly figuring out which Silo and TLS - /// certificates are associated with each incoming DNS name - fn new( - silos: Vec, - certs: Vec, - external_dns_zones: Vec, - ) -> ExternalEndpoints { - // We want to avoid failing this operation even if we encounter problems - // because we want to serve as many DNS certificates as we can (so that - // an operator has a chance of fixing any problems that do exist). - // Instead of returning any errors, keep track of any issues as - // warnings. - let mut warnings = vec![]; - - // Compute a mapping from external DNS name to Silo id. Detect any - // duplicates and leave them out (but report them). There should not - // be any duplicates since the DNS names are constructed from the - // (unique) Silo names. Even if we support aliases in the future, they - // will presumably need to be unique, too. 
- let silos_by_id: BTreeMap> = silos - .into_iter() - .map(|db_silo| (db_silo.id(), Arc::new(db_silo))) - .collect(); - let mut dns_names: BTreeMap = BTreeMap::new(); - for z in external_dns_zones { - for (_, db_silo) in &silos_by_id { - let dns_name = format!( - "{}.{}", - silo_dns_name(db_silo.name()), - z.zone_name - ); - match dns_names.entry(dns_name.clone()) { - Entry::Vacant(vac) => { - vac.insert(db_silo.id()); - } - Entry::Occupied(occ) => { - let first_silo_id = *occ.get(); - let first_silo_name = silos_by_id - .get(&first_silo_id) - .unwrap() - .name() - .to_string(); - warnings.push(ExternalEndpointError::DupDnsName { - dup_silo_id: db_silo.id(), - dup_silo_name: db_silo.name().to_string(), - first_silo_id, - first_silo_name, - dns_name, - }) - } - }; - } - } - - // Compute a mapping from silo id to a list of usable TLS certificates - // for the Silo. By "usable" here, we just mean that we are capable of - // providing it to the client. This basically means that we can parse - // it. A certificate might be invalid for some other reason (e.g., does - // not match the right DNS name or it's expired). We may later choose - // to prefer some certificates over others, but that'll be decided later - // (see best_certificate()). And in the end it'll be better to provide - // an expired certificate than none at all. - let parsed_certificates = certs.into_iter().map(|db_cert| { - let silo_id = db_cert.silo_id; - let tls_cert = TlsCertificate::try_from(db_cert).map_err(|e| { - ExternalEndpointError::BadCert { silo_id, reason: Arc::new(e) } - })?; - let db_silo = silos_by_id - .get(&silo_id) - .ok_or_else(|| ExternalEndpointError::BadCert { - silo_id, - reason: Arc::new(anyhow!("silo not found")), - })? - .clone(); - Ok((silo_id, db_silo, tls_cert)) - }); - - let mut certs_by_silo_id = BTreeMap::new(); - for parsed_cert in parsed_certificates { - match parsed_cert { - Err(error) => { - warnings.push(error); - } - Ok((silo_id, db_silo, tls_cert)) => { - let silo_entry = certs_by_silo_id - .entry(silo_id) - .or_insert_with(|| ExternalEndpoint { - silo_id, - db_silo, - tls_certs: Vec::new(), - }); - silo_entry.tls_certs.push(tls_cert) - } - }; - } - - let certs_by_silo_id: BTreeMap<_, _> = certs_by_silo_id - .into_iter() - .map(|(k, v)| (k, Arc::new(v))) - .collect(); - - let by_dns_name: BTreeMap<_, _> = dns_names - .into_iter() - .map(|(dns_name, silo_id)| { - let silo_info = certs_by_silo_id - .get(&silo_id) - .cloned() - .unwrap_or_else(|| { - // For something to appear in `dns_names`, we must have - // found it in `silos`, and so it must be in - // `silos_by_id`. - let db_silo = - silos_by_id.get(&silo_id).unwrap().clone(); - Arc::new(ExternalEndpoint { - silo_id, - db_silo, - tls_certs: vec![], - }) - }); - - if silo_info.tls_certs.is_empty() { - warnings.push(ExternalEndpointError::NoSiloCerts { - silo_id, - dns_name: dns_name.clone(), - }) - } - - (dns_name, silo_info) - }) - .collect(); - - if by_dns_name.is_empty() { - warnings.push(ExternalEndpointError::NoEndpoints); - } - - // Pick a default endpoint. This will be used if a request arrives - // without specifying an endpoint via the HTTP/1.1 Host header or the - // HTTP2 URL. This is only intended for development, where external DNS - // may not be set up. - // - // We somewhat arbitrarily choose the first Silo we find that's not JIT. - // This would usually be the recovery Silo. - let default_endpoint = silos_by_id - .values() - .filter(|s| { - // Ignore the built-in Silo, which people are not supposed to - // log into. 
- s.id() != *SILO_ID - }) - .find(|s| s.authentication_mode == AuthenticationMode::Local) - .and_then(|s| { - by_dns_name - .iter() - .find(|(_, endpoint)| endpoint.silo_id == s.id()) - .map(|(_, endpoint)| endpoint.clone()) - }); - - ExternalEndpoints { by_dns_name, warnings, default_endpoint } - } - - #[cfg(test)] - pub fn has_domain(&self, dns_name: &str) -> bool { - self.by_dns_name.contains_key(dns_name) - } - - #[cfg(test)] - pub fn ndomains(&self) -> usize { - self.by_dns_name.len() - } - - #[cfg(test)] - pub fn nwarnings(&self) -> usize { - self.warnings.len() - } -} - -/// Describes a single external "endpoint", by which we mean an external DNS -/// name that's associated with a particular Silo -#[derive(Debug, PartialEq, Eq, Serialize)] -pub struct ExternalEndpoint { - /// the id of the Silo associated with this endpoint - // This is redundant with `db_silo`, but it's convenient to put it here and - // it shows up in the serialized form this way. - silo_id: Uuid, - /// the silo associated with this endpoint - #[serde(skip)] - db_silo: Arc, - /// the set of TLS certificate chains that could be appropriate for this - /// endpoint - tls_certs: Vec, -} - -impl ExternalEndpoint { - pub fn silo(&self) -> &nexus_db_model::Silo { - &self.db_silo - } - - /// Chooses a TLS certificate (chain) to use when handling connections to - /// this endpoint - fn best_certificate(&self) -> Result<&TlsCertificate, anyhow::Error> { - // We expect the most common case to be that there's only one - // certificate chain here. The next most common case is that there are - // two because the administrator is in the process of rotating - // certificates, usually due to upcoming expiration. In principle, it - // would be useful to allow operators to control which certificate chain - // gets used, and maybe even do something like a canary to mitigate the - // risk of a botched certificate update. Absent that, we're going to do - // our best to pick the best chain automatically. - // - // This could be a lot more sophisticated than it is. We could try to - // avoid using certificates that are clearly not valid based on the - // "not_after" and "not_before" bounds. We could check each certificate - // in the chain, not just the last one. We could use a margin of error - // when doing this to account for small variations in the wall clock - // between us and the client. We could try to avoid using a certificate - // that doesn't appear to be compatible with the SNI value (DNS domain) - // that this request came in on. - // - // IMPORTANT: If we ever decide to do those things, they should only be - // used to decide which of several certificates is preferred. We should - // always pick a certificate if we possibly can, even if it seems to be - // invalid. A client can always choose not to trust it. But in the - // unfortunate case where there are no good certificates, a customer's - // only option may be to instruct their client to trust an invalid - // certificate _so that they can log in and fix the certificate - // problem_. If we provide no certificate at all here, a customer may - // have no way to fix the problem. - // - // Anyway, we don't yet do anything of these things. For now, pick the - // certificate chain whose leaf certificate has the latest expiration - // time. - - // This would be cleaner if Asn1Time impl'd Ord or even just a way to - // convert it to a Unix timestamp or any other comparable timestamp. 
- let mut latest_expiration: Option<&TlsCertificate> = None; - for t in &self.tls_certs { - // We'll choose this certificate (so far) if we find that it's - // anything other than "earlier" than the best we've seen so far. - // That includes the case where we haven't seen any so far, where - // this one is greater than or equal to the best so far, as well as - // the case where they're incomparable for whatever reason. (This - // ensures that we always pick at least one.) - if latest_expiration.is_none() - || !matches!( - t.parsed.not_after().partial_cmp( - latest_expiration.unwrap().parsed.not_after() - ), - Some(std::cmp::Ordering::Less) - ) - { - latest_expiration = Some(t); - } - } - - latest_expiration.ok_or_else(|| { - anyhow!("silo {} has no usable certificates", self.silo_id) - }) - } -} - -/// Describes a problem encountered while assembling an [`ExternalEndpoints`] -/// object -#[derive(Clone, Debug, Error, SerializeDisplay)] -enum ExternalEndpointError { - #[error( - "ignoring silo {dup_silo_id} ({dup_silo_name:?}): has the same DNS \ - name ({dns_name:?}) as previously-found silo {first_silo_id} \ - ({first_silo_name:?})" - )] - DupDnsName { - dup_silo_id: Uuid, - dup_silo_name: String, - first_silo_id: Uuid, - first_silo_name: String, - dns_name: String, - }, - - #[error("ignoring certificate for silo {silo_id}: {reason:#}")] - BadCert { - silo_id: Uuid, - #[source] - reason: Arc, - }, - - #[error( - "silo {silo_id} with DNS name {dns_name:?} has no usable certificates" - )] - NoSiloCerts { silo_id: Uuid, dns_name: String }, - - #[error("no external endpoints were found")] - NoEndpoints, -} - -impl Eq for ExternalEndpointError {} -impl PartialEq for ExternalEndpointError { - fn eq(&self, other: &Self) -> bool { - self.to_string() == other.to_string() - } -} - -/// A parsed, validated TLS certificate ready to use with an external TLS server -#[derive(Serialize)] -#[serde(transparent)] -struct TlsCertificate { - /// This is what we need to provide to the TLS stack when we decide to use - /// this certificate for an incoming TLS connection - // NOTE: It's important that we do not serialize the private key! - #[serde(skip)] - certified_key: Arc, - - /// Parsed representation of the whole certificate chain - /// - /// This is used to extract metadata like the expiration time. - // NOTE: It's important that we do not serialize the private key! - #[serde(skip)] - parsed: X509, - - /// certificate digest (historically sometimes called a "fingerprint") - // This is the only field that appears in the serialized output or debug - // output. - digest: String, -} - -impl fmt::Debug for TlsCertificate { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // It's important that only the digest appear in the debug output. We - // definitely don't want to leak the private key this way. Really, - // we don't want even the public parts adding noise to debug output. - f.debug_struct("TlsCertificate").field("digest", &self.digest).finish() - } -} - -impl Eq for TlsCertificate {} -impl PartialEq for TlsCertificate { - fn eq(&self, other: &Self) -> bool { - self.digest == other.digest - } -} - -impl TryFrom for TlsCertificate { - type Error = anyhow::Error; - - fn try_from(db_cert: Certificate) -> Result { - // Parse and validate what we've got. 
- let certs_pem = openssl::x509::X509::stack_from_pem(&db_cert.cert) - .context("parsing PEM stack")?; - let private_key = PKey::private_key_from_pem(&db_cert.key) - .context("parsing private key PEM")?; - - // Assemble a rustls CertifiedKey with both the certificate and the key. - let certified_key = { - let mut cursor = std::io::Cursor::new(db_cert.key.clone()); - let rustls_private_key = rustls_pemfile::private_key(&mut cursor) - .expect("parsing private key PEM") - .expect("no private keys found"); - let rustls_signing_key = - rustls::crypto::ring::sign::any_supported_type( - &rustls_private_key, - ) - .context("parsing DER private key")?; - let rustls_certs = certs_pem - .iter() - .map(|x509| { - x509.to_der() - .context("serializing cert to DER") - .map(rustls::pki_types::CertificateDer::from) - }) - .collect::>()?; - Arc::new(CertifiedKey::new(rustls_certs, rustls_signing_key)) - }; - - let end_cert = certs_pem - .into_iter() - .next() - .ok_or_else(|| anyhow!("no certificates in PEM stack"))?; - anyhow::ensure!( - end_cert - .public_key() - .context("certificate publickey")? - .public_eq(&private_key), - "certificate public key does not match stored private key" - ); - - // Compute a digest (fingerprint) that we can use for debugging. - let digest = { - let digest_bytes = end_cert - .digest(openssl::hash::MessageDigest::sha256()) - .context("computing fingerprint")?; - hex::encode(&digest_bytes) - }; - - Ok(TlsCertificate { certified_key, digest, parsed: end_cert }) - } -} - -/// Read the lists of all Silos, external DNS zones, and external TLS -/// certificates from the database and assemble an `ExternalEndpoints` structure -/// that describes what DNS names exist, which Silos they correspond to, and -/// what TLS certificates can be used for them -// This structure is used to determine what TLS certificates are used for -// incoming connections to the external console/API endpoints. As such, it's -// critical that we produce a usable result if at all possible, even if it's -// incomplete. Otherwise, we won't be able to serve _any_ incoming connections -// to _any_ of our external endpoints! If data from the database is invalid or -// inconsistent, that data is discarded and a warning is produced, but we'll -// still return a usable object. -pub(crate) async fn read_all_endpoints( - datastore: &DataStore, - opctx: &OpContext, -) -> Result { - // We will not look for more than this number of external DNS zones, Silos, - // or certificates. We do not expect very many of any of these objects. - const MAX: u32 = 200; - let pagparams_id = DataPageParams { - marker: None, - limit: NonZeroU32::new(MAX).unwrap(), - direction: dropshot::PaginationOrder::Ascending, - }; - let pagbyid = PaginatedBy::Id(pagparams_id); - let pagparams_name = DataPageParams { - marker: None, - limit: NonZeroU32::new(MAX).unwrap(), - direction: dropshot::PaginationOrder::Ascending, - }; - - let silos = - datastore.silos_list(opctx, &pagbyid, Discoverability::All).await?; - let external_dns_zones = datastore - .dns_zones_list(opctx, DnsGroup::External, &pagparams_name) - .await?; - bail_unless!( - !external_dns_zones.is_empty(), - "expected at least one external DNS zone" - ); - let certs = datastore - .certificate_list_for(opctx, Some(ServiceKind::Nexus), &pagbyid, false) - .await?; - - // If we found too many of any of these things, complain as loudly as we - // can. Our results will be wrong. 
But we still don't want to fail if we - // can avoid it because we want to be able to serve as many endpoints as we - // can. - // TODO-reliability we should prevent people from creating more than this - // maximum number of Silos and certificates. - let max = usize::try_from(MAX).unwrap(); - if silos.len() >= max { - error!( - &opctx.log, - "reading endpoints: expected at most {} silos, but found at \ - least {}. TLS may not work on some Silos' external endpoints.", - MAX, - silos.len(), - ); - } - if external_dns_zones.len() >= max { - error!( - &opctx.log, - "reading endpoints: expected at most {} external DNS zones, but \ - found at least {}. TLS may not work on some Silos' external \ - endpoints.", - MAX, - external_dns_zones.len(), - ); - } - if certs.len() >= max { - error!( - &opctx.log, - "reading endpoints: expected at most {} certificates, but \ - found at least {}. TLS may not work on some Silos' external \ - endpoints.", - MAX, - certs.len(), - ); - } - - Ok(ExternalEndpoints::new(silos, certs, external_dns_zones)) -} - -/// TLS SNI certificate resolver for use with rustls/Dropshot -/// -/// This object exists to impl `rustls::server::ResolvesServerCert`. This -/// object looks at an incoming TLS session's SNI field, matches it against the -/// latest `ExternalEndpoints` configuration (available via a watch channel), -/// and then determines which certificate (if any) to provide for the new -/// session. -/// -/// See the module-level comment for more details. -#[derive(Debug)] -pub struct NexusCertResolver { - log: slog::Logger, - config_rx: watch::Receiver>, -} - -impl NexusCertResolver { - pub fn new( - log: slog::Logger, - config_rx: watch::Receiver>, - ) -> NexusCertResolver { - NexusCertResolver { log, config_rx } - } - - fn do_resolve_endpoint( - &self, - server_name: Option<&str>, - ) -> Result, anyhow::Error> { - let Some(server_name) = server_name else { - bail!("TLS session had no server name") - }; - - let config_ref = self.config_rx.borrow(); - let config = match &*config_ref { - Some(c) => c, - None => bail!("no TLS config found"), - }; - - config - .by_dns_name - .get(server_name) - .ok_or_else(|| anyhow!("unrecognized server name: {}", server_name)) - .cloned() - } - - fn do_resolve( - &self, - server_name: Option<&str>, - ) -> Option> { - let log = - self.log.new(o!("server_name" => server_name.map(String::from))); - - trace!(&log, "resolving TLS certificate"); - let resolved = self.do_resolve_endpoint(server_name); - let result = match resolved { - Ok(ref endpoint) => match endpoint.best_certificate() { - Ok(certificate) => Ok((endpoint.silo_id, certificate)), - Err(error) => Err(error), - }, - Err(error) => Err(error), - }; - match result { - Ok((silo_id, certificate)) => { - debug!(log, "resolved TLS certificate"; - "silo_id" => silo_id.to_string(), - "certificate" => ?certificate - ); - Some(certificate.certified_key.clone()) - } - Err(error) => { - // TODO-security There is a (limited) DoS risk here, in that the - // client controls the request made to this endpoint and we're - // going to emit something to the log every time this happens. - // But at this stage it's pretty valuable to be able to debug - // this problem. 
- warn!( - log, - "failed to resolve TLS certificate"; - "error" => format!("{:#}", error), - ); - None - } - } - } -} - -impl rustls::server::ResolvesServerCert for NexusCertResolver { - fn resolve( - &self, - client_hello: rustls::server::ClientHello, - ) -> Option> { - let server_name = client_hello.server_name(); - self.do_resolve(server_name) - } -} impl super::Nexus { /// Attempts to determine which external endpoint the given request is @@ -687,880 +51,3 @@ impl super::Nexus { ) } } - -/// Returns the host and port of the server that the client is trying to -/// reach -/// -/// Recall that Nexus serves many external endpoints on the same set of IP -/// addresses, each corresponding to a particular Silo. We use the standard -/// HTTP 1.1 "host" header or HTTP2 URI authority to determine which -/// Silo's endpoint the client is trying to reach. -pub fn authority_for_request( - rqinfo: &dropshot::RequestInfo, -) -> Result { - if rqinfo.version() > hyper::Version::HTTP_11 { - // For HTTP2, the server name is specified in the URL's "authority". - rqinfo - .uri() - .authority() - .cloned() - .ok_or_else(|| String::from("request URL missing authority")) - } else { - // For earlier versions of HTTP, the server name is specified by the - // "Host" header. - rqinfo - .headers() - .get(http::header::HOST) - .ok_or_else(|| String::from("request missing \"host\" header"))? - .to_str() - .map_err(|e| format!("failed to decode \"host\" header: {:#}", e)) - .and_then(|hostport| { - hostport.parse().map_err(|e| { - format!("unsupported \"host\" header: {:#}", e) - }) - }) - } -} - -// See `Nexus::endpoint_for_request()` above. This is factored out to be able -// to test it without a whole server. -fn endpoint_for_authority( - log: &slog::Logger, - requested_authority: &http::uri::Authority, - config_rx: &tokio::sync::watch::Receiver>, -) -> Result, Error> { - let requested_host = requested_authority.host(); - let log = log.new(o!("server_name" => requested_host.to_string())); - trace!(&log, "determining endpoint"); - - // If we have not successfully loaded the endpoint configuration yet, - // there's nothing we can do here. We could try to do better (e.g., use - // the recovery Silo?). But if we failed to load endpoints, it's likely - // the database is down, and we're not going to get much further anyway. - let endpoint_config = config_rx.borrow(); - let endpoints = endpoint_config.as_ref().ok_or_else(|| { - error!(&log, "received request with no endpoints loaded"); - Error::unavail("endpoints not loaded") - })?; - - // See if there's an endpoint for the requested name. If so, use it. - if let Some(endpoint) = endpoints.by_dns_name.get(requested_host) { - trace!( - &log, - "received request for endpoint"; - "silo_name" => ?endpoint.db_silo.name(), - "silo_id" => ?endpoint.silo_id, - ); - - return Ok(endpoint.clone()); - } - - // There was no endpoint for the requested name. This should generally - // not happen in deployed systems where we expect people to have set up - // DNS to find the external endpoints. But in development, we don't - // always have DNS set up. People may use an IP address to get here. - // To accommodate this use case, we make a best-effort to pick a default - // endpoint when we can't find one for the name we were given. - // - // If this ever does happen in a production system, this might be - // confusing. The best thing to do in a production system is probably - // to return an error saying that the requested server name was unknown. 
- // Instead, we'll wind up choosing some Silo here. This has no impact - // on authenticated requests because for those we use the authenticated - // identity's Silo. (That's as of this writing. Again, we may want to - // disallow this and produce an error instead.) If the request is not - // authenticated, we may wind up sending them to a login page for this - // Silo that may not be the Silo they meant. - endpoints - .default_endpoint - .as_ref() - .ok_or_else(|| { - error!( - &log, - "received request for unknown host and no default \ - endpoint is available", - ); - Error::invalid_request(&format!( - "HTTP request for unknown server name {:?}", - requested_host, - )) - }) - .map(|c| c.clone()) -} - -#[cfg(test)] -mod test { - use super::endpoint_for_authority; - use super::ExternalEndpoints; - use super::TlsCertificate; - use crate::app::external_endpoints::authority_for_request; - use crate::app::external_endpoints::ExternalEndpointError; - use crate::app::external_endpoints::NexusCertResolver; - use chrono::Utc; - use dropshot::endpoint; - use dropshot::test_util::LogContext; - use dropshot::ConfigLogging; - use dropshot::ConfigLoggingIfExists; - use dropshot::ConfigLoggingLevel; - use http::uri::Authority; - use nexus_db_model::Certificate; - use nexus_db_model::DnsGroup; - use nexus_db_model::DnsZone; - use nexus_db_model::ServiceKind; - use nexus_db_model::Silo; - use nexus_types::external_api::params; - use nexus_types::external_api::shared; - use nexus_types::identity::Resource; - use omicron_common::api::external::Error; - use omicron_common::api::external::IdentityMetadataCreateParams; - use schemars::JsonSchema; - use serde::Deserialize; - use serde::Serialize; - use std::net::SocketAddr; - use uuid::Uuid; - - fn create_silo(silo_id: Option, name: &str, saml: bool) -> Silo { - let identity_mode = if saml { - shared::SiloIdentityMode::SamlJit - } else { - shared::SiloIdentityMode::LocalOnly - }; - let params = params::SiloCreate { - identity: IdentityMetadataCreateParams { - name: name.parse().unwrap(), - description: String::new(), - }, - quotas: params::SiloQuotasCreate::empty(), - discoverable: false, - identity_mode, - admin_group_name: None, - tls_certificates: vec![], - mapped_fleet_roles: Default::default(), - }; - - if let Some(silo_id) = silo_id { - Silo::new_with_id(silo_id, params) - } else { - Silo::new(params) - } - .unwrap() - } - - fn create_certificate( - domain: &str, - expired: bool, - ) -> params::CertificateCreate { - let mut cert_params = - rcgen::CertificateParams::new(vec![domain.to_string()]); - if expired { - cert_params.not_after = std::time::UNIX_EPOCH.into(); - } - let cert = rcgen::Certificate::from_params(cert_params).unwrap(); - let cert_pem = - cert.serialize_pem().expect("serializing certificate as PEM"); - let key_pem = cert.serialize_private_key_pem(); - let namestr = format!("cert-for-{}", domain.replace('.', "-")); - params::CertificateCreate { - identity: IdentityMetadataCreateParams { - name: namestr.parse().unwrap(), - description: String::new(), - }, - cert: cert_pem, - key: key_pem, - service: shared::ServiceUsingCertificate::ExternalApi, - } - } - - fn create_dns_zone(domain: &str) -> DnsZone { - DnsZone { - id: Uuid::new_v4(), - time_created: Utc::now(), - dns_group: DnsGroup::External, - zone_name: format!("{}.test", domain), - } - } - - fn cert_matches(tls_cert: &TlsCertificate, cert: &Certificate) -> bool { - let parse_right = openssl::x509::X509::from_pem(&cert.cert).unwrap(); - tls_cert.parsed == parse_right - } - - #[test] - 
fn test_external_endpoints_empty() { - // Truly trivial case: no endpoints at all. - let ee1 = ExternalEndpoints::new(vec![], vec![], vec![]); - assert_eq!(ee1.ndomains(), 0); - assert_eq!(ee1.nwarnings(), 1); - assert_eq!( - ee1.warnings[0].to_string(), - "no external endpoints were found" - ); - assert!(ee1.default_endpoint.is_none()); - - // There are also no endpoints if there's a Silo but no external DNS - // zones. - let silo_id: Uuid = - "6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3".parse().unwrap(); - let silo = create_silo(Some(silo_id), "dummy", false); - let ee2 = ExternalEndpoints::new(vec![silo], vec![], vec![]); - assert_eq!(ee2.ndomains(), 0); - assert_eq!(ee2.nwarnings(), 1); - assert_eq!( - ee2.warnings[0].to_string(), - "no external endpoints were found" - ); - assert!(ee2.default_endpoint.is_none()); - // Test PartialEq impl. - assert_eq!(ee1, ee2); - - // There are also no endpoints if there's an external DNS zone but no - // Silo. - let dns_zone1 = create_dns_zone("oxide1"); - let ee2 = ExternalEndpoints::new(vec![], vec![], vec![dns_zone1]); - assert_eq!(ee2.ndomains(), 0); - assert_eq!(ee2.nwarnings(), 1); - assert_eq!( - ee2.warnings[0].to_string(), - "no external endpoints were found" - ); - assert!(ee2.default_endpoint.is_none()); - // Test PartialEq impl. - assert_eq!(ee1, ee2); - - // Finally, there are no endpoints if there's a certificate and nothing - // else. This isn't really valid. But it's useful to verify that we - // won't crash or otherwise fail if we get a certificate with an invalid - // silo_id. - let cert_create = create_certificate("dummy.sys.oxide1.test", false); - let cert = Certificate::new( - silo_id, - Uuid::new_v4(), - ServiceKind::Nexus, - cert_create, - &["dummy.sys.oxide1.test".to_string()], - ) - .unwrap(); - let ee2 = ExternalEndpoints::new(vec![], vec![cert], vec![]); - assert_eq!(ee2.ndomains(), 0); - assert_eq!(ee2.nwarnings(), 2); - assert!(ee2.warnings[0].to_string().contains("silo not found"),); - assert_eq!( - ee2.warnings[1].to_string(), - "no external endpoints were found" - ); - assert!(ee2.default_endpoint.is_none()); - } - - #[test] - fn test_external_endpoints_basic() { - // Empty case for comparison. - let ee1 = ExternalEndpoints::new(vec![], vec![], vec![]); - - // Sample data - let silo_id: Uuid = - "6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3".parse().unwrap(); - let silo = create_silo(Some(silo_id), "dummy", false); - let dns_zone1 = create_dns_zone("oxide1"); - let cert_create = create_certificate("dummy.sys.oxide1.test", false); - let cert = Certificate::new( - silo_id, - Uuid::new_v4(), - ServiceKind::Nexus, - cert_create, - &["dummy.sys.oxide1.test".to_string()], - ) - .unwrap(); - - // Simple case: one silo, one DNS zone. We should see an endpoint for - // the Silo. Since it has no certificates, we'll get a warning. - let ee3 = ExternalEndpoints::new( - vec![silo.clone()], - vec![], - vec![dns_zone1.clone()], - ); - // Test PartialEq impl. - assert_ne!(ee1, ee3); - assert_eq!(ee3.ndomains(), 1); - assert!(ee3.has_domain("dummy.sys.oxide1.test")); - assert_eq!(ee3.nwarnings(), 1); - assert_eq!( - ee3.warnings[0].to_string(), - "silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 with DNS name \ - \"dummy.sys.oxide1.test\" has no usable certificates" - ); - // This also exercises best_certificate() with zero certificates. 
- assert_eq!( - ee3.by_dns_name["dummy.sys.oxide1.test"] - .best_certificate() - .unwrap_err() - .to_string(), - "silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 has no usable \ - certificates" - ); - assert_eq!(ee3.default_endpoint.as_ref().unwrap().silo_id, silo_id); - - // Now try with a certificate. - let ee4 = ExternalEndpoints::new( - vec![silo.clone()], - vec![cert.clone()], - vec![dns_zone1.clone()], - ); - assert_ne!(ee3, ee4); - assert_eq!(ee4.ndomains(), 1); - assert!(ee4.has_domain("dummy.sys.oxide1.test")); - assert_eq!(ee4.nwarnings(), 0); - let endpoint = &ee4.by_dns_name["dummy.sys.oxide1.test"]; - assert_eq!(endpoint.silo_id, silo_id); - assert_eq!(endpoint.tls_certs.len(), 1); - assert!(cert_matches(&endpoint.tls_certs[0], &cert)); - // This also exercises best_certificate() with one certificate. - assert_eq!( - *endpoint.best_certificate().unwrap(), - endpoint.tls_certs[0] - ); - assert_eq!(ee4.default_endpoint.as_ref().unwrap().silo_id, silo_id); - - // Add a second external DNS zone. There should now be two endpoints, - // both pointing to the same Silo. - let dns_zone2 = DnsZone { - id: Uuid::new_v4(), - time_created: Utc::now(), - dns_group: DnsGroup::External, - zone_name: String::from("oxide2.test"), - }; - let ee5 = ExternalEndpoints::new( - vec![silo.clone()], - vec![cert.clone()], - vec![dns_zone1.clone(), dns_zone2], - ); - assert_ne!(ee4, ee5); - assert_eq!(ee5.ndomains(), 2); - assert!(ee5.has_domain("dummy.sys.oxide1.test")); - assert!(ee5.has_domain("dummy.sys.oxide2.test")); - assert_eq!(ee5.nwarnings(), 0); - assert_eq!(ee5.default_endpoint.as_ref().unwrap().silo_id, silo_id); - let endpoint1 = &ee5.by_dns_name["dummy.sys.oxide1.test"]; - let endpoint2 = &ee5.by_dns_name["dummy.sys.oxide2.test"]; - assert_eq!(endpoint1, endpoint2); - assert_eq!(endpoint1.silo_id, silo_id); - assert_eq!(endpoint1.tls_certs.len(), 1); - assert_eq!(endpoint2.silo_id, silo_id); - assert_eq!(endpoint2.tls_certs.len(), 1); - - // Add a second Silo with the same name as the first one. This should - // not be possible in practice. In the future, we expect other features - // (e.g., DNS aliases) to make it possible for silos' DNS names to - // overlap like this. 
-        let silo2_same_name_id =
-            "e3f36f20-56c3-c545-8320-c19d98b82c1d".parse().unwrap();
-        let silo2_same_name =
-            create_silo(Some(silo2_same_name_id), "dummy", false);
-        let ee6 = ExternalEndpoints::new(
-            vec![silo, silo2_same_name],
-            vec![cert],
-            vec![dns_zone1],
-        );
-        assert_ne!(ee5, ee6);
-        assert_eq!(ee6.ndomains(), 1);
-        assert!(ee6.has_domain("dummy.sys.oxide1.test"));
-        assert_eq!(ee6.default_endpoint.as_ref().unwrap().silo_id, silo_id);
-        let endpoint = &ee6.by_dns_name["dummy.sys.oxide1.test"];
-        assert_eq!(endpoint.silo_id, silo_id);
-        assert_eq!(endpoint.tls_certs.len(), 1);
-        assert_eq!(ee6.nwarnings(), 1);
-        assert_eq!(
-            ee6.warnings[0].to_string(),
-            "ignoring silo e3f36f20-56c3-c545-8320-c19d98b82c1d (\"dummy\"): \
-            has the same DNS name (\"dummy.sys.oxide1.test\") as \
-            previously-found silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 \
-            (\"dummy\")"
-        );
-    }
-
-    #[test]
-    fn test_external_endpoints_complex() {
-        // Set up a somewhat complex scenario:
-        //
-        // - four Silos
-        // - silo1: two certificates, one of which is expired
-        // - silo2: two certificates, one of which is expired
-        //   (in the other order to make sure it's not working by accident)
-        // - silo3: one certificate that is invalid
-        // - silo4: one certificate that is expired
-        // - two DNS zones
-        //
-        // We should wind up with eight endpoints and three warnings.
-        let silo1 = create_silo(None, "silo1", true);
-        let silo2 = create_silo(None, "silo2", true);
-        let silo3 = create_silo(None, "silo3", false);
-        let silo4 = create_silo(None, "silo4", true);
-        let silo1_cert1_params =
-            create_certificate("silo1.sys.oxide1.test", false);
-        let silo1_cert1 = Certificate::new(
-            silo1.identity().id,
-            Uuid::new_v4(),
-            ServiceKind::Nexus,
-            silo1_cert1_params,
-            &["silo1.sys.oxide1.test".to_string()],
-        )
-        .unwrap();
-        let silo1_cert2_params =
-            create_certificate("silo1.sys.oxide1.test", true);
-        let silo1_cert2 = Certificate::new_unvalidated(
-            silo1.identity().id,
-            Uuid::new_v4(),
-            ServiceKind::Nexus,
-            silo1_cert2_params,
-        );
-        let silo2_cert1_params =
-            create_certificate("silo2.sys.oxide1.test", true);
-        let silo2_cert1 = Certificate::new_unvalidated(
-            silo2.identity().id,
-            Uuid::new_v4(),
-            ServiceKind::Nexus,
-            silo2_cert1_params,
-        );
-        let silo2_cert2_params =
-            create_certificate("silo2.sys.oxide1.test", false);
-        let silo2_cert2 = Certificate::new(
-            silo2.identity().id,
-            Uuid::new_v4(),
-            ServiceKind::Nexus,
-            silo2_cert2_params,
-            &["silo2.sys.oxide1.test".to_string()],
-        )
-        .unwrap();
-        let silo3_cert_params =
-            create_certificate("silo3.sys.oxide1.test", false);
-        let mut silo3_cert = Certificate::new(
-            silo3.identity().id,
-            Uuid::new_v4(),
-            ServiceKind::Nexus,
-            silo3_cert_params,
-            &["silo3.sys.oxide1.test".to_string()],
-        )
-        .unwrap();
-        // Corrupt a byte of this last certificate. (This has to be done after
-        // constructing it or we would fail validation.)
- silo3_cert.cert[0] ^= 1; - let silo4_cert_params = - create_certificate("silo4.sys.oxide1.test", true); - let silo4_cert = Certificate::new_unvalidated( - silo4.identity().id, - Uuid::new_v4(), - ServiceKind::Nexus, - silo4_cert_params, - ); - let dns_zone1 = create_dns_zone("oxide1"); - let dns_zone2 = create_dns_zone("oxide2"); - - let ee = ExternalEndpoints::new( - vec![silo1.clone(), silo2.clone(), silo3.clone(), silo4.clone()], - vec![ - silo1_cert1.clone(), - silo1_cert2.clone(), - silo2_cert1, - silo2_cert2.clone(), - silo3_cert.clone(), - silo4_cert.clone(), - ], - vec![dns_zone1, dns_zone2], - ); - println!("{:?}", ee); - assert_eq!(ee.ndomains(), 8); - assert_eq!(ee.nwarnings(), 3); - assert_eq!( - 2, - ee.warnings - .iter() - .filter(|warning| matches!(warning, - ExternalEndpointError::NoSiloCerts { silo_id, .. } - if *silo_id == silo3.id() - )) - .count() - ); - assert_eq!( - 1, - ee.warnings - .iter() - .filter(|warning| matches!(warning, - ExternalEndpointError::BadCert { silo_id, .. } - if *silo_id == silo3.id() - )) - .count() - ); - - assert_eq!( - ee.by_dns_name["silo1.sys.oxide1.test"], - ee.by_dns_name["silo1.sys.oxide2.test"] - ); - assert_eq!( - ee.by_dns_name["silo2.sys.oxide1.test"], - ee.by_dns_name["silo2.sys.oxide2.test"] - ); - assert_eq!( - ee.by_dns_name["silo3.sys.oxide1.test"], - ee.by_dns_name["silo3.sys.oxide2.test"] - ); - assert_eq!( - ee.by_dns_name["silo4.sys.oxide1.test"], - ee.by_dns_name["silo4.sys.oxide2.test"] - ); - assert_eq!( - ee.default_endpoint.as_ref().unwrap().silo_id, - silo3.identity().id - ); - - let e1 = &ee.by_dns_name["silo1.sys.oxide1.test"]; - assert_eq!(e1.silo_id, silo1.id()); - let c1 = e1.best_certificate().unwrap(); - // It must be cert1 because cert2 is expired. - assert!(cert_matches(c1, &silo1_cert1)); - - let e2 = &ee.by_dns_name["silo2.sys.oxide1.test"]; - assert_eq!(e2.silo_id, silo2.id()); - let c2 = e2.best_certificate().unwrap(); - // It must be cert2 because cert1 is expired. - assert!(cert_matches(c2, &silo2_cert2)); - assert!(!cert_matches(c2, &silo1_cert1)); - assert!(!cert_matches(c2, &silo1_cert2)); - - let e3 = &ee.by_dns_name["silo3.sys.oxide1.test"]; - assert_eq!(e3.silo_id, silo3.id()); - assert!(e3.best_certificate().is_err()); - - // We should get an expired cert if it's the only option. - let e4 = &ee.by_dns_name["silo4.sys.oxide1.test"]; - assert_eq!(e4.silo_id, silo4.id()); - let c4 = e4.best_certificate().unwrap(); - assert!(cert_matches(c4, &silo4_cert)); - - // - // Test endpoint lookup by authority. - // - let logctx = LogContext::new( - "test_external_endpoints_complex", - &ConfigLogging::File { - level: ConfigLoggingLevel::Trace, - path: "UNUSED".into(), - if_exists: ConfigLoggingIfExists::Append, - }, - ); - let log = &logctx.log; - let (_, watch_rx) = tokio::sync::watch::channel(Some(ee.clone())); - - // Basic cases: look up a few Silos by name. - let authority = Authority::from_static("silo1.sys.oxide1.test"); - let ae1 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); - assert_eq!(ae1, *e1); - let authority = Authority::from_static("silo1.sys.oxide2.test"); - let ae1 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); - assert_eq!(ae1, *e1); - let authority = Authority::from_static("silo2.sys.oxide1.test"); - let ae2 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); - assert_eq!(ae2, *e2); - // The port number in the authority should be ignored. 
- let authority = Authority::from_static("silo3.sys.oxide1.test:456"); - let ae3 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); - assert_eq!(ae3, *e3); - // We should get back a default endpoint if we use a server name that's - // not known. That includes any IPv4 or IPv6 address, too. The default - // endpoint should always be silo3 because it's the only one we've - // created LocalOnly. - for name in [ - "springfield.sys.oxide1.test", - "springfield.sys.oxide1.test:123", - "10.1.2.3:456", - "[fe80::1]:789", - ] { - let authority = Authority::from_static(name); - let ae = - endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); - assert_eq!(ae, *e3); - } - - // - // Now test the NexusCertResolver. - // - let (watch_tx, watch_rx) = tokio::sync::watch::channel(None); - let cert_resolver = - NexusCertResolver::new(logctx.log.clone(), watch_rx); - - // At this point we haven't filled in the configuration so any attempt - // to resolve anything should fail. - assert!(cert_resolver - .do_resolve(Some("silo1.sys.oxide1.test")) - .is_none()); - - // Now pass along the configuration and try again. - watch_tx.send(Some(ee.clone())).unwrap(); - let resolved_c1 = - cert_resolver.do_resolve(Some("silo1.sys.oxide1.test")).unwrap(); - assert_eq!(resolved_c1.cert, c1.certified_key.cert); - let resolved_c2 = - cert_resolver.do_resolve(Some("silo2.sys.oxide1.test")).unwrap(); - assert_eq!(resolved_c2.cert, c2.certified_key.cert); - assert!(cert_resolver - .do_resolve(Some("silo3.sys.oxide1.test")) - .is_none()); - // We should get an expired cert if it's the only option. - let resolved_c4 = - cert_resolver.do_resolve(Some("silo4.sys.oxide1.test")).unwrap(); - assert_eq!(resolved_c4.cert, c4.certified_key.cert); - - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_authority() { - // Tests for authority_for_request(). The function itself is pretty - // simple. That makes it easy to test fairly exhaustively. It's also - // useful to verify that we're doing what we think we're doing - // (identifying the name that the client thinks they're connecting to). - - // First, set up a Dropshot server that just echoes back whatever - // authority_for_request() returns for a given request. - let logctx = omicron_test_utils::dev::test_setup_log("test_authority"); - let mut api = dropshot::ApiDescription::new(); - api.register(echo_server_name).unwrap(); - let server = dropshot::HttpServerStarter::new( - &dropshot::ConfigDropshot::default(), - api, - (), - &logctx.log, - ) - .expect("failed to create dropshot server") - .start(); - let local_addr = server.local_addr(); - let port = local_addr.port(); - - #[derive(Debug, PartialEq, Eq, JsonSchema, Serialize, Deserialize)] - struct AuthorityResponse { - host: String, - port: Option, - } - - #[endpoint(method = GET, path = "/server_name")] - async fn echo_server_name( - rqctx: dropshot::RequestContext<()>, - ) -> Result< - dropshot::HttpResponseOk>, - dropshot::HttpError, - > { - Ok(dropshot::HttpResponseOk( - authority_for_request(&rqctx.request).map(|authority| { - AuthorityResponse { - host: authority.host().to_string(), - port: authority.port_u16(), - } - }), - )) - } - - // Generally, the "authority" for a request is determined by the URL - // provided to the client. We can test basically two cases this way: an - // authority with a host and port and an authority with an IP address - // and port. We can't test any cases that require the client to connect - // to a different host/port than what's in the URL. 
-        // the case of an authority with no port number in it (since our server
-        // doesn't run on port 80).
-        //
-        // With HTTP 1.1, you can generally override the authority by specifying
-        // your own "host" header. That lets us exercise the case of an
-        // authority that has no port number, even though the client would be
-        // connecting to a URL with a port number in it. It might also let us
-        // test other cases, like an authority with an invalid DNS name.
-        // However, it's not clear any of this is possible with HTTP 2 or later.
-
-        async fn test_v2_host(
-            hostname: &str,
-            addr: SocketAddr,
-        ) -> AuthorityResponse {
-            let v2_client = reqwest::ClientBuilder::new()
-                .http2_prior_knowledge()
-                .resolve(hostname, addr)
-                .build()
-                .unwrap();
-            test_request(&v2_client, &format!("{}:{}", hostname, addr.port()))
-                .await
-        }
-
-        async fn test_v2_ip(addr: SocketAddr) -> AuthorityResponse {
-            let v2_client = reqwest::ClientBuilder::new()
-                .http2_prior_knowledge()
-                .build()
-                .unwrap();
-            test_request(&v2_client, &addr.to_string()).await
-        }
-
-        async fn test_v1_host(
-            hostname: &str,
-            addr: SocketAddr,
-            override_host: Option<&str>,
-        ) -> AuthorityResponse {
-            let mut v1_builder = reqwest::ClientBuilder::new()
-                .http1_only()
-                .resolve(hostname, addr);
-            if let Some(host) = override_host {
-                let mut headers = http::header::HeaderMap::new();
-                headers.insert(http::header::HOST, host.try_into().unwrap());
-                v1_builder = v1_builder.default_headers(headers);
-            }
-            let v1_client = v1_builder.build().unwrap();
-            test_request(&v1_client, &format!("{}:{}", hostname, addr.port()))
-                .await
-        }
-
-        async fn test_v1_ip(
-            addr: SocketAddr,
-            override_host: Option<&str>,
-        ) -> AuthorityResponse {
-            let mut v1_builder = reqwest::ClientBuilder::new().http1_only();
-            if let Some(host) = override_host {
-                let mut headers = http::header::HeaderMap::new();
-                headers.append(http::header::HOST, host.try_into().unwrap());
-                v1_builder = v1_builder.default_headers(headers);
-            }
-            let v1_client = v1_builder.build().unwrap();
-            test_request(&v1_client, &addr.to_string()).await
-        }
-
-        async fn test_request(
-            client: &reqwest::Client,
-            connect_host: &str,
-        ) -> AuthorityResponse {
-            let url = format!("http://{}/server_name", connect_host);
-
-            let result = client
-                .get(&url)
-                .send()
-                .await
-                .unwrap_or_else(|e| panic!("GET {:?}: {:#}", url, e));
-            let status = result.status();
-            println!("status: {:?}", status);
-            if status != http::StatusCode::OK {
-                panic!("GET {:?}: unexpected status: {:?}", url, status);
-            }
-
-            let body: Result<AuthorityResponse, String> =
-                result.json().await.unwrap_or_else(|e| {
-                    panic!("GET {:?}: parse json: {:#}", url, e);
-                });
-            println!("body: {:?}", body);
-            body.unwrap()
-        }
-
-        // HTTP 2: regular hostname (with port)
-        let authority = test_v2_host("foo.example.com", local_addr).await;
-        assert_eq!(authority.host, "foo.example.com");
-        assert_eq!(authority.port, Some(port));
-
-        // HTTP 2: IP address (with port)
-        let authority = test_v2_ip(local_addr).await;
-        assert_eq!(authority.host, local_addr.ip().to_string());
-        assert_eq!(authority.port, Some(port));
-
-        // HTTP 1.1: regular hostname, no overridden "host" header.
-        let authority = test_v1_host("foo.example.com", local_addr, None).await;
-        assert_eq!(authority.host, "foo.example.com");
-        assert_eq!(authority.port, Some(port));
-
-        // HTTP 1.1: regular hostname, override "host" header with port.
-        let authority = test_v1_host(
-            "foo.example.com",
-            local_addr,
-            Some("foo.example.com:123"),
-        )
-        .await;
-        assert_eq!(authority.host, "foo.example.com");
-        assert_eq!(authority.port, Some(123));
-
-        // HTTP 1.1: regular hostname, override "host" header with no port.
-        let authority = test_v1_host(
-            "foo.example.com",
-            local_addr,
-            Some("foo.example.com"),
-        )
-        .await;
-        assert_eq!(authority.host, "foo.example.com");
-        assert_eq!(authority.port, None);
-
-        // HTTP 1.1: IP address, no overridden "host" header.
-        let authority = test_v1_ip(local_addr, None).await;
-        assert_eq!(authority.host, local_addr.ip().to_string());
-        assert_eq!(authority.port, Some(port));
-
-        // HTTP 1.1: IP address, override "host" header with port.
-        let authority =
-            test_v1_ip(local_addr, Some("foo.example.com:123")).await;
-        assert_eq!(authority.host, "foo.example.com");
-        assert_eq!(authority.port, Some(123));
-
-        // HTTP 1.1: IP address, override "host" header with no port.
-        let authority = test_v1_ip(local_addr, Some("foo.example.com")).await;
-        assert_eq!(authority.host, "foo.example.com");
-        assert_eq!(authority.port, None);
-
-        server.close().await.expect("failed to shut down dropshot server");
-        logctx.cleanup_successful();
-    }
-
-    #[tokio::test]
-    async fn test_no_endpoint() {
-        let logctx =
-            omicron_test_utils::dev::test_setup_log("test_no_endpoint");
-        let log = &logctx.log;
-
-        // We'll test two configurations at the same time: one where there's no
-        // configuration at all, and one where there's a configuration but no
-        // default endpoint. These should always produce errors, no matter what
-        // endpoint we're looking up.
-        let ee = ExternalEndpoints::new(vec![], vec![], vec![]);
-        let (_, none_rx) =
-            tokio::sync::watch::channel::<Option<ExternalEndpoints>>(None);
-        let (_, empty_rx) =
-            tokio::sync::watch::channel::<Option<ExternalEndpoints>>(Some(ee));
-
-        for name in [
-            "dummy",
-            "dummy.example",
-            "dummy.example:123",
-            "10.1.2.3:456",
-            "[fe80::1]:789",
-        ] {
-            let authority = Authority::from_static(name);
-            for (rx_label, rx_channel) in
-                [("empty", &empty_rx), ("none", &none_rx)]
-            {
-                println!("config {:?} endpoint {:?}", rx_label, name);
-                let result =
-                    endpoint_for_authority(&log, &authority, rx_channel);
-                match result {
-                    Err(Error::ServiceUnavailable { internal_message }) => {
-                        assert_eq!(rx_label, "none");
-                        assert_eq!(internal_message, "endpoints not loaded");
-                    }
-                    Err(Error::InvalidRequest { message }) => {
-                        assert_eq!(rx_label, "empty");
-                        assert_eq!(
-                            message.external_message(),
-                            format!(
-                                "HTTP request for unknown server name {:?}",
-                                authority.host()
-                            )
-                        );
-                    }
-                    result => {
-                        panic!(
-                            "unexpected result looking up endpoint for \
-                            {:?} with config {:?}: {:?}",
-                            name, rx_label, result
-                        );
-                    }
-                }
-            }
-        }
-
-        logctx.cleanup_successful();
-    }
-}
diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs
index a137f19434..9096984a2f 100644
--- a/nexus/src/app/rack.rs
+++ b/nexus/src/app/rack.rs
@@ -4,7 +4,6 @@
 
 //! 
Rack management -use super::silo::silo_dns_name; use crate::external_api::params; use crate::external_api::params::CertificateCreate; use crate::external_api::shared::ServiceUsingCertificate; @@ -20,6 +19,7 @@ use nexus_db_queries::db; use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder; use nexus_db_queries::db::datastore::RackInit; use nexus_db_queries::db::lookup::LookupPath; +use nexus_external_endpoints::silo_dns_name; use nexus_types::external_api::params::Address; use nexus_types::external_api::params::AddressConfig; use nexus_types::external_api::params::AddressLotBlockCreate; diff --git a/nexus/src/app/silo.rs b/nexus/src/app/silo.rs index 8461be015a..8a2558facb 100644 --- a/nexus/src/app/silo.rs +++ b/nexus/src/app/silo.rs @@ -16,6 +16,7 @@ use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::{self, lookup}; use nexus_db_queries::{authn, authz}; +use nexus_external_endpoints::silo_dns_name; use nexus_types::internal_api::params::DnsRecord; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::ListResultVec; @@ -886,16 +887,3 @@ impl super::Nexus { LookupPath::new(opctx, &self.db_datastore).silo_group_id(*group_id) } } - -/// Returns the (relative) DNS name for this Silo's API and console endpoints -/// _within_ the external DNS zone (i.e., without that zone's suffix) -/// -/// This specific naming scheme is determined under RFD 357. -pub(crate) fn silo_dns_name( - name: &omicron_common::api::external::Name, -) -> String { - // RFD 4 constrains resource names (including Silo names) to DNS-safe - // strings, which is why it's safe to directly put the name of the - // resource into the DNS name rather than doing any kind of escaping. - format!("{}.sys", name) -} diff --git a/nexus/src/external_api/device_auth.rs b/nexus/src/external_api/device_auth.rs index 1697722f6f..4f7b8d83b1 100644 --- a/nexus/src/external_api/device_auth.rs +++ b/nexus/src/external_api/device_auth.rs @@ -11,7 +11,6 @@ use super::console_api::console_index_or_login_redirect; use super::views::DeviceAccessTokenGrant; -use crate::app::external_endpoints::authority_for_request; use crate::ServerContext; use dropshot::{ endpoint, HttpError, HttpResponseUpdatedNoContent, RequestContext, @@ -20,6 +19,7 @@ use dropshot::{ use http::{header, Response, StatusCode}; use hyper::Body; use nexus_db_queries::db::model::DeviceAccessToken; +use nexus_external_endpoints::authority_for_request; use omicron_common::api::external::InternalContext; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; From 34e5c3f0e3c336997453e1184200567f3b73b550 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 6 Mar 2024 17:37:08 -0800 Subject: [PATCH 04/34] Revert "abandoned WIP: move nexus-external-endpoints into a crate" This reverts commit c7fad86b2a7da6296ddefd82722798fce9581740. 
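
For readers following the DNS naming logic that moves back and forth in this
series: a Silo's full external DNS name is silo_dns_name() (the "<silo>.sys"
relative name from RFD 357) joined with an external DNS zone's name. A minimal
sketch of that composition, assuming a plain-string silo name (the real code
takes an omicron_common::api::external::Name and formats the pieces
separately):

    /// Hypothetical helper: compose a Silo's full external DNS name.
    fn full_external_dns_name(silo_name: &str, zone_name: &str) -> String {
        // silo_dns_name() yields "<silo>.sys"; appending the zone gives,
        // e.g., "eng" + "oxide1.test" => "eng.sys.oxide1.test".
        format!("{}.sys.{}", silo_name, zone_name)
    }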
--- Cargo.lock | 34 - Cargo.toml | 3 - nexus/Cargo.toml | 1 - nexus/external-endpoints/Cargo.toml | 39 - nexus/external-endpoints/build.rs | 10 - nexus/external-endpoints/src/lib.rs | 1540 ----------------- nexus/reconfigurator/execution/Cargo.toml | 1 - nexus/reconfigurator/execution/src/dns.rs | 16 +- .../src/app/background/external_endpoints.rs | 4 +- nexus/src/app/external_endpoints.rs | 1525 +++++++++++++++- nexus/src/app/rack.rs | 2 +- nexus/src/app/silo.rs | 14 +- nexus/src/external_api/device_auth.rs | 2 +- 13 files changed, 1539 insertions(+), 1652 deletions(-) delete mode 100644 nexus/external-endpoints/Cargo.toml delete mode 100644 nexus/external-endpoints/build.rs delete mode 100644 nexus/external-endpoints/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 08c93bb798..18c783037f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4433,38 +4433,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "nexus-external-endpoints" -version = "0.1.0" -dependencies = [ - "anyhow", - "chrono", - "dropshot", - "hex", - "http 0.2.11", - "hyper 0.14.27", - "nexus-db-model", - "nexus-db-queries", - "nexus-types", - "omicron-common", - "omicron-rpaths", - "omicron-test-utils", - "omicron-workspace-hack", - "openssl", - "pq-sys", - "rcgen", - "reqwest", - "rustls 0.22.2", - "rustls-pemfile 2.1.0", - "schemars", - "serde", - "serde_with", - "slog", - "thiserror", - "tokio", - "uuid", -] - [[package]] name = "nexus-inventory" version = "0.1.0" @@ -4517,7 +4485,6 @@ dependencies = [ "nexus-config", "nexus-db-model", "nexus-db-queries", - "nexus-external-endpoints", "nexus-inventory", "nexus-reconfigurator-planning", "nexus-test-utils", @@ -5093,7 +5060,6 @@ dependencies = [ "nexus-db-model", "nexus-db-queries", "nexus-defaults", - "nexus-external-endpoints", "nexus-inventory", "nexus-reconfigurator-execution", "nexus-reconfigurator-planning", diff --git a/Cargo.toml b/Cargo.toml index fcb939cca2..474739a932 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,7 +42,6 @@ members = [ "nexus/db-model", "nexus/db-queries", "nexus/defaults", - "nexus/external-endpoints", "nexus/inventory", "nexus/macros-common", "nexus/reconfigurator/execution", @@ -122,7 +121,6 @@ default-members = [ "nexus/db-model", "nexus/db-queries", "nexus/defaults", - "nexus/external-endpoints", "nexus/inventory", "nexus/reconfigurator/execution", "nexus/reconfigurator/planning", @@ -262,7 +260,6 @@ nexus-config = { path = "nexus-config" } nexus-db-model = { path = "nexus/db-model" } nexus-db-queries = { path = "nexus/db-queries" } nexus-defaults = { path = "nexus/defaults" } -nexus-external-endpoints = { path = "nexus/external-endpoints" } nexus-inventory = { path = "nexus/inventory" } nexus-macros-common = { path = "nexus/macros-common" } nexus-reconfigurator-execution = { path = "nexus/reconfigurator/execution" } diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 271fd0866b..de79f3429d 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -81,7 +81,6 @@ uuid.workspace = true nexus-defaults.workspace = true nexus-db-model.workspace = true nexus-db-queries.workspace = true -nexus-external-endpoints.workspace = true nexus-inventory.workspace = true nexus-reconfigurator-execution.workspace = true nexus-reconfigurator-planning.workspace = true diff --git a/nexus/external-endpoints/Cargo.toml b/nexus/external-endpoints/Cargo.toml deleted file mode 100644 index e356223e0b..0000000000 --- a/nexus/external-endpoints/Cargo.toml +++ /dev/null @@ -1,39 +0,0 @@ -[package] -name = "nexus-external-endpoints" -version = "0.1.0" -edition = 
"2021" - -[build-dependencies] -omicron-rpaths.workspace = true - -[dependencies] -anyhow.workspace = true -dropshot.workspace = true -hex.workspace = true -http.workspace = true -hyper.workspace = true -nexus-db-model.workspace = true -nexus-db-queries.workspace = true -nexus-types.workspace = true -omicron-common.workspace = true -openssl.workspace = true -# See omicron-rpaths for more about the "pq-sys" dependency. -pq-sys = "*" -rcgen.workspace = true -reqwest.workspace = true -rustls.workspace = true -rustls-pemfile.workspace = true -serde_with.workspace = true -serde.workspace = true -slog.workspace = true -thiserror.workspace = true -tokio.workspace = true -uuid.workspace = true - -omicron-workspace-hack.workspace = true - -[dev-dependencies] -chrono.workspace = true -http.workspace = true -omicron-test-utils.workspace = true -schemars.workspace = true diff --git a/nexus/external-endpoints/build.rs b/nexus/external-endpoints/build.rs deleted file mode 100644 index 1ba9acd41c..0000000000 --- a/nexus/external-endpoints/build.rs +++ /dev/null @@ -1,10 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -// See omicron-rpaths for documentation. -// NOTE: This file MUST be kept in sync with the other build.rs files in this -// repository. -fn main() { - omicron_rpaths::configure_default_omicron_rpaths(); -} diff --git a/nexus/external-endpoints/src/lib.rs b/nexus/external-endpoints/src/lib.rs deleted file mode 100644 index 5136375cb8..0000000000 --- a/nexus/external-endpoints/src/lib.rs +++ /dev/null @@ -1,1540 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Management of external HTTPS endpoints -//! -//! Whenever a client connects to one of our external endpoints and attempts to -//! establish a TLS session, we must provide a TLS certificate to authenticate -//! ourselves to the client. But each Silo has a separate external DNS name and -//! may have its own TLS certificate for that DNS name. These all resolve to -//! the same set of IPs, so we cannot tell from the IP address alone which -//! Silo's endpoint the client is trying to reach nor which certificate to -//! present. TLS provides a mechanism called Server Name Indication (SNI) for -//! clients to specify the name of the server they're trying to reach _before_ -//! the TLS session is established. We use this to determine which Silo -//! endpoint the client is trying to reach and so which TLS certificate to -//! present. -//! -//! To achieve this, we first need to know what DNS names, Silos, and TLS -//! certificates are available at any given time. This is summarized in -//! [`ExternalEndpoints`]. A background task is responsible for maintaining -//! this, providing the latest version to whoever needs it via a `watch` -//! channel. How do we tell the TLS stack what certificate to use? When -//! setting up the Dropshot server in the first place, we provide a -//! [`rustls::ServerConfig`] that describes various TLS settings, including an -//! "certificate resolver" object that impls -//! [`rustls::server::ResolvesServerCert`]. See [`NexusCertResolver`]. 
-
-use anyhow::anyhow;
-use anyhow::bail;
-use anyhow::Context;
-use nexus_db_model::AuthenticationMode;
-use nexus_db_model::Certificate;
-use nexus_db_model::DnsGroup;
-use nexus_db_queries::context::OpContext;
-use nexus_db_queries::db::datastore::Discoverability;
-use nexus_db_queries::db::fixed_data::silo::SILO_ID;
-use nexus_db_queries::db::model::ServiceKind;
-use nexus_db_queries::db::DataStore;
-use nexus_types::identity::Resource;
-use omicron_common::api::external::http_pagination::PaginatedBy;
-use omicron_common::api::external::DataPageParams;
-use omicron_common::api::external::Error;
-use omicron_common::bail_unless;
-use openssl::pkey::PKey;
-use openssl::x509::X509;
-use rustls::sign::CertifiedKey;
-use serde::Serialize;
-use serde_with::SerializeDisplay;
-use slog::{debug, error, o, trace, warn};
-use std::collections::btree_map::Entry;
-use std::collections::BTreeMap;
-use std::fmt;
-use std::num::NonZeroU32;
-use std::sync::Arc;
-use thiserror::Error;
-use tokio::sync::watch;
-use uuid::Uuid;
-
-/// Returns the (relative) DNS name for this Silo's API and console endpoints
-/// _within_ the external DNS zone (i.e., without that zone's suffix)
-///
-/// This specific naming scheme is determined under RFD 357.
-pub fn silo_dns_name(name: &omicron_common::api::external::Name) -> String {
-    // RFD 4 constrains resource names (including Silo names) to DNS-safe
-    // strings, which is why it's safe to directly put the name of the
-    // resource into the DNS name rather than doing any kind of escaping.
-    format!("{}.sys", name)
-}
-
-/// Describes the set of external endpoints, organized by DNS name
-///
-/// This data structure provides a quick way to determine which Silo and TLS
-/// certificate(s) make sense for an incoming request, based on the TLS
-/// session's SNI (DNS name). See module-level docs for details.
-///
-/// This object provides no interfaces outside this module. It's only used by
-/// the `NexusCertResolver` that's also in this module.
-///
-/// This structure impls `Serialize` only so that background tasks can easily
-/// present the latest configuration that they've found (e.g., via a debug API)
-#[derive(Clone, Debug, Eq, PartialEq, Serialize)]
-pub struct ExternalEndpoints {
-    by_dns_name: BTreeMap<String, Arc<ExternalEndpoint>>,
-    warnings: Vec<ExternalEndpointError>,
-    default_endpoint: Option<Arc<ExternalEndpoint>>,
-}
-
-impl ExternalEndpoints {
-    /// Assemble a list of Silos, TLS certificates, and external DNS zones into
-    /// a structure that we can use for quickly figuring out which Silo and TLS
-    /// certificates are associated with each incoming DNS name
-    pub fn new(
-        silos: Vec<nexus_db_model::Silo>,
-        certs: Vec<Certificate>,
-        external_dns_zones: Vec<nexus_db_model::DnsZone>,
-    ) -> ExternalEndpoints {
-        // We want to avoid failing this operation even if we encounter problems
-        // because we want to serve as many DNS certificates as we can (so that
-        // an operator has a chance of fixing any problems that do exist).
-        // Instead of returning any errors, keep track of any issues as
-        // warnings.
-        let mut warnings = vec![];
-
-        // Compute a mapping from external DNS name to Silo id. Detect any
-        // duplicates and leave them out (but report them). There should not
-        // be any duplicates since the DNS names are constructed from the
-        // (unique) Silo names. Even if we support aliases in the future, they
-        // will presumably need to be unique, too.
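        // (Illustrative aside, with hypothetical values: given silos "eng"
        // and "sales" plus external DNS zones "oxide1.test" and
        // "oxide2.test", the map built below would hold four entries:
        //   "eng.sys.oxide1.test"   => eng's silo id
        //   "eng.sys.oxide2.test"   => eng's silo id
        //   "sales.sys.oxide1.test" => sales' silo id
        //   "sales.sys.oxide2.test" => sales' silo id
        // A duplicate name would instead be reported as a DupDnsName
        // warning, as shown in the match arm below.)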
-        let silos_by_id: BTreeMap<Uuid, Arc<nexus_db_model::Silo>> = silos
-            .into_iter()
-            .map(|db_silo| (db_silo.id(), Arc::new(db_silo)))
-            .collect();
-        let mut dns_names: BTreeMap<String, Uuid> = BTreeMap::new();
-        for z in external_dns_zones {
-            for (_, db_silo) in &silos_by_id {
-                let dns_name = format!(
-                    "{}.{}",
-                    silo_dns_name(db_silo.name()),
-                    z.zone_name
-                );
-                match dns_names.entry(dns_name.clone()) {
-                    Entry::Vacant(vac) => {
-                        vac.insert(db_silo.id());
-                    }
-                    Entry::Occupied(occ) => {
-                        let first_silo_id = *occ.get();
-                        let first_silo_name = silos_by_id
-                            .get(&first_silo_id)
-                            .unwrap()
-                            .name()
-                            .to_string();
-                        warnings.push(ExternalEndpointError::DupDnsName {
-                            dup_silo_id: db_silo.id(),
-                            dup_silo_name: db_silo.name().to_string(),
-                            first_silo_id,
-                            first_silo_name,
-                            dns_name,
-                        })
-                    }
-                };
-            }
-        }
-
-        // Compute a mapping from silo id to a list of usable TLS certificates
-        // for the Silo. By "usable" here, we just mean that we are capable of
-        // providing it to the client. This basically means that we can parse
-        // it. A certificate might be invalid for some other reason (e.g., does
-        // not match the right DNS name or it's expired). We may later choose
-        // to prefer some certificates over others, but that'll be decided later
-        // (see best_certificate()). And in the end it'll be better to provide
-        // an expired certificate than none at all.
-        let parsed_certificates = certs.into_iter().map(|db_cert| {
-            let silo_id = db_cert.silo_id;
-            let tls_cert = TlsCertificate::try_from(db_cert).map_err(|e| {
-                ExternalEndpointError::BadCert { silo_id, reason: Arc::new(e) }
-            })?;
-            let db_silo = silos_by_id
-                .get(&silo_id)
-                .ok_or_else(|| ExternalEndpointError::BadCert {
-                    silo_id,
-                    reason: Arc::new(anyhow!("silo not found")),
-                })?
-                .clone();
-            Ok((silo_id, db_silo, tls_cert))
-        });
-
-        let mut certs_by_silo_id = BTreeMap::new();
-        for parsed_cert in parsed_certificates {
-            match parsed_cert {
-                Err(error) => {
-                    warnings.push(error);
-                }
-                Ok((silo_id, db_silo, tls_cert)) => {
-                    let silo_entry = certs_by_silo_id
-                        .entry(silo_id)
-                        .or_insert_with(|| ExternalEndpoint {
-                            silo_id,
-                            db_silo,
-                            tls_certs: Vec::new(),
-                        });
-                    silo_entry.tls_certs.push(tls_cert)
-                }
-            };
-        }
-
-        let certs_by_silo_id: BTreeMap<_, _> = certs_by_silo_id
-            .into_iter()
-            .map(|(k, v)| (k, Arc::new(v)))
-            .collect();
-
-        let by_dns_name: BTreeMap<_, _> = dns_names
-            .into_iter()
-            .map(|(dns_name, silo_id)| {
-                let silo_info = certs_by_silo_id
-                    .get(&silo_id)
-                    .cloned()
-                    .unwrap_or_else(|| {
-                        // For something to appear in `dns_names`, we must have
-                        // found it in `silos`, and so it must be in
-                        // `silos_by_id`.
-                        let db_silo =
-                            silos_by_id.get(&silo_id).unwrap().clone();
-                        Arc::new(ExternalEndpoint {
-                            silo_id,
-                            db_silo,
-                            tls_certs: vec![],
-                        })
-                    });
-
-                if silo_info.tls_certs.is_empty() {
-                    warnings.push(ExternalEndpointError::NoSiloCerts {
-                        silo_id,
-                        dns_name: dns_name.clone(),
-                    })
-                }
-
-                (dns_name, silo_info)
-            })
-            .collect();
-
-        if by_dns_name.is_empty() {
-            warnings.push(ExternalEndpointError::NoEndpoints);
-        }
-
-        // Pick a default endpoint. This will be used if a request arrives
-        // without specifying an endpoint via the HTTP/1.1 Host header or the
-        // HTTP2 URL. This is only intended for development, where external DNS
-        // may not be set up.
-        //
-        // We somewhat arbitrarily choose the first Silo we find that's not JIT.
-        // This would usually be the recovery Silo.
-        let default_endpoint = silos_by_id
-            .values()
-            .filter(|s| {
-                // Ignore the built-in Silo, which people are not supposed to
-                // log into.
-                s.id() != *SILO_ID
-            })
-            .find(|s| s.authentication_mode == AuthenticationMode::Local)
-            .and_then(|s| {
-                by_dns_name
-                    .iter()
-                    .find(|(_, endpoint)| endpoint.silo_id == s.id())
-                    .map(|(_, endpoint)| endpoint.clone())
-            });
-
-        ExternalEndpoints { by_dns_name, warnings, default_endpoint }
-    }
-
-    pub fn dns_names(&self) -> impl Iterator<Item = &String> {
-        self.by_dns_name.keys()
-    }
-
-    pub fn has_domain(&self, dns_name: &str) -> bool {
-        self.by_dns_name.contains_key(dns_name)
-    }
-
-    pub fn ndomains(&self) -> usize {
-        self.by_dns_name.len()
-    }
-
-    pub fn nwarnings(&self) -> usize {
-        self.warnings.len()
-    }
-}
-
-/// Describes a single external "endpoint", by which we mean an external DNS
-/// name that's associated with a particular Silo
-#[derive(Debug, PartialEq, Eq, Serialize)]
-pub struct ExternalEndpoint {
-    /// the id of the Silo associated with this endpoint
-    // This is redundant with `db_silo`, but it's convenient to put it here and
-    // it shows up in the serialized form this way.
-    silo_id: Uuid,
-    /// the silo associated with this endpoint
-    #[serde(skip)]
-    db_silo: Arc<nexus_db_model::Silo>,
-    /// the set of TLS certificate chains that could be appropriate for this
-    /// endpoint
-    tls_certs: Vec<TlsCertificate>,
-}
-
-impl ExternalEndpoint {
-    pub fn silo(&self) -> &nexus_db_model::Silo {
-        &self.db_silo
-    }
-
-    /// Chooses a TLS certificate (chain) to use when handling connections to
-    /// this endpoint
-    fn best_certificate(&self) -> Result<&TlsCertificate, anyhow::Error> {
-        // We expect the most common case to be that there's only one
-        // certificate chain here. The next most common case is that there are
-        // two because the administrator is in the process of rotating
-        // certificates, usually due to upcoming expiration. In principle, it
-        // would be useful to allow operators to control which certificate chain
-        // gets used, and maybe even do something like a canary to mitigate the
-        // risk of a botched certificate update. Absent that, we're going to do
-        // our best to pick the best chain automatically.
-        //
-        // This could be a lot more sophisticated than it is. We could try to
-        // avoid using certificates that are clearly not valid based on the
-        // "not_after" and "not_before" bounds. We could check each certificate
-        // in the chain, not just the last one. We could use a margin of error
-        // when doing this to account for small variations in the wall clock
-        // between us and the client. We could try to avoid using a certificate
-        // that doesn't appear to be compatible with the SNI value (DNS domain)
-        // that this request came in on.
-        //
-        // IMPORTANT: If we ever decide to do those things, they should only be
-        // used to decide which of several certificates is preferred. We should
-        // always pick a certificate if we possibly can, even if it seems to be
-        // invalid. A client can always choose not to trust it. But in the
-        // unfortunate case where there are no good certificates, a customer's
-        // only option may be to instruct their client to trust an invalid
-        // certificate _so that they can log in and fix the certificate
-        // problem_. If we provide no certificate at all here, a customer may
-        // have no way to fix the problem.
-        //
-        // Anyway, we don't yet do any of these things. For now, pick the
-        // certificate chain whose leaf certificate has the latest expiration
-        // time.
-
-        // This would be cleaner if Asn1Time impl'd Ord or even just a way to
-        // convert it to a Unix timestamp or any other comparable timestamp.
-        let mut latest_expiration: Option<&TlsCertificate> = None;
-        for t in &self.tls_certs {
-            // We'll choose this certificate (so far) if we find that it's
-            // anything other than "earlier" than the best we've seen so far.
-            // That includes the case where we haven't seen any so far, where
-            // this one is greater than or equal to the best so far, as well as
-            // the case where they're incomparable for whatever reason. (This
-            // ensures that we always pick at least one.)
-            if latest_expiration.is_none()
-                || !matches!(
-                    t.parsed.not_after().partial_cmp(
-                        latest_expiration.unwrap().parsed.not_after()
-                    ),
-                    Some(std::cmp::Ordering::Less)
-                )
-            {
-                latest_expiration = Some(t);
-            }
-        }
-
-        latest_expiration.ok_or_else(|| {
-            anyhow!("silo {} has no usable certificates", self.silo_id)
-        })
-    }
-}
-
-/// Describes a problem encountered while assembling an [`ExternalEndpoints`]
-/// object
-#[derive(Clone, Debug, Error, SerializeDisplay)]
-pub enum ExternalEndpointError {
-    #[error(
-        "ignoring silo {dup_silo_id} ({dup_silo_name:?}): has the same DNS \
-        name ({dns_name:?}) as previously-found silo {first_silo_id} \
-        ({first_silo_name:?})"
-    )]
-    DupDnsName {
-        dup_silo_id: Uuid,
-        dup_silo_name: String,
-        first_silo_id: Uuid,
-        first_silo_name: String,
-        dns_name: String,
-    },
-
-    #[error("ignoring certificate for silo {silo_id}: {reason:#}")]
-    BadCert {
-        silo_id: Uuid,
-        #[source]
-        reason: Arc<anyhow::Error>,
-    },
-
-    #[error(
-        "silo {silo_id} with DNS name {dns_name:?} has no usable certificates"
-    )]
-    NoSiloCerts { silo_id: Uuid, dns_name: String },
-
-    #[error("no external endpoints were found")]
-    NoEndpoints,
-}
-
-impl Eq for ExternalEndpointError {}
-impl PartialEq for ExternalEndpointError {
-    fn eq(&self, other: &Self) -> bool {
-        self.to_string() == other.to_string()
-    }
-}
-
-/// A parsed, validated TLS certificate ready to use with an external TLS server
-#[derive(Serialize)]
-#[serde(transparent)]
-struct TlsCertificate {
-    /// This is what we need to provide to the TLS stack when we decide to use
-    /// this certificate for an incoming TLS connection
-    // NOTE: It's important that we do not serialize the private key!
-    #[serde(skip)]
-    certified_key: Arc<CertifiedKey>,
-
-    /// Parsed representation of the whole certificate chain
-    ///
-    /// This is used to extract metadata like the expiration time.
-    // NOTE: It's important that we do not serialize the private key!
-    #[serde(skip)]
-    parsed: X509,
-
-    /// certificate digest (historically sometimes called a "fingerprint")
-    // This is the only field that appears in the serialized output or debug
-    // output.
-    digest: String,
-}
-
-impl fmt::Debug for TlsCertificate {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        // It's important that only the digest appear in the debug output. We
-        // definitely don't want to leak the private key this way. Really,
-        // we don't want even the public parts adding noise to debug output.
-        f.debug_struct("TlsCertificate").field("digest", &self.digest).finish()
-    }
-}
-
-impl Eq for TlsCertificate {}
-impl PartialEq for TlsCertificate {
-    fn eq(&self, other: &Self) -> bool {
-        self.digest == other.digest
-    }
-}
-
-impl TryFrom<Certificate> for TlsCertificate {
-    type Error = anyhow::Error;
-
-    fn try_from(db_cert: Certificate) -> Result<TlsCertificate, Self::Error> {
-        // Parse and validate what we've got.
-        let certs_pem = openssl::x509::X509::stack_from_pem(&db_cert.cert)
-            .context("parsing PEM stack")?;
-        let private_key = PKey::private_key_from_pem(&db_cert.key)
-            .context("parsing private key PEM")?;
-
-        // Assemble a rustls CertifiedKey with both the certificate and the key.
-        let certified_key = {
-            let mut cursor = std::io::Cursor::new(db_cert.key.clone());
-            let rustls_private_key = rustls_pemfile::private_key(&mut cursor)
-                .expect("parsing private key PEM")
-                .expect("no private keys found");
-            let rustls_signing_key =
-                rustls::crypto::ring::sign::any_supported_type(
-                    &rustls_private_key,
-                )
-                .context("parsing DER private key")?;
-            let rustls_certs = certs_pem
-                .iter()
-                .map(|x509| {
-                    x509.to_der()
-                        .context("serializing cert to DER")
-                        .map(rustls::pki_types::CertificateDer::from)
-                })
-                .collect::<Result<Vec<_>, _>>()?;
-            Arc::new(CertifiedKey::new(rustls_certs, rustls_signing_key))
-        };
-
-        let end_cert = certs_pem
-            .into_iter()
-            .next()
-            .ok_or_else(|| anyhow!("no certificates in PEM stack"))?;
-        anyhow::ensure!(
-            end_cert
-                .public_key()
-                .context("certificate publickey")?
-                .public_eq(&private_key),
-            "certificate public key does not match stored private key"
-        );
-
-        // Compute a digest (fingerprint) that we can use for debugging.
-        let digest = {
-            let digest_bytes = end_cert
-                .digest(openssl::hash::MessageDigest::sha256())
-                .context("computing fingerprint")?;
-            hex::encode(&digest_bytes)
-        };
-
-        Ok(TlsCertificate { certified_key, digest, parsed: end_cert })
-    }
-}
-
-/// Read the lists of all Silos, external DNS zones, and external TLS
-/// certificates from the database and assemble an `ExternalEndpoints` structure
-/// that describes what DNS names exist, which Silos they correspond to, and
-/// what TLS certificates can be used for them
-// This structure is used to determine what TLS certificates are used for
-// incoming connections to the external console/API endpoints. As such, it's
-// critical that we produce a usable result if at all possible, even if it's
-// incomplete. Otherwise, we won't be able to serve _any_ incoming connections
-// to _any_ of our external endpoints! If data from the database is invalid or
-// inconsistent, that data is discarded and a warning is produced, but we'll
-// still return a usable object.
-pub async fn read_all_endpoints(
-    datastore: &DataStore,
-    opctx: &OpContext,
-) -> Result<ExternalEndpoints, Error> {
-    // We will not look for more than this number of external DNS zones, Silos,
-    // or certificates. We do not expect very many of any of these objects.
-    const MAX: u32 = 200;
-    let pagparams_id = DataPageParams {
-        marker: None,
-        limit: NonZeroU32::new(MAX).unwrap(),
-        direction: dropshot::PaginationOrder::Ascending,
-    };
-    let pagbyid = PaginatedBy::Id(pagparams_id);
-    let pagparams_name = DataPageParams {
-        marker: None,
-        limit: NonZeroU32::new(MAX).unwrap(),
-        direction: dropshot::PaginationOrder::Ascending,
-    };
-
-    let silos =
-        datastore.silos_list(opctx, &pagbyid, Discoverability::All).await?;
-    let external_dns_zones = datastore
-        .dns_zones_list(opctx, DnsGroup::External, &pagparams_name)
-        .await?;
-    bail_unless!(
-        !external_dns_zones.is_empty(),
-        "expected at least one external DNS zone"
-    );
-    let certs = datastore
-        .certificate_list_for(opctx, Some(ServiceKind::Nexus), &pagbyid, false)
-        .await?;
-
-    // If we found too many of any of these things, complain as loudly as we
-    // can. Our results will be wrong. But we still don't want to fail if we
-    // can avoid it because we want to be able to serve as many endpoints as we
-    // can.
-    // TODO-reliability we should prevent people from creating more than this
-    // maximum number of Silos and certificates.
-    let max = usize::try_from(MAX).unwrap();
-    if silos.len() >= max {
-        error!(
-            &opctx.log,
-            "reading endpoints: expected at most {} silos, but found at \
-            least {}. TLS may not work on some Silos' external endpoints.",
-            MAX,
-            silos.len(),
-        );
-    }
-    if external_dns_zones.len() >= max {
-        error!(
-            &opctx.log,
-            "reading endpoints: expected at most {} external DNS zones, but \
-            found at least {}. TLS may not work on some Silos' external \
-            endpoints.",
-            MAX,
-            external_dns_zones.len(),
-        );
-    }
-    if certs.len() >= max {
-        error!(
-            &opctx.log,
-            "reading endpoints: expected at most {} certificates, but \
-            found at least {}. TLS may not work on some Silos' external \
-            endpoints.",
-            MAX,
-            certs.len(),
-        );
-    }
-
-    Ok(ExternalEndpoints::new(silos, certs, external_dns_zones))
-}
-
-/// TLS SNI certificate resolver for use with rustls/Dropshot
-///
-/// This object exists to impl `rustls::server::ResolvesServerCert`. This
-/// object looks at an incoming TLS session's SNI field, matches it against the
-/// latest `ExternalEndpoints` configuration (available via a watch channel),
-/// and then determines which certificate (if any) to provide for the new
-/// session.
-///
-/// See the module-level comment for more details.
-#[derive(Debug)]
-pub struct NexusCertResolver {
-    log: slog::Logger,
-    config_rx: watch::Receiver<Option<ExternalEndpoints>>,
-}
-
-impl NexusCertResolver {
-    pub fn new(
-        log: slog::Logger,
-        config_rx: watch::Receiver<Option<ExternalEndpoints>>,
-    ) -> NexusCertResolver {
-        NexusCertResolver { log, config_rx }
-    }
-
-    fn do_resolve_endpoint(
-        &self,
-        server_name: Option<&str>,
-    ) -> Result<Arc<ExternalEndpoint>, anyhow::Error> {
-        let Some(server_name) = server_name else {
-            bail!("TLS session had no server name")
-        };
-
-        let config_ref = self.config_rx.borrow();
-        let config = match &*config_ref {
-            Some(c) => c,
-            None => bail!("no TLS config found"),
-        };
-
-        config
-            .by_dns_name
-            .get(server_name)
-            .ok_or_else(|| anyhow!("unrecognized server name: {}", server_name))
-            .cloned()
-    }
-
-    fn do_resolve(
-        &self,
-        server_name: Option<&str>,
-    ) -> Option<Arc<CertifiedKey>> {
-        let log =
-            self.log.new(o!("server_name" => server_name.map(String::from)));
-
-        trace!(&log, "resolving TLS certificate");
-        let resolved = self.do_resolve_endpoint(server_name);
-        let result = match resolved {
-            Ok(ref endpoint) => match endpoint.best_certificate() {
-                Ok(certificate) => Ok((endpoint.silo_id, certificate)),
-                Err(error) => Err(error),
-            },
-            Err(error) => Err(error),
-        };
-        match result {
-            Ok((silo_id, certificate)) => {
-                debug!(log, "resolved TLS certificate";
-                    "silo_id" => silo_id.to_string(),
-                    "certificate" => ?certificate
-                );
-                Some(certificate.certified_key.clone())
-            }
-            Err(error) => {
-                // TODO-security There is a (limited) DoS risk here, in that the
-                // client controls the request made to this endpoint and we're
-                // going to emit something to the log every time this happens.
-                // But at this stage it's pretty valuable to be able to debug
-                // this problem.
-                warn!(
-                    log,
-                    "failed to resolve TLS certificate";
-                    "error" => format!("{:#}", error),
-                );
-                None
-            }
-        }
-    }
-}
-
-impl rustls::server::ResolvesServerCert for NexusCertResolver {
-    fn resolve(
-        &self,
-        client_hello: rustls::server::ClientHello,
-    ) -> Option<Arc<CertifiedKey>> {
-        let server_name = client_hello.server_name();
-        self.do_resolve(server_name)
-    }
-}
-
-/// Returns the host and port of the server that the client is trying to
-/// reach
-///
-/// Recall that Nexus serves many external endpoints on the same set of IP
-/// addresses, each corresponding to a particular Silo. We use the standard
-/// HTTP 1.1 "host" header or HTTP2 URI authority to determine which
-/// Silo's endpoint the client is trying to reach.
-pub fn authority_for_request(
-    rqinfo: &dropshot::RequestInfo,
-) -> Result<http::uri::Authority, String> {
-    if rqinfo.version() > hyper::Version::HTTP_11 {
-        // For HTTP2, the server name is specified in the URL's "authority".
-        rqinfo
-            .uri()
-            .authority()
-            .cloned()
-            .ok_or_else(|| String::from("request URL missing authority"))
-    } else {
-        // For earlier versions of HTTP, the server name is specified by the
-        // "Host" header.
-        rqinfo
-            .headers()
-            .get(http::header::HOST)
-            .ok_or_else(|| String::from("request missing \"host\" header"))?
-            .to_str()
-            .map_err(|e| format!("failed to decode \"host\" header: {:#}", e))
-            .and_then(|hostport| {
-                hostport.parse().map_err(|e| {
-                    format!("unsupported \"host\" header: {:#}", e)
-                })
-            })
-    }
-}
-
-// See `Nexus::endpoint_for_request()`. This is factored out to be able to test
-// it without a whole server.
-pub fn endpoint_for_authority(
-    log: &slog::Logger,
-    requested_authority: &http::uri::Authority,
-    config_rx: &tokio::sync::watch::Receiver<Option<ExternalEndpoints>>,
-) -> Result<Arc<ExternalEndpoint>, Error> {
-    let requested_host = requested_authority.host();
-    let log = log.new(o!("server_name" => requested_host.to_string()));
-    trace!(&log, "determining endpoint");
-
-    // If we have not successfully loaded the endpoint configuration yet,
-    // there's nothing we can do here. We could try to do better (e.g., use
-    // the recovery Silo?). But if we failed to load endpoints, it's likely
-    // the database is down, and we're not going to get much further anyway.
-    let endpoint_config = config_rx.borrow();
-    let endpoints = endpoint_config.as_ref().ok_or_else(|| {
-        error!(&log, "received request with no endpoints loaded");
-        Error::unavail("endpoints not loaded")
-    })?;
-
-    // See if there's an endpoint for the requested name. If so, use it.
-    if let Some(endpoint) = endpoints.by_dns_name.get(requested_host) {
-        trace!(
-            &log,
-            "received request for endpoint";
-            "silo_name" => ?endpoint.db_silo.name(),
-            "silo_id" => ?endpoint.silo_id,
-        );
-
-        return Ok(endpoint.clone());
-    }
-
-    // There was no endpoint for the requested name. This should generally
-    // not happen in deployed systems where we expect people to have set up
-    // DNS to find the external endpoints. But in development, we don't
-    // always have DNS set up. People may use an IP address to get here.
-    // To accommodate this use case, we make a best-effort to pick a default
-    // endpoint when we can't find one for the name we were given.
-    //
-    // If this ever does happen in a production system, this might be
-    // confusing. The best thing to do in a production system is probably
-    // to return an error saying that the requested server name was unknown.
-    // Instead, we'll wind up choosing some Silo here. This has no impact
-    // on authenticated requests because for those we use the authenticated
-    // identity's Silo. (That's as of this writing. Again, we may want to
-    // disallow this and produce an error instead.) If the request is not
-    // authenticated, we may wind up sending them to a login page for this
-    // Silo that may not be the Silo they meant.
-    endpoints
-        .default_endpoint
-        .as_ref()
-        .ok_or_else(|| {
-            error!(
-                &log,
-                "received request for unknown host and no default \
-                endpoint is available",
-            );
-            Error::invalid_request(&format!(
-                "HTTP request for unknown server name {:?}",
-                requested_host,
-            ))
-        })
-        .map(|c| c.clone())
-}
-
-#[cfg(test)]
-mod test {
-    use super::authority_for_request;
-    use super::endpoint_for_authority;
-    use super::ExternalEndpointError;
-    use super::ExternalEndpoints;
-    use super::NexusCertResolver;
-    use super::TlsCertificate;
-    use chrono::Utc;
-    use dropshot::endpoint;
-    use dropshot::test_util::LogContext;
-    use dropshot::ConfigLogging;
-    use dropshot::ConfigLoggingIfExists;
-    use dropshot::ConfigLoggingLevel;
-    use http::uri::Authority;
-    use nexus_db_model::Certificate;
-    use nexus_db_model::DnsGroup;
-    use nexus_db_model::DnsZone;
-    use nexus_db_model::ServiceKind;
-    use nexus_db_model::Silo;
-    use nexus_types::external_api::params;
-    use nexus_types::external_api::shared;
-    use nexus_types::identity::Resource;
-    use omicron_common::api::external::Error;
-    use omicron_common::api::external::IdentityMetadataCreateParams;
-    use schemars::JsonSchema;
-    use serde::Deserialize;
-    use serde::Serialize;
-    use std::net::SocketAddr;
-    use uuid::Uuid;
-
-    fn create_silo(silo_id: Option<Uuid>, name: &str, saml: bool) -> Silo {
-        let identity_mode = if saml {
-            shared::SiloIdentityMode::SamlJit
-        } else {
-            shared::SiloIdentityMode::LocalOnly
-        };
-        let params = params::SiloCreate {
-            identity: IdentityMetadataCreateParams {
-                name: name.parse().unwrap(),
-                description: String::new(),
-            },
-            quotas: params::SiloQuotasCreate::empty(),
-            discoverable: false,
-            identity_mode,
-            admin_group_name: None,
-            tls_certificates: vec![],
-            mapped_fleet_roles: Default::default(),
-        };
-
-        if let Some(silo_id) = silo_id {
-            Silo::new_with_id(silo_id, params)
-        } else {
-            Silo::new(params)
-        }
-        .unwrap()
-    }
-
-    fn create_certificate(
-        domain: &str,
-        expired: bool,
-    ) -> params::CertificateCreate {
-        let mut cert_params =
-            rcgen::CertificateParams::new(vec![domain.to_string()]);
-        if expired {
-            cert_params.not_after = std::time::UNIX_EPOCH.into();
-        }
-        let cert = rcgen::Certificate::from_params(cert_params).unwrap();
-        let cert_pem =
-            cert.serialize_pem().expect("serializing certificate as PEM");
-        let key_pem = cert.serialize_private_key_pem();
-        let namestr = format!("cert-for-{}", domain.replace('.', "-"));
-        params::CertificateCreate {
-            identity: IdentityMetadataCreateParams {
-                name: namestr.parse().unwrap(),
-                description: String::new(),
-            },
-            cert: cert_pem,
-            key: key_pem,
-            service: shared::ServiceUsingCertificate::ExternalApi,
-        }
-    }
-
-    fn create_dns_zone(domain: &str) -> DnsZone {
-        DnsZone {
-            id: Uuid::new_v4(),
-            time_created: Utc::now(),
-            dns_group: DnsGroup::External,
-            zone_name: format!("{}.test", domain),
-        }
-    }
-
-    fn cert_matches(tls_cert: &TlsCertificate, cert: &Certificate) -> bool {
-        let parse_right = openssl::x509::X509::from_pem(&cert.cert).unwrap();
-        tls_cert.parsed == parse_right
-    }
-
-    #[test]
-    fn test_external_endpoints_empty() {
-        // Truly trivial case: no endpoints at all.
- let ee1 = ExternalEndpoints::new(vec![], vec![], vec![]); - assert_eq!(ee1.ndomains(), 0); - assert_eq!(ee1.nwarnings(), 1); - assert_eq!( - ee1.warnings[0].to_string(), - "no external endpoints were found" - ); - assert!(ee1.default_endpoint.is_none()); - - // There are also no endpoints if there's a Silo but no external DNS - // zones. - let silo_id: Uuid = - "6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3".parse().unwrap(); - let silo = create_silo(Some(silo_id), "dummy", false); - let ee2 = ExternalEndpoints::new(vec![silo], vec![], vec![]); - assert_eq!(ee2.ndomains(), 0); - assert_eq!(ee2.nwarnings(), 1); - assert_eq!( - ee2.warnings[0].to_string(), - "no external endpoints were found" - ); - assert!(ee2.default_endpoint.is_none()); - // Test PartialEq impl. - assert_eq!(ee1, ee2); - - // There are also no endpoints if there's an external DNS zone but no - // Silo. - let dns_zone1 = create_dns_zone("oxide1"); - let ee2 = ExternalEndpoints::new(vec![], vec![], vec![dns_zone1]); - assert_eq!(ee2.ndomains(), 0); - assert_eq!(ee2.nwarnings(), 1); - assert_eq!( - ee2.warnings[0].to_string(), - "no external endpoints were found" - ); - assert!(ee2.default_endpoint.is_none()); - // Test PartialEq impl. - assert_eq!(ee1, ee2); - - // Finally, there are no endpoints if there's a certificate and nothing - // else. This isn't really valid. But it's useful to verify that we - // won't crash or otherwise fail if we get a certificate with an invalid - // silo_id. - let cert_create = create_certificate("dummy.sys.oxide1.test", false); - let cert = Certificate::new( - silo_id, - Uuid::new_v4(), - ServiceKind::Nexus, - cert_create, - &["dummy.sys.oxide1.test".to_string()], - ) - .unwrap(); - let ee2 = ExternalEndpoints::new(vec![], vec![cert], vec![]); - assert_eq!(ee2.ndomains(), 0); - assert_eq!(ee2.nwarnings(), 2); - assert!(ee2.warnings[0].to_string().contains("silo not found"),); - assert_eq!( - ee2.warnings[1].to_string(), - "no external endpoints were found" - ); - assert!(ee2.default_endpoint.is_none()); - } - - #[test] - fn test_external_endpoints_basic() { - // Empty case for comparison. - let ee1 = ExternalEndpoints::new(vec![], vec![], vec![]); - - // Sample data - let silo_id: Uuid = - "6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3".parse().unwrap(); - let silo = create_silo(Some(silo_id), "dummy", false); - let dns_zone1 = create_dns_zone("oxide1"); - let cert_create = create_certificate("dummy.sys.oxide1.test", false); - let cert = Certificate::new( - silo_id, - Uuid::new_v4(), - ServiceKind::Nexus, - cert_create, - &["dummy.sys.oxide1.test".to_string()], - ) - .unwrap(); - - // Simple case: one silo, one DNS zone. We should see an endpoint for - // the Silo. Since it has no certificates, we'll get a warning. - let ee3 = ExternalEndpoints::new( - vec![silo.clone()], - vec![], - vec![dns_zone1.clone()], - ); - // Test PartialEq impl. - assert_ne!(ee1, ee3); - assert_eq!(ee3.ndomains(), 1); - assert!(ee3.has_domain("dummy.sys.oxide1.test")); - assert_eq!(ee3.nwarnings(), 1); - assert_eq!( - ee3.warnings[0].to_string(), - "silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 with DNS name \ - \"dummy.sys.oxide1.test\" has no usable certificates" - ); - // This also exercises best_certificate() with zero certificates. - assert_eq!( - ee3.by_dns_name["dummy.sys.oxide1.test"] - .best_certificate() - .unwrap_err() - .to_string(), - "silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 has no usable \ - certificates" - ); - assert_eq!(ee3.default_endpoint.as_ref().unwrap().silo_id, silo_id); - - // Now try with a certificate. 
- let ee4 = ExternalEndpoints::new( - vec![silo.clone()], - vec![cert.clone()], - vec![dns_zone1.clone()], - ); - assert_ne!(ee3, ee4); - assert_eq!(ee4.ndomains(), 1); - assert!(ee4.has_domain("dummy.sys.oxide1.test")); - assert_eq!(ee4.nwarnings(), 0); - let endpoint = &ee4.by_dns_name["dummy.sys.oxide1.test"]; - assert_eq!(endpoint.silo_id, silo_id); - assert_eq!(endpoint.tls_certs.len(), 1); - assert!(cert_matches(&endpoint.tls_certs[0], &cert)); - // This also exercises best_certificate() with one certificate. - assert_eq!( - *endpoint.best_certificate().unwrap(), - endpoint.tls_certs[0] - ); - assert_eq!(ee4.default_endpoint.as_ref().unwrap().silo_id, silo_id); - - // Add a second external DNS zone. There should now be two endpoints, - // both pointing to the same Silo. - let dns_zone2 = DnsZone { - id: Uuid::new_v4(), - time_created: Utc::now(), - dns_group: DnsGroup::External, - zone_name: String::from("oxide2.test"), - }; - let ee5 = ExternalEndpoints::new( - vec![silo.clone()], - vec![cert.clone()], - vec![dns_zone1.clone(), dns_zone2], - ); - assert_ne!(ee4, ee5); - assert_eq!(ee5.ndomains(), 2); - assert!(ee5.has_domain("dummy.sys.oxide1.test")); - assert!(ee5.has_domain("dummy.sys.oxide2.test")); - assert_eq!(ee5.nwarnings(), 0); - assert_eq!(ee5.default_endpoint.as_ref().unwrap().silo_id, silo_id); - let endpoint1 = &ee5.by_dns_name["dummy.sys.oxide1.test"]; - let endpoint2 = &ee5.by_dns_name["dummy.sys.oxide2.test"]; - assert_eq!(endpoint1, endpoint2); - assert_eq!(endpoint1.silo_id, silo_id); - assert_eq!(endpoint1.tls_certs.len(), 1); - assert_eq!(endpoint2.silo_id, silo_id); - assert_eq!(endpoint2.tls_certs.len(), 1); - - // Add a second Silo with the same name as the first one. This should - // not be possible in practice. In the future, we expect other features - // (e.g., DNS aliases) to make it possible for silos' DNS names to - // overlap like this. - let silo2_same_name_id = - "e3f36f20-56c3-c545-8320-c19d98b82c1d".parse().unwrap(); - let silo2_same_name = - create_silo(Some(silo2_same_name_id), "dummy", false); - let ee6 = ExternalEndpoints::new( - vec![silo, silo2_same_name], - vec![cert], - vec![dns_zone1], - ); - assert_ne!(ee5, ee6); - assert_eq!(ee6.ndomains(), 1); - assert!(ee6.has_domain("dummy.sys.oxide1.test")); - assert_eq!(ee6.default_endpoint.as_ref().unwrap().silo_id, silo_id); - let endpoint = &ee6.by_dns_name["dummy.sys.oxide1.test"]; - assert_eq!(endpoint.silo_id, silo_id); - assert_eq!(endpoint.tls_certs.len(), 1); - assert_eq!(ee6.nwarnings(), 1); - assert_eq!( - ee6.warnings[0].to_string(), - "ignoring silo e3f36f20-56c3-c545-8320-c19d98b82c1d (\"dummy\"): \ - has the same DNS name (\"dummy.sys.oxide1.test\") as \ - previously-found silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 \ - (\"dummy\")" - ); - } - - #[test] - fn test_external_endpoints_complex() { - // Set up a somewhat complex scenario: - // - // - four Silos - // - silo1: two certificates, one of which is expired - // - silo2: two certificates, one of which is expired - // (in the other order to make sure it's not working by accident) - // - silo3: one certificate that is invalid - // - silo4: one certificate that is expired - // - two DNS zones - // - // We should wind up with eight endpoints and one warning. 
- let silo1 = create_silo(None, "silo1", true); - let silo2 = create_silo(None, "silo2", true); - let silo3 = create_silo(None, "silo3", false); - let silo4 = create_silo(None, "silo4", true); - let silo1_cert1_params = - create_certificate("silo1.sys.oxide1.test", false); - let silo1_cert1 = Certificate::new( - silo1.identity().id, - Uuid::new_v4(), - ServiceKind::Nexus, - silo1_cert1_params, - &["silo1.sys.oxide1.test".to_string()], - ) - .unwrap(); - let silo1_cert2_params = - create_certificate("silo1.sys.oxide1.test", true); - let silo1_cert2 = Certificate::new_unvalidated( - silo1.identity().id, - Uuid::new_v4(), - ServiceKind::Nexus, - silo1_cert2_params, - ); - let silo2_cert1_params = - create_certificate("silo2.sys.oxide1.test", true); - let silo2_cert1 = Certificate::new_unvalidated( - silo2.identity().id, - Uuid::new_v4(), - ServiceKind::Nexus, - silo2_cert1_params, - ); - let silo2_cert2_params = - create_certificate("silo2.sys.oxide1.test", false); - let silo2_cert2 = Certificate::new( - silo2.identity().id, - Uuid::new_v4(), - ServiceKind::Nexus, - silo2_cert2_params, - &["silo2.sys.oxide1.test".to_string()], - ) - .unwrap(); - let silo3_cert_params = - create_certificate("silo3.sys.oxide1.test", false); - let mut silo3_cert = Certificate::new( - silo3.identity().id, - Uuid::new_v4(), - ServiceKind::Nexus, - silo3_cert_params, - &["silo3.sys.oxide1.test".to_string()], - ) - .unwrap(); - // Corrupt a byte of this last certificate. (This has to be done after - // constructing it or we would fail validation.) - silo3_cert.cert[0] ^= 1; - let silo4_cert_params = - create_certificate("silo4.sys.oxide1.test", true); - let silo4_cert = Certificate::new_unvalidated( - silo4.identity().id, - Uuid::new_v4(), - ServiceKind::Nexus, - silo4_cert_params, - ); - let dns_zone1 = create_dns_zone("oxide1"); - let dns_zone2 = create_dns_zone("oxide2"); - - let ee = ExternalEndpoints::new( - vec![silo1.clone(), silo2.clone(), silo3.clone(), silo4.clone()], - vec![ - silo1_cert1.clone(), - silo1_cert2.clone(), - silo2_cert1, - silo2_cert2.clone(), - silo3_cert.clone(), - silo4_cert.clone(), - ], - vec![dns_zone1, dns_zone2], - ); - println!("{:?}", ee); - assert_eq!(ee.ndomains(), 8); - assert_eq!(ee.nwarnings(), 3); - assert_eq!( - 2, - ee.warnings - .iter() - .filter(|warning| matches!(warning, - ExternalEndpointError::NoSiloCerts { silo_id, .. } - if *silo_id == silo3.id() - )) - .count() - ); - assert_eq!( - 1, - ee.warnings - .iter() - .filter(|warning| matches!(warning, - ExternalEndpointError::BadCert { silo_id, .. } - if *silo_id == silo3.id() - )) - .count() - ); - - assert_eq!( - ee.by_dns_name["silo1.sys.oxide1.test"], - ee.by_dns_name["silo1.sys.oxide2.test"] - ); - assert_eq!( - ee.by_dns_name["silo2.sys.oxide1.test"], - ee.by_dns_name["silo2.sys.oxide2.test"] - ); - assert_eq!( - ee.by_dns_name["silo3.sys.oxide1.test"], - ee.by_dns_name["silo3.sys.oxide2.test"] - ); - assert_eq!( - ee.by_dns_name["silo4.sys.oxide1.test"], - ee.by_dns_name["silo4.sys.oxide2.test"] - ); - assert_eq!( - ee.default_endpoint.as_ref().unwrap().silo_id, - silo3.identity().id - ); - - let e1 = &ee.by_dns_name["silo1.sys.oxide1.test"]; - assert_eq!(e1.silo_id, silo1.id()); - let c1 = e1.best_certificate().unwrap(); - // It must be cert1 because cert2 is expired. - assert!(cert_matches(c1, &silo1_cert1)); - - let e2 = &ee.by_dns_name["silo2.sys.oxide1.test"]; - assert_eq!(e2.silo_id, silo2.id()); - let c2 = e2.best_certificate().unwrap(); - // It must be cert2 because cert1 is expired. 
- assert!(cert_matches(c2, &silo2_cert2)); - assert!(!cert_matches(c2, &silo1_cert1)); - assert!(!cert_matches(c2, &silo1_cert2)); - - let e3 = &ee.by_dns_name["silo3.sys.oxide1.test"]; - assert_eq!(e3.silo_id, silo3.id()); - assert!(e3.best_certificate().is_err()); - - // We should get an expired cert if it's the only option. - let e4 = &ee.by_dns_name["silo4.sys.oxide1.test"]; - assert_eq!(e4.silo_id, silo4.id()); - let c4 = e4.best_certificate().unwrap(); - assert!(cert_matches(c4, &silo4_cert)); - - // - // Test endpoint lookup by authority. - // - let logctx = LogContext::new( - "test_external_endpoints_complex", - &ConfigLogging::File { - level: ConfigLoggingLevel::Trace, - path: "UNUSED".into(), - if_exists: ConfigLoggingIfExists::Append, - }, - ); - let log = &logctx.log; - let (_, watch_rx) = tokio::sync::watch::channel(Some(ee.clone())); - - // Basic cases: look up a few Silos by name. - let authority = Authority::from_static("silo1.sys.oxide1.test"); - let ae1 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); - assert_eq!(ae1, *e1); - let authority = Authority::from_static("silo1.sys.oxide2.test"); - let ae1 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); - assert_eq!(ae1, *e1); - let authority = Authority::from_static("silo2.sys.oxide1.test"); - let ae2 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); - assert_eq!(ae2, *e2); - // The port number in the authority should be ignored. - let authority = Authority::from_static("silo3.sys.oxide1.test:456"); - let ae3 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); - assert_eq!(ae3, *e3); - // We should get back a default endpoint if we use a server name that's - // not known. That includes any IPv4 or IPv6 address, too. The default - // endpoint should always be silo3 because it's the only one we've - // created LocalOnly. - for name in [ - "springfield.sys.oxide1.test", - "springfield.sys.oxide1.test:123", - "10.1.2.3:456", - "[fe80::1]:789", - ] { - let authority = Authority::from_static(name); - let ae = - endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); - assert_eq!(ae, *e3); - } - - // - // Now test the NexusCertResolver. - // - let (watch_tx, watch_rx) = tokio::sync::watch::channel(None); - let cert_resolver = - NexusCertResolver::new(logctx.log.clone(), watch_rx); - - // At this point we haven't filled in the configuration so any attempt - // to resolve anything should fail. - assert!(cert_resolver - .do_resolve(Some("silo1.sys.oxide1.test")) - .is_none()); - - // Now pass along the configuration and try again. - watch_tx.send(Some(ee.clone())).unwrap(); - let resolved_c1 = - cert_resolver.do_resolve(Some("silo1.sys.oxide1.test")).unwrap(); - assert_eq!(resolved_c1.cert, c1.certified_key.cert); - let resolved_c2 = - cert_resolver.do_resolve(Some("silo2.sys.oxide1.test")).unwrap(); - assert_eq!(resolved_c2.cert, c2.certified_key.cert); - assert!(cert_resolver - .do_resolve(Some("silo3.sys.oxide1.test")) - .is_none()); - // We should get an expired cert if it's the only option. - let resolved_c4 = - cert_resolver.do_resolve(Some("silo4.sys.oxide1.test")).unwrap(); - assert_eq!(resolved_c4.cert, c4.certified_key.cert); - - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_authority() { - // Tests for authority_for_request(). The function itself is pretty - // simple. That makes it easy to test fairly exhaustively. 
It's also - // useful to verify that we're doing what we think we're doing - // (identifying the name that the client thinks they're connecting to). - - // First, set up a Dropshot server that just echoes back whatever - // authority_for_request() returns for a given request. - let logctx = omicron_test_utils::dev::test_setup_log("test_authority"); - let mut api = dropshot::ApiDescription::new(); - api.register(echo_server_name).unwrap(); - let server = dropshot::HttpServerStarter::new( - &dropshot::ConfigDropshot::default(), - api, - (), - &logctx.log, - ) - .expect("failed to create dropshot server") - .start(); - let local_addr = server.local_addr(); - let port = local_addr.port(); - - #[derive(Debug, PartialEq, Eq, JsonSchema, Serialize, Deserialize)] - struct AuthorityResponse { - host: String, - port: Option, - } - - #[endpoint(method = GET, path = "/server_name")] - async fn echo_server_name( - rqctx: dropshot::RequestContext<()>, - ) -> Result< - dropshot::HttpResponseOk>, - dropshot::HttpError, - > { - Ok(dropshot::HttpResponseOk( - authority_for_request(&rqctx.request).map(|authority| { - AuthorityResponse { - host: authority.host().to_string(), - port: authority.port_u16(), - } - }), - )) - } - - // Generally, the "authority" for a request is determined by the URL - // provided to the client. We can test basically two cases this way: an - // authority with a host and port and an authority with an IP address - // and port. We can't test any cases that require the client to connect - // to a different host/port than what's in the URL. So we can't test - // the case of an authority with no port number in it (since our server - // doesn't run on port 80). - // - // With HTTP 1.1, you can generally override the authority by specifying - // your own "host" header. That lets us exercise the case of an - // authority that has no port number, even though the client would be - // connecting to a URL with a port number in it. It might also let us - // test other cases, like an authority with an invalid DNS name. - // However, it's not clear any of this is possible with HTTP 2 or later. 
- - async fn test_v2_host( - hostname: &str, - addr: SocketAddr, - ) -> AuthorityResponse { - let v2_client = reqwest::ClientBuilder::new() - .http2_prior_knowledge() - .resolve(hostname, addr) - .build() - .unwrap(); - test_request(&v2_client, &format!("{}:{}", hostname, addr.port())) - .await - } - - async fn test_v2_ip(addr: SocketAddr) -> AuthorityResponse { - let v2_client = reqwest::ClientBuilder::new() - .http2_prior_knowledge() - .build() - .unwrap(); - test_request(&v2_client, &addr.to_string()).await - } - - async fn test_v1_host( - hostname: &str, - addr: SocketAddr, - override_host: Option<&str>, - ) -> AuthorityResponse { - let mut v1_builder = reqwest::ClientBuilder::new() - .http1_only() - .resolve(hostname, addr); - if let Some(host) = override_host { - let mut headers = http::header::HeaderMap::new(); - headers.insert(http::header::HOST, host.try_into().unwrap()); - v1_builder = v1_builder.default_headers(headers); - } - let v1_client = v1_builder.build().unwrap(); - test_request(&v1_client, &format!("{}:{}", hostname, addr.port())) - .await - } - - async fn test_v1_ip( - addr: SocketAddr, - override_host: Option<&str>, - ) -> AuthorityResponse { - let mut v1_builder = reqwest::ClientBuilder::new().http1_only(); - if let Some(host) = override_host { - let mut headers = http::header::HeaderMap::new(); - headers.append(http::header::HOST, host.try_into().unwrap()); - v1_builder = v1_builder.default_headers(headers); - } - let v1_client = v1_builder.build().unwrap(); - test_request(&v1_client, &addr.to_string()).await - } - - async fn test_request( - client: &reqwest::Client, - connect_host: &str, - ) -> AuthorityResponse { - let url = format!("http://{}/server_name", connect_host); - - let result = client - .get(&url) - .send() - .await - .unwrap_or_else(|e| panic!("GET {:?}: {:#}", url, e)); - let status = result.status(); - println!("status: {:?}", status); - if status != http::StatusCode::OK { - panic!("GET {:?}: unexpected status: {:?}", url, status); - } - - let body: Result = - result.json().await.unwrap_or_else(|e| { - panic!("GET {:?}: parse json: {:#}", url, e); - }); - println!("body: {:?}", body); - body.unwrap() - } - - // HTTP 2: regular hostname (with port) - let authority = test_v2_host("foo.example.com", local_addr).await; - assert_eq!(authority.host, "foo.example.com"); - assert_eq!(authority.port, Some(port)); - - // HTTP 2: IP address (with port) - let authority = test_v2_ip(local_addr).await; - assert_eq!(authority.host, local_addr.ip().to_string()); - assert_eq!(authority.port, Some(port)); - - // HTTP 1.1: regular hostname, no overridden "host" header. - let authority = test_v1_host("foo.example.com", local_addr, None).await; - assert_eq!(authority.host, "foo.example.com"); - assert_eq!(authority.port, Some(port)); - - // HTTP 1.1: regular hostname, override "host" header with port. - let authority = test_v1_host( - "foo.example.com", - local_addr, - Some("foo.example.com:123"), - ) - .await; - assert_eq!(authority.host, "foo.example.com"); - assert_eq!(authority.port, Some(123)); - - // HTTP 1.1: regular hostname, override "host" header with no port. - let authority = test_v1_host( - "foo.example.com", - local_addr, - Some("foo.example.com"), - ) - .await; - assert_eq!(authority.host, "foo.example.com"); - assert_eq!(authority.port, None); - - // HTTP 1.1: IP address, no overridden "host" header. 
- let authority = test_v1_ip(local_addr, None).await; - assert_eq!(authority.host, local_addr.ip().to_string()); - assert_eq!(authority.port, Some(port)); - - // HTTP 1.1: IP address, override "host" header with port. - let authority = - test_v1_ip(local_addr, Some("foo.example.com:123")).await; - assert_eq!(authority.host, "foo.example.com"); - assert_eq!(authority.port, Some(123)); - - // HTTP 1.1: IP address, override "host" header with no port. - let authority = test_v1_ip(local_addr, Some("foo.example.com")).await; - assert_eq!(authority.host, "foo.example.com"); - assert_eq!(authority.port, None); - - server.close().await.expect("failed to shut down dropshot server"); - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_no_endpoint() { - let logctx = - omicron_test_utils::dev::test_setup_log("test_no_endpoint"); - let log = &logctx.log; - - // We'll test two configurations at the same time: one where there's no - // configuration at all, and one where there's a configuration but no - // default endpoint. These should always produce errors, no matter what - // endpoint we're looking up. - let ee = ExternalEndpoints::new(vec![], vec![], vec![]); - let (_, none_rx) = - tokio::sync::watch::channel::>(None); - let (_, empty_rx) = - tokio::sync::watch::channel::>(Some(ee)); - - for name in [ - "dummy", - "dummy.example", - "dummy.example:123", - "10.1.2.3:456", - "[fe80::1]:789", - ] { - let authority = Authority::from_static(name); - for (rx_label, rx_channel) in - [("empty", &empty_rx), ("none", &none_rx)] - { - println!("config {:?} endpoint {:?}", rx_label, name); - let result = - endpoint_for_authority(&log, &authority, rx_channel); - match result { - Err(Error::ServiceUnavailable { internal_message }) => { - assert_eq!(rx_label, "none"); - assert_eq!(internal_message, "endpoints not loaded"); - } - Err(Error::InvalidRequest { message }) => { - assert_eq!(rx_label, "empty"); - assert_eq!( - message.external_message(), - format!( - "HTTP request for unknown server name {:?}", - authority.host() - ) - ); - } - result => { - panic!( - "unexpected result looking up endpoint for \ - {:?} with config {:?}: {:?}", - name, rx_label, result - ); - } - } - } - } - - logctx.cleanup_successful(); - } -} diff --git a/nexus/reconfigurator/execution/Cargo.toml b/nexus/reconfigurator/execution/Cargo.toml index 07b97457d7..62155d9783 100644 --- a/nexus/reconfigurator/execution/Cargo.toml +++ b/nexus/reconfigurator/execution/Cargo.toml @@ -15,7 +15,6 @@ internal-dns.workspace = true nexus-config.workspace = true nexus-db-model.workspace = true nexus-db-queries.workspace = true -nexus-external-endpoints.workspace = true nexus-types.workspace = true omicron-common.workspace = true reqwest.workspace = true diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index fcf9b1961c..dd23822502 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -13,7 +13,6 @@ use nexus_db_model::Silo; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder; use nexus_db_queries::db::DataStore; -use nexus_external_endpoints::ExternalEndpoints; use nexus_types::deployment::Blueprint; use nexus_types::deployment::OmicronZoneType; use nexus_types::internal_api::params::DnsConfigParams; @@ -63,11 +62,9 @@ pub(crate) async fn deploy_dns( // Next, construct the DNS config represented by the blueprint. 
     let internal_dns_config_blueprint =
         blueprint_internal_dns_config(blueprint, sleds_by_id);
-    let external_endpoints = read_all_endpoints(datastore, opctx)
-        .await
-        .internal_context("reading external endpoints to deploy DNS")?;
+    let silos = todo!(); // XXX-dap
     let external_dns_config_blueprint =
-        blueprint_external_dns_config(blueprint, &external_endpoints);
+        blueprint_external_dns_config(blueprint, silos);
 
     // Deploy the changes.
     deploy_dns_one(
@@ -305,15 +302,8 @@ pub fn blueprint_internal_dns_config(
 
 pub fn blueprint_external_dns_config(
     blueprint: &Blueprint,
-    endpoints: &ExternalEndpoints,
+    silos: Vec<Silo>,
 ) -> DnsConfigParams {
-    let dns_params = DnsConfigParams {
-        generation: blueprint.external_dns_version.next(),
-        time_created: Utc::now(),
-        zones: vec![DnsConfigZone {
-
-        }]
-    };
     todo!(); // XXX-dap
 }
 
diff --git a/nexus/src/app/background/external_endpoints.rs b/nexus/src/app/background/external_endpoints.rs
index 1bad865e2a..ed530e0775 100644
--- a/nexus/src/app/background/external_endpoints.rs
+++ b/nexus/src/app/background/external_endpoints.rs
@@ -7,12 +7,12 @@
 //! associated with those names
 
 use super::common::BackgroundTask;
+use crate::app::external_endpoints::read_all_endpoints;
+pub use crate::app::external_endpoints::ExternalEndpoints;
 use futures::future::BoxFuture;
 use futures::FutureExt;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
-use nexus_external_endpoints::read_all_endpoints;
-pub use nexus_external_endpoints::ExternalEndpoints;
 use serde_json::json;
 use std::sync::Arc;
 use tokio::sync::watch;
diff --git a/nexus/src/app/external_endpoints.rs b/nexus/src/app/external_endpoints.rs
index 66d31e2ead..bcfec667ce 100644
--- a/nexus/src/app/external_endpoints.rs
+++ b/nexus/src/app/external_endpoints.rs
@@ -4,16 +4,652 @@
 
 //! Management of external HTTPS endpoints
 //!
-//! The guts of this subsystem are in the separate `nexus-external-endpoints`
-//! crate.
+//! Whenever a client connects to one of our external endpoints and attempts to
+//! establish a TLS session, we must provide a TLS certificate to authenticate
+//! ourselves to the client. But each Silo has a separate external DNS name and
+//! may have its own TLS certificate for that DNS name. These all resolve to
+//! the same set of IPs, so we cannot tell from the IP address alone which
+//! Silo's endpoint the client is trying to reach nor which certificate to
+//! present. TLS provides a mechanism called Server Name Indication (SNI) for
+//! clients to specify the name of the server they're trying to reach _before_
+//! the TLS session is established. We use this to determine which Silo
+//! endpoint the client is trying to reach and so which TLS certificate to
+//! present.
+//!
+//! To achieve this, we first need to know what DNS names, Silos, and TLS
+//! certificates are available at any given time. This is summarized in
+//! [`ExternalEndpoints`]. A background task is responsible for maintaining
+//! this, providing the latest version to whoever needs it via a `watch`
+//! channel. How do we tell the TLS stack what certificate to use? When
+//! setting up the Dropshot server in the first place, we provide a
+//! [`rustls::ServerConfig`] that describes various TLS settings, including a
+//! "certificate resolver" object that impls
+//! [`rustls::server::ResolvesServerCert`]. See [`NexusCertResolver`].
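+//!
+//! As a rough sketch (illustrative only, not part of this change), wiring the
+//! resolver into rustls looks something like the following, assuming a
+//! `watch` channel of `Option<ExternalEndpoints>` maintained by the
+//! background task and a rustls version with the builder API shown here:
+//!
+//! ```ignore
+//! use std::sync::Arc;
+//!
+//! let resolver = NexusCertResolver::new(log.clone(), config_rx);
+//! let tls_config = rustls::ServerConfig::builder()
+//!     .with_no_client_auth()
+//!     .with_cert_resolver(Arc::new(resolver));
+//! // `tls_config` is then handed to Dropshot when starting the external
+//! // server; rustls consults the resolver once per TLS handshake.
+//! ```
+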
+use super::silo::silo_dns_name;
 use crate::ServerContext;
-use nexus_external_endpoints::authority_for_request;
-use nexus_external_endpoints::endpoint_for_authority;
-use nexus_external_endpoints::ExternalEndpoint;
-pub use nexus_external_endpoints::NexusCertResolver;
+use anyhow::anyhow;
+use anyhow::bail;
+use anyhow::Context;
+use nexus_db_model::AuthenticationMode;
+use nexus_db_model::Certificate;
+use nexus_db_model::DnsGroup;
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::datastore::Discoverability;
+use nexus_db_queries::db::fixed_data::silo::SILO_ID;
+use nexus_db_queries::db::model::ServiceKind;
+use nexus_db_queries::db::DataStore;
+use nexus_types::identity::Resource;
+use omicron_common::api::external::http_pagination::PaginatedBy;
+use omicron_common::api::external::DataPageParams;
 use omicron_common::api::external::Error;
+use omicron_common::bail_unless;
+use openssl::pkey::PKey;
+use openssl::x509::X509;
+use rustls::sign::CertifiedKey;
+use serde::Serialize;
+use serde_with::SerializeDisplay;
+use std::collections::btree_map::Entry;
+use std::collections::BTreeMap;
+use std::fmt;
+use std::num::NonZeroU32;
 use std::sync::Arc;
+use thiserror::Error;
+use tokio::sync::watch;
+use uuid::Uuid;
+
+/// Describes the set of external endpoints, organized by DNS name
+///
+/// This data structure provides a quick way to determine which Silo and TLS
+/// certificate(s) make sense for an incoming request, based on the TLS
+/// session's SNI (DNS name). See module-level docs for details.
+///
+/// This object provides no interfaces outside this module. It's only used by
+/// the `NexusCertResolver` that's also in this module.
+///
+/// This structure impls `Serialize` only so that background tasks can easily
+/// present the latest configuration that they've found (e.g., via a debug API)
+#[derive(Clone, Debug, Eq, PartialEq, Serialize)]
+pub struct ExternalEndpoints {
+    by_dns_name: BTreeMap<String, Arc<ExternalEndpoint>>,
+    warnings: Vec<ExternalEndpointError>,
+    default_endpoint: Option<Arc<ExternalEndpoint>>,
+}
+
+impl ExternalEndpoints {
+    /// Assemble a list of Silos, TLS certificates, and external DNS zones into
+    /// a structure that we can use for quickly figuring out which Silo and TLS
+    /// certificates are associated with each incoming DNS name
+    fn new(
+        silos: Vec<nexus_db_model::Silo>,
+        certs: Vec<Certificate>,
+        external_dns_zones: Vec<nexus_db_model::DnsZone>,
+    ) -> ExternalEndpoints {
+        // We want to avoid failing this operation even if we encounter problems
+        // because we want to serve as many DNS certificates as we can (so that
+        // an operator has a chance of fixing any problems that do exist).
+        // Instead of returning any errors, keep track of any issues as
+        // warnings.
+        let mut warnings = vec![];
+
+        // Compute a mapping from external DNS name to Silo id. Detect any
+        // duplicates and leave them out (but report them). There should not
+        // be any duplicates since the DNS names are constructed from the
+        // (unique) Silo names. Even if we support aliases in the future, they
+        // will presumably need to be unique, too.
+        let silos_by_id: BTreeMap<Uuid, Arc<nexus_db_model::Silo>> = silos
+            .into_iter()
+            .map(|db_silo| (db_silo.id(), Arc::new(db_silo)))
+            .collect();
+        let mut dns_names: BTreeMap<String, Uuid> = BTreeMap::new();
+        for z in external_dns_zones {
+            for (_, db_silo) in &silos_by_id {
+                let dns_name = format!(
+                    "{}.{}",
+                    silo_dns_name(db_silo.name()),
+                    z.zone_name
+                );
+                match dns_names.entry(dns_name.clone()) {
+                    Entry::Vacant(vac) => {
+                        vac.insert(db_silo.id());
+                    }
+                    Entry::Occupied(occ) => {
+                        let first_silo_id = *occ.get();
+                        let first_silo_name = silos_by_id
+                            .get(&first_silo_id)
+                            .unwrap()
+                            .name()
+                            .to_string();
+                        warnings.push(ExternalEndpointError::DupDnsName {
+                            dup_silo_id: db_silo.id(),
+                            dup_silo_name: db_silo.name().to_string(),
+                            first_silo_id,
+                            first_silo_name,
+                            dns_name,
+                        })
+                    }
+                };
+            }
+        }
+
+        // Compute a mapping from silo id to a list of usable TLS certificates
+        // for the Silo. By "usable" here, we just mean that we are capable of
+        // providing it to the client. This basically means that we can parse
+        // it. A certificate might be invalid for some other reason (e.g., does
+        // not match the right DNS name or it's expired). We may later choose
+        // to prefer some certificates over others, but that'll be decided later
+        // (see best_certificate()). And in the end it'll be better to provide
+        // an expired certificate than none at all.
+        let parsed_certificates = certs.into_iter().map(|db_cert| {
+            let silo_id = db_cert.silo_id;
+            let tls_cert = TlsCertificate::try_from(db_cert).map_err(|e| {
+                ExternalEndpointError::BadCert { silo_id, reason: Arc::new(e) }
+            })?;
+            let db_silo = silos_by_id
+                .get(&silo_id)
+                .ok_or_else(|| ExternalEndpointError::BadCert {
+                    silo_id,
+                    reason: Arc::new(anyhow!("silo not found")),
+                })?
+                .clone();
+            Ok((silo_id, db_silo, tls_cert))
+        });
+
+        let mut certs_by_silo_id = BTreeMap::new();
+        for parsed_cert in parsed_certificates {
+            match parsed_cert {
+                Err(error) => {
+                    warnings.push(error);
+                }
+                Ok((silo_id, db_silo, tls_cert)) => {
+                    let silo_entry = certs_by_silo_id
+                        .entry(silo_id)
+                        .or_insert_with(|| ExternalEndpoint {
+                            silo_id,
+                            db_silo,
+                            tls_certs: Vec::new(),
+                        });
+                    silo_entry.tls_certs.push(tls_cert)
+                }
+            };
+        }
+
+        let certs_by_silo_id: BTreeMap<_, _> = certs_by_silo_id
+            .into_iter()
+            .map(|(k, v)| (k, Arc::new(v)))
+            .collect();
+
+        let by_dns_name: BTreeMap<_, _> = dns_names
+            .into_iter()
+            .map(|(dns_name, silo_id)| {
+                let silo_info = certs_by_silo_id
+                    .get(&silo_id)
+                    .cloned()
+                    .unwrap_or_else(|| {
+                        // For something to appear in `dns_names`, we must have
+                        // found it in `silos`, and so it must be in
+                        // `silos_by_id`.
+                        let db_silo =
+                            silos_by_id.get(&silo_id).unwrap().clone();
+                        Arc::new(ExternalEndpoint {
+                            silo_id,
+                            db_silo,
+                            tls_certs: vec![],
+                        })
+                    });
+
+                if silo_info.tls_certs.is_empty() {
+                    warnings.push(ExternalEndpointError::NoSiloCerts {
+                        silo_id,
+                        dns_name: dns_name.clone(),
+                    })
+                }
+
+                (dns_name, silo_info)
+            })
+            .collect();
+
+        if by_dns_name.is_empty() {
+            warnings.push(ExternalEndpointError::NoEndpoints);
+        }
+
+        // Pick a default endpoint. This will be used if a request arrives
+        // without specifying an endpoint via the HTTP/1.1 Host header or the
+        // HTTP2 URL. This is only intended for development, where external DNS
+        // may not be set up.
+        //
+        // We somewhat arbitrarily choose the first Silo we find that's not JIT.
+        // This would usually be the recovery Silo.
+        let default_endpoint = silos_by_id
+            .values()
+            .filter(|s| {
+                // Ignore the built-in Silo, which people are not supposed to
+                // log into.
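+                // (`SILO_ID` here is the id of that built-in Silo, imported
+                // from `nexus_db_queries::db::fixed_data::silo`, so this
+                // filter keeps every Silo except the built-in one.)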
+                s.id() != *SILO_ID
+            })
+            .find(|s| s.authentication_mode == AuthenticationMode::Local)
+            .and_then(|s| {
+                by_dns_name
+                    .iter()
+                    .find(|(_, endpoint)| endpoint.silo_id == s.id())
+                    .map(|(_, endpoint)| endpoint.clone())
+            });
+
+        ExternalEndpoints { by_dns_name, warnings, default_endpoint }
+    }
+
+    #[cfg(test)]
+    pub fn has_domain(&self, dns_name: &str) -> bool {
+        self.by_dns_name.contains_key(dns_name)
+    }
+
+    #[cfg(test)]
+    pub fn ndomains(&self) -> usize {
+        self.by_dns_name.len()
+    }
+
+    #[cfg(test)]
+    pub fn nwarnings(&self) -> usize {
+        self.warnings.len()
+    }
+}
+
+/// Describes a single external "endpoint", by which we mean an external DNS
+/// name that's associated with a particular Silo
+#[derive(Debug, PartialEq, Eq, Serialize)]
+pub struct ExternalEndpoint {
+    /// the id of the Silo associated with this endpoint
+    // This is redundant with `db_silo`, but it's convenient to put it here and
+    // it shows up in the serialized form this way.
+    silo_id: Uuid,
+    /// the silo associated with this endpoint
+    #[serde(skip)]
+    db_silo: Arc<nexus_db_model::Silo>,
+    /// the set of TLS certificate chains that could be appropriate for this
+    /// endpoint
+    tls_certs: Vec<TlsCertificate>,
+}
+
+impl ExternalEndpoint {
+    pub fn silo(&self) -> &nexus_db_model::Silo {
+        &self.db_silo
+    }
+
+    /// Chooses a TLS certificate (chain) to use when handling connections to
+    /// this endpoint
+    fn best_certificate(&self) -> Result<&TlsCertificate, anyhow::Error> {
+        // We expect the most common case to be that there's only one
+        // certificate chain here. The next most common case is that there are
+        // two because the administrator is in the process of rotating
+        // certificates, usually due to upcoming expiration. In principle, it
+        // would be useful to allow operators to control which certificate chain
+        // gets used, and maybe even do something like a canary to mitigate the
+        // risk of a botched certificate update. Absent that, we're going to do
+        // our best to pick the best chain automatically.
+        //
+        // This could be a lot more sophisticated than it is. We could try to
+        // avoid using certificates that are clearly not valid based on the
+        // "not_after" and "not_before" bounds. We could check each certificate
+        // in the chain, not just the last one. We could use a margin of error
+        // when doing this to account for small variations in the wall clock
+        // between us and the client. We could try to avoid using a certificate
+        // that doesn't appear to be compatible with the SNI value (DNS domain)
+        // that this request came in on.
+        //
+        // IMPORTANT: If we ever decide to do those things, they should only be
+        // used to decide which of several certificates is preferred. We should
+        // always pick a certificate if we possibly can, even if it seems to be
+        // invalid. A client can always choose not to trust it. But in the
+        // unfortunate case where there are no good certificates, a customer's
+        // only option may be to instruct their client to trust an invalid
+        // certificate _so that they can log in and fix the certificate
+        // problem_. If we provide no certificate at all here, a customer may
+        // have no way to fix the problem.
+        //
+        // Anyway, we don't yet do any of these things. For now, pick the
+        // certificate chain whose leaf certificate has the latest expiration
+        // time.
+
+        // This would be cleaner if Asn1Time impl'd Ord or even just a way to
+        // convert it to a Unix timestamp or any other comparable timestamp.
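+        //
+        // Worked example (illustrative): given two chains whose leaf
+        // certificates expire 2024-01-01 and 2026-01-01, the loop below first
+        // records the 2024 chain, then replaces it, because the 2026 leaf's
+        // not_after() does not compare `Less` against the 2024 leaf's. If a
+        // comparison ever returns `None` (incomparable), we still keep one of
+        // the two rather than dropping both.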
+        let mut latest_expiration: Option<&TlsCertificate> = None;
+        for t in &self.tls_certs {
+            // We'll choose this certificate (so far) if we find that it's
+            // anything other than "earlier" than the best we've seen so far.
+            // That includes the case where we haven't seen any so far, where
+            // this one is greater than or equal to the best so far, as well as
+            // the case where they're incomparable for whatever reason. (This
+            // ensures that we always pick at least one.)
+            if latest_expiration.is_none()
+                || !matches!(
+                    t.parsed.not_after().partial_cmp(
+                        latest_expiration.unwrap().parsed.not_after()
+                    ),
+                    Some(std::cmp::Ordering::Less)
+                )
+            {
+                latest_expiration = Some(t);
+            }
+        }
+
+        latest_expiration.ok_or_else(|| {
+            anyhow!("silo {} has no usable certificates", self.silo_id)
+        })
+    }
+}
+
+/// Describes a problem encountered while assembling an [`ExternalEndpoints`]
+/// object
+#[derive(Clone, Debug, Error, SerializeDisplay)]
+enum ExternalEndpointError {
+    #[error(
+        "ignoring silo {dup_silo_id} ({dup_silo_name:?}): has the same DNS \
+        name ({dns_name:?}) as previously-found silo {first_silo_id} \
+        ({first_silo_name:?})"
+    )]
+    DupDnsName {
+        dup_silo_id: Uuid,
+        dup_silo_name: String,
+        first_silo_id: Uuid,
+        first_silo_name: String,
+        dns_name: String,
+    },
+
+    #[error("ignoring certificate for silo {silo_id}: {reason:#}")]
+    BadCert {
+        silo_id: Uuid,
+        #[source]
+        reason: Arc<anyhow::Error>,
+    },
+
+    #[error(
+        "silo {silo_id} with DNS name {dns_name:?} has no usable certificates"
+    )]
+    NoSiloCerts { silo_id: Uuid, dns_name: String },
+
+    #[error("no external endpoints were found")]
+    NoEndpoints,
+}
+
+impl Eq for ExternalEndpointError {}
+impl PartialEq for ExternalEndpointError {
+    fn eq(&self, other: &Self) -> bool {
+        self.to_string() == other.to_string()
+    }
+}
+
+/// A parsed, validated TLS certificate ready to use with an external TLS server
+#[derive(Serialize)]
+#[serde(transparent)]
+struct TlsCertificate {
+    /// This is what we need to provide to the TLS stack when we decide to use
+    /// this certificate for an incoming TLS connection
+    // NOTE: It's important that we do not serialize the private key!
+    #[serde(skip)]
+    certified_key: Arc<CertifiedKey>,
+
+    /// Parsed representation of the whole certificate chain
+    ///
+    /// This is used to extract metadata like the expiration time.
+    // NOTE: It's important that we do not serialize the private key!
+    #[serde(skip)]
+    parsed: X509,
+
+    /// certificate digest (historically sometimes called a "fingerprint")
+    // This is the only field that appears in the serialized output or debug
+    // output.
+    digest: String,
+}
+
+impl fmt::Debug for TlsCertificate {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // It's important that only the digest appear in the debug output. We
+        // definitely don't want to leak the private key this way. Really,
+        // we don't want even the public parts adding noise to debug output.
+        f.debug_struct("TlsCertificate").field("digest", &self.digest).finish()
+    }
+}
+
+impl Eq for TlsCertificate {}
+impl PartialEq for TlsCertificate {
+    fn eq(&self, other: &Self) -> bool {
+        self.digest == other.digest
+    }
+}
+
+impl TryFrom<Certificate> for TlsCertificate {
+    type Error = anyhow::Error;
+
+    fn try_from(db_cert: Certificate) -> Result<TlsCertificate, anyhow::Error> {
+        // Parse and validate what we've got.
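+        // Note: stack_from_pem() preserves file order, and the code below
+        // treats the first certificate in the stack as the end-entity (leaf)
+        // certificate, checking it against the stored private key. Any
+        // remaining certificates are assumed to be intermediates.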
+        let certs_pem = openssl::x509::X509::stack_from_pem(&db_cert.cert)
+            .context("parsing PEM stack")?;
+        let private_key = PKey::private_key_from_pem(&db_cert.key)
+            .context("parsing private key PEM")?;
+
+        // Assemble a rustls CertifiedKey with both the certificate and the key.
+        let certified_key = {
+            let mut cursor = std::io::Cursor::new(db_cert.key.clone());
+            let rustls_private_key = rustls_pemfile::private_key(&mut cursor)
+                .expect("parsing private key PEM")
+                .expect("no private keys found");
+            let rustls_signing_key =
+                rustls::crypto::ring::sign::any_supported_type(
+                    &rustls_private_key,
+                )
+                .context("parsing DER private key")?;
+            let rustls_certs = certs_pem
+                .iter()
+                .map(|x509| {
+                    x509.to_der()
+                        .context("serializing cert to DER")
+                        .map(rustls::pki_types::CertificateDer::from)
+                })
+                .collect::<Result<Vec<_>, _>>()?;
+            Arc::new(CertifiedKey::new(rustls_certs, rustls_signing_key))
+        };
+
+        let end_cert = certs_pem
+            .into_iter()
+            .next()
+            .ok_or_else(|| anyhow!("no certificates in PEM stack"))?;
+        anyhow::ensure!(
+            end_cert
+                .public_key()
+                .context("certificate publickey")?
+                .public_eq(&private_key),
+            "certificate public key does not match stored private key"
+        );
+
+        // Compute a digest (fingerprint) that we can use for debugging.
+        let digest = {
+            let digest_bytes = end_cert
+                .digest(openssl::hash::MessageDigest::sha256())
+                .context("computing fingerprint")?;
+            hex::encode(&digest_bytes)
+        };
+
+        Ok(TlsCertificate { certified_key, digest, parsed: end_cert })
+    }
+}
+
+/// Read the lists of all Silos, external DNS zones, and external TLS
+/// certificates from the database and assemble an `ExternalEndpoints` structure
+/// that describes what DNS names exist, which Silos they correspond to, and
+/// what TLS certificates can be used for them
+// This structure is used to determine what TLS certificates are used for
+// incoming connections to the external console/API endpoints. As such, it's
+// critical that we produce a usable result if at all possible, even if it's
+// incomplete. Otherwise, we won't be able to serve _any_ incoming connections
+// to _any_ of our external endpoints! If data from the database is invalid or
+// inconsistent, that data is discarded and a warning is produced, but we'll
+// still return a usable object.
+pub(crate) async fn read_all_endpoints(
+    datastore: &DataStore,
+    opctx: &OpContext,
+) -> Result<ExternalEndpoints, Error> {
+    // We will not look for more than this number of external DNS zones, Silos,
+    // or certificates. We do not expect very many of any of these objects.
+    const MAX: u32 = 200;
+    let pagparams_id = DataPageParams {
+        marker: None,
+        limit: NonZeroU32::new(MAX).unwrap(),
+        direction: dropshot::PaginationOrder::Ascending,
+    };
+    let pagbyid = PaginatedBy::Id(pagparams_id);
+    let pagparams_name = DataPageParams {
+        marker: None,
+        limit: NonZeroU32::new(MAX).unwrap(),
+        direction: dropshot::PaginationOrder::Ascending,
+    };
+
+    let silos =
+        datastore.silos_list(opctx, &pagbyid, Discoverability::All).await?;
+    let external_dns_zones = datastore
+        .dns_zones_list(opctx, DnsGroup::External, &pagparams_name)
+        .await?;
+    bail_unless!(
+        !external_dns_zones.is_empty(),
+        "expected at least one external DNS zone"
+    );
+    let certs = datastore
+        .certificate_list_for(opctx, Some(ServiceKind::Nexus), &pagbyid, false)
+        .await?;
+
+    // If we found too many of any of these things, complain as loudly as we
+    // can. Our results will be wrong. But we still don't want to fail if we
+    // can avoid it because we want to be able to serve as many endpoints as we
+    // can.
+    // TODO-reliability we should prevent people from creating more than this
+    // maximum number of Silos and certificates.
+    let max = usize::try_from(MAX).unwrap();
+    if silos.len() >= max {
+        error!(
+            &opctx.log,
+            "reading endpoints: expected at most {} silos, but found at \
+            least {}. TLS may not work on some Silos' external endpoints.",
+            MAX,
+            silos.len(),
+        );
+    }
+    if external_dns_zones.len() >= max {
+        error!(
+            &opctx.log,
+            "reading endpoints: expected at most {} external DNS zones, but \
+            found at least {}. TLS may not work on some Silos' external \
+            endpoints.",
+            MAX,
+            external_dns_zones.len(),
+        );
+    }
+    if certs.len() >= max {
+        error!(
+            &opctx.log,
+            "reading endpoints: expected at most {} certificates, but \
+            found at least {}. TLS may not work on some Silos' external \
+            endpoints.",
+            MAX,
+            certs.len(),
+        );
+    }
+
+    Ok(ExternalEndpoints::new(silos, certs, external_dns_zones))
+}
+
+/// TLS SNI certificate resolver for use with rustls/Dropshot
+///
+/// This object exists to impl `rustls::server::ResolvesServerCert`. This
+/// object looks at an incoming TLS session's SNI field, matches it against the
+/// latest `ExternalEndpoints` configuration (available via a watch channel),
+/// and then determines which certificate (if any) to provide for the new
+/// session.
+///
+/// See the module-level comment for more details.
+#[derive(Debug)]
+pub struct NexusCertResolver {
+    log: slog::Logger,
+    config_rx: watch::Receiver<Option<ExternalEndpoints>>,
+}
+
+impl NexusCertResolver {
+    pub fn new(
+        log: slog::Logger,
+        config_rx: watch::Receiver<Option<ExternalEndpoints>>,
+    ) -> NexusCertResolver {
+        NexusCertResolver { log, config_rx }
+    }
+
+    fn do_resolve_endpoint(
+        &self,
+        server_name: Option<&str>,
+    ) -> Result<Arc<ExternalEndpoint>, anyhow::Error> {
+        let Some(server_name) = server_name else {
+            bail!("TLS session had no server name")
+        };
+
+        let config_ref = self.config_rx.borrow();
+        let config = match &*config_ref {
+            Some(c) => c,
+            None => bail!("no TLS config found"),
+        };
+
+        config
+            .by_dns_name
+            .get(server_name)
+            .ok_or_else(|| anyhow!("unrecognized server name: {}", server_name))
+            .cloned()
+    }
+
+    fn do_resolve(
+        &self,
+        server_name: Option<&str>,
+    ) -> Option<Arc<CertifiedKey>> {
+        let log =
+            self.log.new(o!("server_name" => server_name.map(String::from)));
+
+        trace!(&log, "resolving TLS certificate");
+        let resolved = self.do_resolve_endpoint(server_name);
+        let result = match resolved {
+            Ok(ref endpoint) => match endpoint.best_certificate() {
+                Ok(certificate) => Ok((endpoint.silo_id, certificate)),
+                Err(error) => Err(error),
+            },
+            Err(error) => Err(error),
+        };
+        match result {
+            Ok((silo_id, certificate)) => {
+                debug!(log, "resolved TLS certificate";
+                    "silo_id" => silo_id.to_string(),
+                    "certificate" => ?certificate
+                );
+                Some(certificate.certified_key.clone())
+            }
+            Err(error) => {
+                // TODO-security There is a (limited) DoS risk here, in that the
+                // client controls the request made to this endpoint and we're
+                // going to emit something to the log every time this happens.
+                // But at this stage it's pretty valuable to be able to debug
+                // this problem.
+ warn!( + log, + "failed to resolve TLS certificate"; + "error" => format!("{:#}", error), + ); + None + } + } + } +} + +impl rustls::server::ResolvesServerCert for NexusCertResolver { + fn resolve( + &self, + client_hello: rustls::server::ClientHello, + ) -> Option> { + let server_name = client_hello.server_name(); + self.do_resolve(server_name) + } +} impl super::Nexus { /// Attempts to determine which external endpoint the given request is @@ -51,3 +687,880 @@ impl super::Nexus { ) } } + +/// Returns the host and port of the server that the client is trying to +/// reach +/// +/// Recall that Nexus serves many external endpoints on the same set of IP +/// addresses, each corresponding to a particular Silo. We use the standard +/// HTTP 1.1 "host" header or HTTP2 URI authority to determine which +/// Silo's endpoint the client is trying to reach. +pub fn authority_for_request( + rqinfo: &dropshot::RequestInfo, +) -> Result { + if rqinfo.version() > hyper::Version::HTTP_11 { + // For HTTP2, the server name is specified in the URL's "authority". + rqinfo + .uri() + .authority() + .cloned() + .ok_or_else(|| String::from("request URL missing authority")) + } else { + // For earlier versions of HTTP, the server name is specified by the + // "Host" header. + rqinfo + .headers() + .get(http::header::HOST) + .ok_or_else(|| String::from("request missing \"host\" header"))? + .to_str() + .map_err(|e| format!("failed to decode \"host\" header: {:#}", e)) + .and_then(|hostport| { + hostport.parse().map_err(|e| { + format!("unsupported \"host\" header: {:#}", e) + }) + }) + } +} + +// See `Nexus::endpoint_for_request()` above. This is factored out to be able +// to test it without a whole server. +fn endpoint_for_authority( + log: &slog::Logger, + requested_authority: &http::uri::Authority, + config_rx: &tokio::sync::watch::Receiver>, +) -> Result, Error> { + let requested_host = requested_authority.host(); + let log = log.new(o!("server_name" => requested_host.to_string())); + trace!(&log, "determining endpoint"); + + // If we have not successfully loaded the endpoint configuration yet, + // there's nothing we can do here. We could try to do better (e.g., use + // the recovery Silo?). But if we failed to load endpoints, it's likely + // the database is down, and we're not going to get much further anyway. + let endpoint_config = config_rx.borrow(); + let endpoints = endpoint_config.as_ref().ok_or_else(|| { + error!(&log, "received request with no endpoints loaded"); + Error::unavail("endpoints not loaded") + })?; + + // See if there's an endpoint for the requested name. If so, use it. + if let Some(endpoint) = endpoints.by_dns_name.get(requested_host) { + trace!( + &log, + "received request for endpoint"; + "silo_name" => ?endpoint.db_silo.name(), + "silo_id" => ?endpoint.silo_id, + ); + + return Ok(endpoint.clone()); + } + + // There was no endpoint for the requested name. This should generally + // not happen in deployed systems where we expect people to have set up + // DNS to find the external endpoints. But in development, we don't + // always have DNS set up. People may use an IP address to get here. + // To accommodate this use case, we make a best-effort to pick a default + // endpoint when we can't find one for the name we were given. + // + // If this ever does happen in a production system, this might be + // confusing. The best thing to do in a production system is probably + // to return an error saying that the requested server name was unknown. 
+ // Instead, we'll wind up choosing some Silo here. This has no impact + // on authenticated requests because for those we use the authenticated + // identity's Silo. (That's as of this writing. Again, we may want to + // disallow this and produce an error instead.) If the request is not + // authenticated, we may wind up sending them to a login page for this + // Silo that may not be the Silo they meant. + endpoints + .default_endpoint + .as_ref() + .ok_or_else(|| { + error!( + &log, + "received request for unknown host and no default \ + endpoint is available", + ); + Error::invalid_request(&format!( + "HTTP request for unknown server name {:?}", + requested_host, + )) + }) + .map(|c| c.clone()) +} + +#[cfg(test)] +mod test { + use super::endpoint_for_authority; + use super::ExternalEndpoints; + use super::TlsCertificate; + use crate::app::external_endpoints::authority_for_request; + use crate::app::external_endpoints::ExternalEndpointError; + use crate::app::external_endpoints::NexusCertResolver; + use chrono::Utc; + use dropshot::endpoint; + use dropshot::test_util::LogContext; + use dropshot::ConfigLogging; + use dropshot::ConfigLoggingIfExists; + use dropshot::ConfigLoggingLevel; + use http::uri::Authority; + use nexus_db_model::Certificate; + use nexus_db_model::DnsGroup; + use nexus_db_model::DnsZone; + use nexus_db_model::ServiceKind; + use nexus_db_model::Silo; + use nexus_types::external_api::params; + use nexus_types::external_api::shared; + use nexus_types::identity::Resource; + use omicron_common::api::external::Error; + use omicron_common::api::external::IdentityMetadataCreateParams; + use schemars::JsonSchema; + use serde::Deserialize; + use serde::Serialize; + use std::net::SocketAddr; + use uuid::Uuid; + + fn create_silo(silo_id: Option, name: &str, saml: bool) -> Silo { + let identity_mode = if saml { + shared::SiloIdentityMode::SamlJit + } else { + shared::SiloIdentityMode::LocalOnly + }; + let params = params::SiloCreate { + identity: IdentityMetadataCreateParams { + name: name.parse().unwrap(), + description: String::new(), + }, + quotas: params::SiloQuotasCreate::empty(), + discoverable: false, + identity_mode, + admin_group_name: None, + tls_certificates: vec![], + mapped_fleet_roles: Default::default(), + }; + + if let Some(silo_id) = silo_id { + Silo::new_with_id(silo_id, params) + } else { + Silo::new(params) + } + .unwrap() + } + + fn create_certificate( + domain: &str, + expired: bool, + ) -> params::CertificateCreate { + let mut cert_params = + rcgen::CertificateParams::new(vec![domain.to_string()]); + if expired { + cert_params.not_after = std::time::UNIX_EPOCH.into(); + } + let cert = rcgen::Certificate::from_params(cert_params).unwrap(); + let cert_pem = + cert.serialize_pem().expect("serializing certificate as PEM"); + let key_pem = cert.serialize_private_key_pem(); + let namestr = format!("cert-for-{}", domain.replace('.', "-")); + params::CertificateCreate { + identity: IdentityMetadataCreateParams { + name: namestr.parse().unwrap(), + description: String::new(), + }, + cert: cert_pem, + key: key_pem, + service: shared::ServiceUsingCertificate::ExternalApi, + } + } + + fn create_dns_zone(domain: &str) -> DnsZone { + DnsZone { + id: Uuid::new_v4(), + time_created: Utc::now(), + dns_group: DnsGroup::External, + zone_name: format!("{}.test", domain), + } + } + + fn cert_matches(tls_cert: &TlsCertificate, cert: &Certificate) -> bool { + let parse_right = openssl::x509::X509::from_pem(&cert.cert).unwrap(); + tls_cert.parsed == parse_right + } + + #[test] + 
fn test_external_endpoints_empty() { + // Truly trivial case: no endpoints at all. + let ee1 = ExternalEndpoints::new(vec![], vec![], vec![]); + assert_eq!(ee1.ndomains(), 0); + assert_eq!(ee1.nwarnings(), 1); + assert_eq!( + ee1.warnings[0].to_string(), + "no external endpoints were found" + ); + assert!(ee1.default_endpoint.is_none()); + + // There are also no endpoints if there's a Silo but no external DNS + // zones. + let silo_id: Uuid = + "6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3".parse().unwrap(); + let silo = create_silo(Some(silo_id), "dummy", false); + let ee2 = ExternalEndpoints::new(vec![silo], vec![], vec![]); + assert_eq!(ee2.ndomains(), 0); + assert_eq!(ee2.nwarnings(), 1); + assert_eq!( + ee2.warnings[0].to_string(), + "no external endpoints were found" + ); + assert!(ee2.default_endpoint.is_none()); + // Test PartialEq impl. + assert_eq!(ee1, ee2); + + // There are also no endpoints if there's an external DNS zone but no + // Silo. + let dns_zone1 = create_dns_zone("oxide1"); + let ee2 = ExternalEndpoints::new(vec![], vec![], vec![dns_zone1]); + assert_eq!(ee2.ndomains(), 0); + assert_eq!(ee2.nwarnings(), 1); + assert_eq!( + ee2.warnings[0].to_string(), + "no external endpoints were found" + ); + assert!(ee2.default_endpoint.is_none()); + // Test PartialEq impl. + assert_eq!(ee1, ee2); + + // Finally, there are no endpoints if there's a certificate and nothing + // else. This isn't really valid. But it's useful to verify that we + // won't crash or otherwise fail if we get a certificate with an invalid + // silo_id. + let cert_create = create_certificate("dummy.sys.oxide1.test", false); + let cert = Certificate::new( + silo_id, + Uuid::new_v4(), + ServiceKind::Nexus, + cert_create, + &["dummy.sys.oxide1.test".to_string()], + ) + .unwrap(); + let ee2 = ExternalEndpoints::new(vec![], vec![cert], vec![]); + assert_eq!(ee2.ndomains(), 0); + assert_eq!(ee2.nwarnings(), 2); + assert!(ee2.warnings[0].to_string().contains("silo not found"),); + assert_eq!( + ee2.warnings[1].to_string(), + "no external endpoints were found" + ); + assert!(ee2.default_endpoint.is_none()); + } + + #[test] + fn test_external_endpoints_basic() { + // Empty case for comparison. + let ee1 = ExternalEndpoints::new(vec![], vec![], vec![]); + + // Sample data + let silo_id: Uuid = + "6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3".parse().unwrap(); + let silo = create_silo(Some(silo_id), "dummy", false); + let dns_zone1 = create_dns_zone("oxide1"); + let cert_create = create_certificate("dummy.sys.oxide1.test", false); + let cert = Certificate::new( + silo_id, + Uuid::new_v4(), + ServiceKind::Nexus, + cert_create, + &["dummy.sys.oxide1.test".to_string()], + ) + .unwrap(); + + // Simple case: one silo, one DNS zone. We should see an endpoint for + // the Silo. Since it has no certificates, we'll get a warning. + let ee3 = ExternalEndpoints::new( + vec![silo.clone()], + vec![], + vec![dns_zone1.clone()], + ); + // Test PartialEq impl. + assert_ne!(ee1, ee3); + assert_eq!(ee3.ndomains(), 1); + assert!(ee3.has_domain("dummy.sys.oxide1.test")); + assert_eq!(ee3.nwarnings(), 1); + assert_eq!( + ee3.warnings[0].to_string(), + "silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 with DNS name \ + \"dummy.sys.oxide1.test\" has no usable certificates" + ); + // This also exercises best_certificate() with zero certificates. 
+ assert_eq!( + ee3.by_dns_name["dummy.sys.oxide1.test"] + .best_certificate() + .unwrap_err() + .to_string(), + "silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 has no usable \ + certificates" + ); + assert_eq!(ee3.default_endpoint.as_ref().unwrap().silo_id, silo_id); + + // Now try with a certificate. + let ee4 = ExternalEndpoints::new( + vec![silo.clone()], + vec![cert.clone()], + vec![dns_zone1.clone()], + ); + assert_ne!(ee3, ee4); + assert_eq!(ee4.ndomains(), 1); + assert!(ee4.has_domain("dummy.sys.oxide1.test")); + assert_eq!(ee4.nwarnings(), 0); + let endpoint = &ee4.by_dns_name["dummy.sys.oxide1.test"]; + assert_eq!(endpoint.silo_id, silo_id); + assert_eq!(endpoint.tls_certs.len(), 1); + assert!(cert_matches(&endpoint.tls_certs[0], &cert)); + // This also exercises best_certificate() with one certificate. + assert_eq!( + *endpoint.best_certificate().unwrap(), + endpoint.tls_certs[0] + ); + assert_eq!(ee4.default_endpoint.as_ref().unwrap().silo_id, silo_id); + + // Add a second external DNS zone. There should now be two endpoints, + // both pointing to the same Silo. + let dns_zone2 = DnsZone { + id: Uuid::new_v4(), + time_created: Utc::now(), + dns_group: DnsGroup::External, + zone_name: String::from("oxide2.test"), + }; + let ee5 = ExternalEndpoints::new( + vec![silo.clone()], + vec![cert.clone()], + vec![dns_zone1.clone(), dns_zone2], + ); + assert_ne!(ee4, ee5); + assert_eq!(ee5.ndomains(), 2); + assert!(ee5.has_domain("dummy.sys.oxide1.test")); + assert!(ee5.has_domain("dummy.sys.oxide2.test")); + assert_eq!(ee5.nwarnings(), 0); + assert_eq!(ee5.default_endpoint.as_ref().unwrap().silo_id, silo_id); + let endpoint1 = &ee5.by_dns_name["dummy.sys.oxide1.test"]; + let endpoint2 = &ee5.by_dns_name["dummy.sys.oxide2.test"]; + assert_eq!(endpoint1, endpoint2); + assert_eq!(endpoint1.silo_id, silo_id); + assert_eq!(endpoint1.tls_certs.len(), 1); + assert_eq!(endpoint2.silo_id, silo_id); + assert_eq!(endpoint2.tls_certs.len(), 1); + + // Add a second Silo with the same name as the first one. This should + // not be possible in practice. In the future, we expect other features + // (e.g., DNS aliases) to make it possible for silos' DNS names to + // overlap like this. 
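+        // When that happens, ExternalEndpoints::new() keeps the endpoint for
+        // the silo it encountered first and records a DupDnsName warning for
+        // the other one; the assertions below verify exactly that.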
+        let silo2_same_name_id =
+            "e3f36f20-56c3-c545-8320-c19d98b82c1d".parse().unwrap();
+        let silo2_same_name =
+            create_silo(Some(silo2_same_name_id), "dummy", false);
+        let ee6 = ExternalEndpoints::new(
+            vec![silo, silo2_same_name],
+            vec![cert],
+            vec![dns_zone1],
+        );
+        assert_ne!(ee5, ee6);
+        assert_eq!(ee6.ndomains(), 1);
+        assert!(ee6.has_domain("dummy.sys.oxide1.test"));
+        assert_eq!(ee6.default_endpoint.as_ref().unwrap().silo_id, silo_id);
+        let endpoint = &ee6.by_dns_name["dummy.sys.oxide1.test"];
+        assert_eq!(endpoint.silo_id, silo_id);
+        assert_eq!(endpoint.tls_certs.len(), 1);
+        assert_eq!(ee6.nwarnings(), 1);
+        assert_eq!(
+            ee6.warnings[0].to_string(),
+            "ignoring silo e3f36f20-56c3-c545-8320-c19d98b82c1d (\"dummy\"): \
+            has the same DNS name (\"dummy.sys.oxide1.test\") as \
+            previously-found silo 6bcbd3bb-f93b-e8b3-d41c-dce6d98281d3 \
+            (\"dummy\")"
+        );
+    }
+
+    #[test]
+    fn test_external_endpoints_complex() {
+        // Set up a somewhat complex scenario:
+        //
+        // - four Silos
+        // - silo1: two certificates, one of which is expired
+        // - silo2: two certificates, one of which is expired
+        //   (in the other order to make sure it's not working by accident)
+        // - silo3: one certificate that is invalid
+        // - silo4: one certificate that is expired
+        // - two DNS zones
+        //
+        // We should wind up with eight endpoints and three warnings: one for
+        // silo3's corrupt certificate, plus one per DNS zone for silo3 being
+        // left without any usable certificates.
+        let silo1 = create_silo(None, "silo1", true);
+        let silo2 = create_silo(None, "silo2", true);
+        let silo3 = create_silo(None, "silo3", false);
+        let silo4 = create_silo(None, "silo4", true);
+        let silo1_cert1_params =
+            create_certificate("silo1.sys.oxide1.test", false);
+        let silo1_cert1 = Certificate::new(
+            silo1.identity().id,
+            Uuid::new_v4(),
+            ServiceKind::Nexus,
+            silo1_cert1_params,
+            &["silo1.sys.oxide1.test".to_string()],
+        )
+        .unwrap();
+        let silo1_cert2_params =
+            create_certificate("silo1.sys.oxide1.test", true);
+        let silo1_cert2 = Certificate::new_unvalidated(
+            silo1.identity().id,
+            Uuid::new_v4(),
+            ServiceKind::Nexus,
+            silo1_cert2_params,
+        );
+        let silo2_cert1_params =
+            create_certificate("silo2.sys.oxide1.test", true);
+        let silo2_cert1 = Certificate::new_unvalidated(
+            silo2.identity().id,
+            Uuid::new_v4(),
+            ServiceKind::Nexus,
+            silo2_cert1_params,
+        );
+        let silo2_cert2_params =
+            create_certificate("silo2.sys.oxide1.test", false);
+        let silo2_cert2 = Certificate::new(
+            silo2.identity().id,
+            Uuid::new_v4(),
+            ServiceKind::Nexus,
+            silo2_cert2_params,
+            &["silo2.sys.oxide1.test".to_string()],
+        )
+        .unwrap();
+        let silo3_cert_params =
+            create_certificate("silo3.sys.oxide1.test", false);
+        let mut silo3_cert = Certificate::new(
+            silo3.identity().id,
+            Uuid::new_v4(),
+            ServiceKind::Nexus,
+            silo3_cert_params,
+            &["silo3.sys.oxide1.test".to_string()],
+        )
+        .unwrap();
+        // Corrupt a byte of this last certificate.  (This has to be done after
+        // constructing it or we would fail validation.)
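+        // Flipping a single bit is enough: the blob no longer parses as a
+        // valid certificate, which is what should produce the BadCert
+        // warning asserted below.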
+ silo3_cert.cert[0] ^= 1; + let silo4_cert_params = + create_certificate("silo4.sys.oxide1.test", true); + let silo4_cert = Certificate::new_unvalidated( + silo4.identity().id, + Uuid::new_v4(), + ServiceKind::Nexus, + silo4_cert_params, + ); + let dns_zone1 = create_dns_zone("oxide1"); + let dns_zone2 = create_dns_zone("oxide2"); + + let ee = ExternalEndpoints::new( + vec![silo1.clone(), silo2.clone(), silo3.clone(), silo4.clone()], + vec![ + silo1_cert1.clone(), + silo1_cert2.clone(), + silo2_cert1, + silo2_cert2.clone(), + silo3_cert.clone(), + silo4_cert.clone(), + ], + vec![dns_zone1, dns_zone2], + ); + println!("{:?}", ee); + assert_eq!(ee.ndomains(), 8); + assert_eq!(ee.nwarnings(), 3); + assert_eq!( + 2, + ee.warnings + .iter() + .filter(|warning| matches!(warning, + ExternalEndpointError::NoSiloCerts { silo_id, .. } + if *silo_id == silo3.id() + )) + .count() + ); + assert_eq!( + 1, + ee.warnings + .iter() + .filter(|warning| matches!(warning, + ExternalEndpointError::BadCert { silo_id, .. } + if *silo_id == silo3.id() + )) + .count() + ); + + assert_eq!( + ee.by_dns_name["silo1.sys.oxide1.test"], + ee.by_dns_name["silo1.sys.oxide2.test"] + ); + assert_eq!( + ee.by_dns_name["silo2.sys.oxide1.test"], + ee.by_dns_name["silo2.sys.oxide2.test"] + ); + assert_eq!( + ee.by_dns_name["silo3.sys.oxide1.test"], + ee.by_dns_name["silo3.sys.oxide2.test"] + ); + assert_eq!( + ee.by_dns_name["silo4.sys.oxide1.test"], + ee.by_dns_name["silo4.sys.oxide2.test"] + ); + assert_eq!( + ee.default_endpoint.as_ref().unwrap().silo_id, + silo3.identity().id + ); + + let e1 = &ee.by_dns_name["silo1.sys.oxide1.test"]; + assert_eq!(e1.silo_id, silo1.id()); + let c1 = e1.best_certificate().unwrap(); + // It must be cert1 because cert2 is expired. + assert!(cert_matches(c1, &silo1_cert1)); + + let e2 = &ee.by_dns_name["silo2.sys.oxide1.test"]; + assert_eq!(e2.silo_id, silo2.id()); + let c2 = e2.best_certificate().unwrap(); + // It must be cert2 because cert1 is expired. + assert!(cert_matches(c2, &silo2_cert2)); + assert!(!cert_matches(c2, &silo1_cert1)); + assert!(!cert_matches(c2, &silo1_cert2)); + + let e3 = &ee.by_dns_name["silo3.sys.oxide1.test"]; + assert_eq!(e3.silo_id, silo3.id()); + assert!(e3.best_certificate().is_err()); + + // We should get an expired cert if it's the only option. + let e4 = &ee.by_dns_name["silo4.sys.oxide1.test"]; + assert_eq!(e4.silo_id, silo4.id()); + let c4 = e4.best_certificate().unwrap(); + assert!(cert_matches(c4, &silo4_cert)); + + // + // Test endpoint lookup by authority. + // + let logctx = LogContext::new( + "test_external_endpoints_complex", + &ConfigLogging::File { + level: ConfigLoggingLevel::Trace, + path: "UNUSED".into(), + if_exists: ConfigLoggingIfExists::Append, + }, + ); + let log = &logctx.log; + let (_, watch_rx) = tokio::sync::watch::channel(Some(ee.clone())); + + // Basic cases: look up a few Silos by name. + let authority = Authority::from_static("silo1.sys.oxide1.test"); + let ae1 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); + assert_eq!(ae1, *e1); + let authority = Authority::from_static("silo1.sys.oxide2.test"); + let ae1 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); + assert_eq!(ae1, *e1); + let authority = Authority::from_static("silo2.sys.oxide1.test"); + let ae2 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap(); + assert_eq!(ae2, *e2); + // The port number in the authority should be ignored. 
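+        // That is, "host:port" should resolve to exactly the same endpoint
+        // as "host" alone: only the authority's host is used for the lookup.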
+        let authority = Authority::from_static("silo3.sys.oxide1.test:456");
+        let ae3 = endpoint_for_authority(&log, &authority, &watch_rx).unwrap();
+        assert_eq!(ae3, *e3);
+        // We should get back a default endpoint if we use a server name that's
+        // not known.  That includes any IPv4 or IPv6 address, too.  The default
+        // endpoint should always be silo3 because it's the only one we've
+        // created LocalOnly.
+        for name in [
+            "springfield.sys.oxide1.test",
+            "springfield.sys.oxide1.test:123",
+            "10.1.2.3:456",
+            "[fe80::1]:789",
+        ] {
+            let authority = Authority::from_static(name);
+            let ae =
+                endpoint_for_authority(&log, &authority, &watch_rx).unwrap();
+            assert_eq!(ae, *e3);
+        }
+
+        //
+        // Now test the NexusCertResolver.
+        //
+        let (watch_tx, watch_rx) = tokio::sync::watch::channel(None);
+        let cert_resolver =
+            NexusCertResolver::new(logctx.log.clone(), watch_rx);
+
+        // At this point we haven't filled in the configuration so any attempt
+        // to resolve anything should fail.
+        assert!(cert_resolver
+            .do_resolve(Some("silo1.sys.oxide1.test"))
+            .is_none());
+
+        // Now pass along the configuration and try again.
+        watch_tx.send(Some(ee.clone())).unwrap();
+        let resolved_c1 =
+            cert_resolver.do_resolve(Some("silo1.sys.oxide1.test")).unwrap();
+        assert_eq!(resolved_c1.cert, c1.certified_key.cert);
+        let resolved_c2 =
+            cert_resolver.do_resolve(Some("silo2.sys.oxide1.test")).unwrap();
+        assert_eq!(resolved_c2.cert, c2.certified_key.cert);
+        assert!(cert_resolver
+            .do_resolve(Some("silo3.sys.oxide1.test"))
+            .is_none());
+        // We should get an expired cert if it's the only option.
+        let resolved_c4 =
+            cert_resolver.do_resolve(Some("silo4.sys.oxide1.test")).unwrap();
+        assert_eq!(resolved_c4.cert, c4.certified_key.cert);
+
+        logctx.cleanup_successful();
+    }
+
+    #[tokio::test]
+    async fn test_authority() {
+        // Tests for authority_for_request().  The function itself is pretty
+        // simple.  That makes it easy to test fairly exhaustively.  It's also
+        // useful to verify that we're doing what we think we're doing
+        // (identifying the name that the client thinks they're connecting to).
+
+        // First, set up a Dropshot server that just echoes back whatever
+        // authority_for_request() returns for a given request.
+        let logctx = omicron_test_utils::dev::test_setup_log("test_authority");
+        let mut api = dropshot::ApiDescription::new();
+        api.register(echo_server_name).unwrap();
+        let server = dropshot::HttpServerStarter::new(
+            &dropshot::ConfigDropshot::default(),
+            api,
+            (),
+            &logctx.log,
+        )
+        .expect("failed to create dropshot server")
+        .start();
+        let local_addr = server.local_addr();
+        let port = local_addr.port();
+
+        #[derive(Debug, PartialEq, Eq, JsonSchema, Serialize, Deserialize)]
+        struct AuthorityResponse {
+            host: String,
+            port: Option<u16>,
+        }
+
+        #[endpoint(method = GET, path = "/server_name")]
+        async fn echo_server_name(
+            rqctx: dropshot::RequestContext<()>,
+        ) -> Result<
+            dropshot::HttpResponseOk<Result<AuthorityResponse, String>>,
+            dropshot::HttpError,
+        > {
+            Ok(dropshot::HttpResponseOk(
+                authority_for_request(&rqctx.request).map(|authority| {
+                    AuthorityResponse {
+                        host: authority.host().to_string(),
+                        port: authority.port_u16(),
+                    }
+                }),
+            ))
+        }
+
+        // Generally, the "authority" for a request is determined by the URL
+        // provided to the client.  We can test basically two cases this way: an
+        // authority with a host and port and an authority with an IP address
+        // and port.  We can't test any cases that require the client to connect
+        // to a different host/port than what's in the URL.
+        // So we can't test the case of an authority with no port number in it
+        // (since our server doesn't run on port 80).
+        //
+        // With HTTP 1.1, you can generally override the authority by specifying
+        // your own "host" header.  That lets us exercise the case of an
+        // authority that has no port number, even though the client would be
+        // connecting to a URL with a port number in it.  It might also let us
+        // test other cases, like an authority with an invalid DNS name.
+        // However, it's not clear any of this is possible with HTTP 2 or later.
+
+        async fn test_v2_host(
+            hostname: &str,
+            addr: SocketAddr,
+        ) -> AuthorityResponse {
+            let v2_client = reqwest::ClientBuilder::new()
+                .http2_prior_knowledge()
+                .resolve(hostname, addr)
+                .build()
+                .unwrap();
+            test_request(&v2_client, &format!("{}:{}", hostname, addr.port()))
+                .await
+        }
+
+        async fn test_v2_ip(addr: SocketAddr) -> AuthorityResponse {
+            let v2_client = reqwest::ClientBuilder::new()
+                .http2_prior_knowledge()
+                .build()
+                .unwrap();
+            test_request(&v2_client, &addr.to_string()).await
+        }
+
+        async fn test_v1_host(
+            hostname: &str,
+            addr: SocketAddr,
+            override_host: Option<&str>,
+        ) -> AuthorityResponse {
+            let mut v1_builder = reqwest::ClientBuilder::new()
+                .http1_only()
+                .resolve(hostname, addr);
+            if let Some(host) = override_host {
+                let mut headers = http::header::HeaderMap::new();
+                headers.insert(http::header::HOST, host.try_into().unwrap());
+                v1_builder = v1_builder.default_headers(headers);
+            }
+            let v1_client = v1_builder.build().unwrap();
+            test_request(&v1_client, &format!("{}:{}", hostname, addr.port()))
+                .await
+        }
+
+        async fn test_v1_ip(
+            addr: SocketAddr,
+            override_host: Option<&str>,
+        ) -> AuthorityResponse {
+            let mut v1_builder = reqwest::ClientBuilder::new().http1_only();
+            if let Some(host) = override_host {
+                let mut headers = http::header::HeaderMap::new();
+                headers.append(http::header::HOST, host.try_into().unwrap());
+                v1_builder = v1_builder.default_headers(headers);
+            }
+            let v1_client = v1_builder.build().unwrap();
+            test_request(&v1_client, &addr.to_string()).await
+        }
+
+        async fn test_request(
+            client: &reqwest::Client,
+            connect_host: &str,
+        ) -> AuthorityResponse {
+            let url = format!("http://{}/server_name", connect_host);
+
+            let result = client
+                .get(&url)
+                .send()
+                .await
+                .unwrap_or_else(|e| panic!("GET {:?}: {:#}", url, e));
+            let status = result.status();
+            println!("status: {:?}", status);
+            if status != http::StatusCode::OK {
+                panic!("GET {:?}: unexpected status: {:?}", url, status);
+            }
+
+            let body: Result<AuthorityResponse, String> =
+                result.json().await.unwrap_or_else(|e| {
+                    panic!("GET {:?}: parse json: {:#}", url, e);
+                });
+            println!("body: {:?}", body);
+            body.unwrap()
+        }
+
+        // HTTP 2: regular hostname (with port)
+        let authority = test_v2_host("foo.example.com", local_addr).await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, Some(port));
+
+        // HTTP 2: IP address (with port)
+        let authority = test_v2_ip(local_addr).await;
+        assert_eq!(authority.host, local_addr.ip().to_string());
+        assert_eq!(authority.port, Some(port));
+
+        // HTTP 1.1: regular hostname, no overridden "host" header.
+        let authority = test_v1_host("foo.example.com", local_addr, None).await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, Some(port));
+
+        // HTTP 1.1: regular hostname, override "host" header with port.
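+        // (reqwest's default_headers() is what lets us smuggle in an
+        // arbitrary "host" header above; the server should report the
+        // overridden authority -- here, port 123 -- rather than the port the
+        // connection actually used.)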
+        let authority = test_v1_host(
+            "foo.example.com",
+            local_addr,
+            Some("foo.example.com:123"),
+        )
+        .await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, Some(123));
+
+        // HTTP 1.1: regular hostname, override "host" header with no port.
+        let authority = test_v1_host(
+            "foo.example.com",
+            local_addr,
+            Some("foo.example.com"),
+        )
+        .await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, None);
+
+        // HTTP 1.1: IP address, no overridden "host" header.
+        let authority = test_v1_ip(local_addr, None).await;
+        assert_eq!(authority.host, local_addr.ip().to_string());
+        assert_eq!(authority.port, Some(port));
+
+        // HTTP 1.1: IP address, override "host" header with port.
+        let authority =
+            test_v1_ip(local_addr, Some("foo.example.com:123")).await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, Some(123));
+
+        // HTTP 1.1: IP address, override "host" header with no port.
+        let authority = test_v1_ip(local_addr, Some("foo.example.com")).await;
+        assert_eq!(authority.host, "foo.example.com");
+        assert_eq!(authority.port, None);
+
+        server.close().await.expect("failed to shut down dropshot server");
+        logctx.cleanup_successful();
+    }
+
+    #[tokio::test]
+    async fn test_no_endpoint() {
+        let logctx =
+            omicron_test_utils::dev::test_setup_log("test_no_endpoint");
+        let log = &logctx.log;
+
+        // We'll test two configurations at the same time: one where there's no
+        // configuration at all, and one where there's a configuration but no
+        // default endpoint.  These should always produce errors, no matter what
+        // endpoint we're looking up.
+        let ee = ExternalEndpoints::new(vec![], vec![], vec![]);
+        let (_, none_rx) =
+            tokio::sync::watch::channel::<Option<ExternalEndpoints>>(None);
+        let (_, empty_rx) =
+            tokio::sync::watch::channel::<Option<ExternalEndpoints>>(Some(ee));
+
+        for name in [
+            "dummy",
+            "dummy.example",
+            "dummy.example:123",
+            "10.1.2.3:456",
+            "[fe80::1]:789",
+        ] {
+            let authority = Authority::from_static(name);
+            for (rx_label, rx_channel) in
+                [("empty", &empty_rx), ("none", &none_rx)]
+            {
+                println!("config {:?} endpoint {:?}", rx_label, name);
+                let result =
+                    endpoint_for_authority(&log, &authority, rx_channel);
+                match result {
+                    Err(Error::ServiceUnavailable { internal_message }) => {
+                        assert_eq!(rx_label, "none");
+                        assert_eq!(internal_message, "endpoints not loaded");
+                    }
+                    Err(Error::InvalidRequest { message }) => {
+                        assert_eq!(rx_label, "empty");
+                        assert_eq!(
+                            message.external_message(),
+                            format!(
+                                "HTTP request for unknown server name {:?}",
+                                authority.host()
+                            )
+                        );
+                    }
+                    result => {
+                        panic!(
+                            "unexpected result looking up endpoint for \
+                            {:?} with config {:?}: {:?}",
+                            name, rx_label, result
+                        );
+                    }
+                }
+            }
+        }
+
+        logctx.cleanup_successful();
+    }
+}
diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs
index 9096984a2f..a137f19434 100644
--- a/nexus/src/app/rack.rs
+++ b/nexus/src/app/rack.rs
@@ -4,6 +4,7 @@
 //!
Rack management +use super::silo::silo_dns_name; use crate::external_api::params; use crate::external_api::params::CertificateCreate; use crate::external_api::shared::ServiceUsingCertificate; @@ -19,7 +20,6 @@ use nexus_db_queries::db; use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder; use nexus_db_queries::db::datastore::RackInit; use nexus_db_queries::db::lookup::LookupPath; -use nexus_external_endpoints::silo_dns_name; use nexus_types::external_api::params::Address; use nexus_types::external_api::params::AddressConfig; use nexus_types::external_api::params::AddressLotBlockCreate; diff --git a/nexus/src/app/silo.rs b/nexus/src/app/silo.rs index 8a2558facb..8461be015a 100644 --- a/nexus/src/app/silo.rs +++ b/nexus/src/app/silo.rs @@ -16,7 +16,6 @@ use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::{self, lookup}; use nexus_db_queries::{authn, authz}; -use nexus_external_endpoints::silo_dns_name; use nexus_types::internal_api::params::DnsRecord; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::ListResultVec; @@ -887,3 +886,16 @@ impl super::Nexus { LookupPath::new(opctx, &self.db_datastore).silo_group_id(*group_id) } } + +/// Returns the (relative) DNS name for this Silo's API and console endpoints +/// _within_ the external DNS zone (i.e., without that zone's suffix) +/// +/// This specific naming scheme is determined under RFD 357. +pub(crate) fn silo_dns_name( + name: &omicron_common::api::external::Name, +) -> String { + // RFD 4 constrains resource names (including Silo names) to DNS-safe + // strings, which is why it's safe to directly put the name of the + // resource into the DNS name rather than doing any kind of escaping. 
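+    // For example, a Silo named "marketing" yields "marketing.sys" here; with
+    // an external DNS zone of (say) "oxide1.test", the full name served would
+    // be "marketing.sys.oxide1.test".  (The zone name is only an example.)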
+ format!("{}.sys", name) +} diff --git a/nexus/src/external_api/device_auth.rs b/nexus/src/external_api/device_auth.rs index 4f7b8d83b1..1697722f6f 100644 --- a/nexus/src/external_api/device_auth.rs +++ b/nexus/src/external_api/device_auth.rs @@ -11,6 +11,7 @@ use super::console_api::console_index_or_login_redirect; use super::views::DeviceAccessTokenGrant; +use crate::app::external_endpoints::authority_for_request; use crate::ServerContext; use dropshot::{ endpoint, HttpError, HttpResponseUpdatedNoContent, RequestContext, @@ -19,7 +20,6 @@ use dropshot::{ use http::{header, Response, StatusCode}; use hyper::Body; use nexus_db_queries::db::model::DeviceAccessToken; -use nexus_external_endpoints::authority_for_request; use omicron_common::api::external::InternalContext; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; From f2771fda847bfe14b413e42c539f27ecf35879dd Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 6 Mar 2024 18:07:05 -0800 Subject: [PATCH 05/34] finish implementation --- nexus/db-queries/src/db/datastore/silo.rs | 24 ++++++++ nexus/reconfigurator/execution/Cargo.toml | 2 +- nexus/reconfigurator/execution/src/dns.rs | 72 +++++++++++++++++++++-- 3 files changed, 91 insertions(+), 7 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/silo.rs b/nexus/db-queries/src/db/datastore/silo.rs index df10b1c072..a9f8812a90 100644 --- a/nexus/db-queries/src/db/datastore/silo.rs +++ b/nexus/db-queries/src/db/datastore/silo.rs @@ -6,6 +6,7 @@ use super::dns::DnsVersionUpdateBuilder; use super::DataStore; +use super::SQL_BATCH_SIZE; use crate::authz; use crate::context::OpContext; use crate::db; @@ -22,6 +23,7 @@ use crate::db::model::Name; use crate::db::model::Silo; use crate::db::model::VirtualProvisioningCollection; use crate::db::pagination::paginated; +use crate::db::pagination::Paginator; use crate::db::pool::DbConnection; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; @@ -351,6 +353,28 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// List all Silos, making as many queries as needed to get them all + /// + /// This should generally not be used in API handlers or other + /// latency-sensitive contexts, but it can make sense in saga actions or + /// background tasks. 
+    pub async fn silo_list_all_batched(
+        &self,
+        opctx: &OpContext,
+    ) -> ListResultVec<Silo> {
+        opctx.check_complex_operations_allowed()?;
+        let mut all_silos = Vec::new();
+        let mut paginator = Paginator::new(SQL_BATCH_SIZE);
+        while let Some(p) = paginator.next() {
+            let batch =
+                self.silos_list_by_id(opctx, &p.current_pagparams()).await?;
+            paginator =
+                p.found_batch(&batch, &|s: &nexus_db_model::Silo| s.id());
+            all_silos.extend(batch);
+        }
+        Ok(all_silos)
+    }
+
     pub async fn silo_delete(
         &self,
         opctx: &OpContext,
diff --git a/nexus/reconfigurator/execution/Cargo.toml b/nexus/reconfigurator/execution/Cargo.toml
index 62155d9783..72fed3044e 100644
--- a/nexus/reconfigurator/execution/Cargo.toml
+++ b/nexus/reconfigurator/execution/Cargo.toml
@@ -9,6 +9,7 @@ omicron-rpaths.workspace = true
 [dependencies]
 anyhow.workspace = true
 dns-service-client.workspace = true
+chrono.workspace = true
 futures.workspace = true
 illumos-utils.workspace = true
 internal-dns.workspace = true
@@ -31,7 +32,6 @@ pq-sys = "*"
 omicron-workspace-hack.workspace = true
 
 [dev-dependencies]
-chrono.workspace = true
 httptest.workspace = true
 ipnet.workspace = true
 nexus-reconfigurator-planning.workspace = true
diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index dd23822502..a4320bc926 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -15,7 +15,10 @@ use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder;
 use nexus_db_queries::db::DataStore;
 use nexus_types::deployment::Blueprint;
 use nexus_types::deployment::OmicronZoneType;
+use nexus_types::identity::Resource;
 use nexus_types::internal_api::params::DnsConfigParams;
+use nexus_types::internal_api::params::DnsConfigZone;
+use nexus_types::internal_api::params::DnsRecord;
 use omicron_common::address::get_switch_zone_address;
 use omicron_common::address::CLICKHOUSE_KEEPER_PORT;
 use omicron_common::address::CLICKHOUSE_PORT;
@@ -34,6 +37,8 @@ use omicron_common::api::external::Generation;
 use omicron_common::api::external::InternalContext;
 use slog::{debug, info, o};
 use std::collections::BTreeMap;
+use std::collections::HashMap;
+use std::net::IpAddr;
 use uuid::Uuid;
 
 pub(crate) async fn deploy_dns(
@@ -62,15 +67,29 @@ pub(crate) async fn deploy_dns(
     // Next, construct the DNS config represented by the blueprint.
     let internal_dns_config_blueprint =
         blueprint_internal_dns_config(blueprint, sleds_by_id);
-    let silos = todo!(); // XXX-dap
-    let external_dns_config_blueprint =
-        blueprint_external_dns_config(blueprint, silos);
+    let silos = datastore
+        .silo_list_all_batched(opctx)
+        .await
+        .internal_context("listing Silos (for configuring external DNS)")?;
+
+    let (nexus_external_ips, nexus_external_dns_zones) =
+        datastore.nexus_external_addresses(opctx).await?;
+    let nexus_external_dns_zone_names = nexus_external_dns_zones
+        .into_iter()
+        .map(|z| z.zone_name)
+        .collect::<Vec<_>>();
+    let external_dns_config_blueprint = blueprint_external_dns_config(
+        blueprint,
+        &silos,
+        &nexus_external_ips,
+        &nexus_external_dns_zone_names,
+    );
 
     // Deploy the changes.
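+    // Each deploy_dns_one() call below handles one DNS group: it diffs the
+    // current config against the blueprint's (see dns_compute_update()) and
+    // only writes a new generation when something actually changed.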
 deploy_dns_one(
         opctx,
         datastore,
-        creator,
+        creator.clone(),
         blueprint,
         &internal_dns_config_current,
         &internal_dns_config_blueprint,
@@ -302,9 +321,36 @@ pub fn blueprint_internal_dns_config(
 
 pub fn blueprint_external_dns_config(
     blueprint: &Blueprint,
-    silos: Vec<Silo>,
+    silos: &[Silo],
+    nexus_external_ips: &[IpAddr],
+    external_dns_zone_names: &[String],
 ) -> DnsConfigParams {
-    todo!(); // XXX-dap
+    let dns_records: Vec<DnsRecord> = nexus_external_ips
+        .into_iter()
+        .map(|addr| match addr {
+            IpAddr::V4(addr) => DnsRecord::A(*addr),
+            IpAddr::V6(addr) => DnsRecord::Aaaa(*addr),
+        })
+        .collect();
+
+    let records = silos
+        .into_iter()
+        .map(|silo| (silo_dns_name(&silo.name()), dns_records.clone()))
+        .collect::<HashMap<String, Vec<DnsRecord>>>();
+
+    let zones = external_dns_zone_names
+        .into_iter()
+        .map(|zone_name| DnsConfigZone {
+            zone_name: zone_name.to_owned(),
+            records: records.clone(),
+        })
+        .collect();
+
+    DnsConfigParams {
+        generation: u64::from(blueprint.external_dns_version.next()),
+        time_created: chrono::Utc::now(),
+        zones,
+    }
 }
 
 fn dns_compute_update(
@@ -803,3 +849,17 @@ mod test {
         logctx.cleanup_successful();
     }
 }
+
+// XXX-dap duplicated -- figure out where to put this
+/// Returns the (relative) DNS name for this Silo's API and console endpoints
+/// _within_ the external DNS zone (i.e., without that zone's suffix)
+///
+/// This specific naming scheme is determined under RFD 357.
+pub(crate) fn silo_dns_name(
+    name: &omicron_common::api::external::Name,
+) -> String {
+    // RFD 4 constrains resource names (including Silo names) to DNS-safe
+    // strings, which is why it's safe to directly put the name of the
+    // resource into the DNS name rather than doing any kind of escaping.
+    format!("{}.sys", name)
+}
From 7903de96f721e6130064c1b6d956c9951e49b4e2 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Thu, 7 Mar 2024 10:10:16 -0800
Subject: [PATCH 06/34] add basic test for external DNS computation

---
 nexus/db-queries/src/db/datastore/rack.rs |   4 +
 nexus/reconfigurator/execution/src/dns.rs | 144 ++++++++++++++++--
 nexus/reconfigurator/planning/Cargo.toml  |   6 +-
 .../planning/src/blueprint_builder.rs     |   3 +-
 4 files changed, 145 insertions(+), 12 deletions(-)

diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs
index 32e059bf81..bc9a7bf395 100644
--- a/nexus/db-queries/src/db/datastore/rack.rs
+++ b/nexus/db-queries/src/db/datastore/rack.rs
@@ -796,6 +796,10 @@ impl DataStore {
     ) -> Result<(Vec<IpAddr>, Vec<DnsZone>), Error> {
         opctx.authorize(authz::Action::Read, &authz::DNS_CONFIG).await?;
 
+        // XXX-dap use the current target blueprint here?
+        // Or maybe do that in the caller, since the other caller is doing
+        // blueprint execution for what might not be the current target.
+
         use crate::db::schema::external_ip::dsl as extip_dsl;
         use crate::db::schema::service::dsl as service_dsl;
 
diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index a4320bc926..f27aa73aa6 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -72,7 +72,7 @@ pub(crate) async fn deploy_dns(
         .await
         .internal_context("listing Silos (for configuring external DNS)")?;
 
-    let (nexus_external_ips, nexus_external_dns_zones) =
+    let (_, nexus_external_dns_zones) =
         datastore.nexus_external_addresses(opctx).await?;
     let nexus_external_dns_zone_names = nexus_external_dns_zones
         .into_iter()
@@ -80,8 +80,7 @@ pub(crate) async fn deploy_dns(
         .collect::<Vec<_>>();
     let external_dns_config_blueprint = blueprint_external_dns_config(
         blueprint,
-        &silos,
-        &nexus_external_ips,
+        &silos.iter().collect::<Vec<_>>(),
         &nexus_external_dns_zone_names,
     );
 
@@ -321,15 +320,25 @@ pub fn blueprint_internal_dns_config(
 
 pub fn blueprint_external_dns_config(
     blueprint: &Blueprint,
-    silos: &[Silo],
-    nexus_external_ips: &[IpAddr],
+    silos: &[&Silo],
     external_dns_zone_names: &[String],
 ) -> DnsConfigParams {
+    let nexus_external_ips =
+        blueprint.all_omicron_zones().filter_map(|(_, z)| {
+            if blueprint.zones_in_service.contains(&z.id) {
+                if let OmicronZoneType::Nexus { external_ip, .. } = &z.zone_type
+                {
+                    return Some(*external_ip);
+                }
+            }
+
+            None
+        });
     let dns_records: Vec<DnsRecord> = nexus_external_ips
         .into_iter()
         .map(|addr| match addr {
-            IpAddr::V4(addr) => DnsRecord::A(*addr),
-            IpAddr::V6(addr) => DnsRecord::Aaaa(*addr),
+            IpAddr::V4(addr) => DnsRecord::A(addr),
+            IpAddr::V6(addr) => DnsRecord::Aaaa(addr),
         })
         .collect();
 
@@ -415,11 +424,15 @@ fn dns_compute_update(
 mod test {
     use super::blueprint_internal_dns_config;
     use super::dns_compute_update;
+    use crate::dns::blueprint_external_dns_config;
+    use crate::dns::silo_dns_name;
     use crate::Sled;
     use internal_dns::ServiceName;
     use internal_dns::DNS_ZONE;
     use nexus_db_model::DnsGroup;
+    use nexus_db_model::Silo;
     use nexus_inventory::CollectionBuilder;
+    use nexus_reconfigurator_planning::blueprint_builder::test::example;
     use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder;
     use nexus_types::deployment::Blueprint;
     use nexus_types::deployment::OmicronZoneConfig;
     use nexus_types::deployment::OmicronZoneType;
     use nexus_types::deployment::Policy;
     use nexus_types::deployment::SledResources;
     use nexus_types::deployment::ZpoolName;
+    use nexus_types::external_api::params;
+    use nexus_types::external_api::shared;
     use nexus_types::external_api::views::SledPolicy;
     use nexus_types::external_api::views::SledState;
+    use nexus_types::identity::Resource;
     use nexus_types::internal_api::params::DnsConfigParams;
     use nexus_types::internal_api::params::DnsConfigZone;
     use nexus_types::internal_api::params::DnsRecord;
@@ -438,10 +454,12 @@ mod test {
     use omicron_common::address::RACK_PREFIX;
     use omicron_common::address::SLED_PREFIX;
     use omicron_common::api::external::Generation;
+    use omicron_common::api::external::IdentityMetadataCreateParams;
     use omicron_test_utils::dev::test_setup_log;
     use std::collections::BTreeMap;
     use std::collections::BTreeSet;
     use std::collections::HashMap;
+    use std::net::IpAddr;
     use std::net::Ipv4Addr;
     use std::net::Ipv6Addr;
     use std::net::SocketAddrV6;
@@ -477,9 +495,9 @@ mod test {
         }
     }
 
-    /// test blueprint_dns_config(): trivial case of an empty blueprint
+    /// test blueprint_internal_dns_config(): trivial case of an empty blueprint
     #[test]
-    fn test_blueprint_dns_empty() {
+    fn 
test_blueprint_internal_dns_empty() { let blueprint = blueprint_empty(); let blueprint_dns = blueprint_internal_dns_config(&blueprint, &BTreeMap::new()); @@ -490,7 +508,7 @@ mod test { /// - one of each type of zone in service /// - some zones not in service #[test] - fn test_blueprint_dns_basic() { + fn test_blueprint_internal_dns_basic() { // We'll use the standard representative inventory collection to build a // blueprint. The main thing we care about here is that we have at // least one zone of each type. Later, we'll mark a couple of the sleds @@ -757,6 +775,112 @@ mod test { assert!(srv_kinds_expected.is_empty()); } + #[tokio::test] + async fn test_blueprint_external_dns_basic() { + let (collection, policy) = example(5); + let initial_external_dns_generation = Generation::new(); + let blueprint = BlueprintBuilder::build_initial_from_collection( + &collection, + Generation::new(), + initial_external_dns_generation, + &policy, + "test suite", + ) + .expect("failed to generate initial blueprint"); + + let my_silo = Silo::new(params::SiloCreate { + identity: IdentityMetadataCreateParams { + name: "my-silo".parse().unwrap(), + description: String::new(), + }, + quotas: params::SiloQuotasCreate::empty(), + discoverable: false, + identity_mode: shared::SiloIdentityMode::SamlJit, + admin_group_name: None, + tls_certificates: vec![], + mapped_fleet_roles: Default::default(), + }) + .unwrap(); + + // It shouldn't ever be possible to have no Silos at all, but at least + // make sure we don't panic. + let external_dns_config = blueprint_external_dns_config( + &blueprint, + &[], + &[String::from("oxide.test")], + ); + assert_eq!( + external_dns_config.generation, + u64::from(initial_external_dns_generation.next()) + ); + assert_eq!(external_dns_config.zones.len(), 1); + assert_eq!(external_dns_config.zones[0].zone_name, "oxide.test"); + assert!(external_dns_config.zones[0].records.is_empty()); + + // Same with external DNS zones. + let external_dns_config = + blueprint_external_dns_config(&blueprint, &[&my_silo], &[]); + assert_eq!( + external_dns_config.generation, + u64::from(initial_external_dns_generation.next()) + ); + assert!(external_dns_config.zones.is_empty()); + + // Now check a more typical case. (Although we wouldn't normally have + // more than one external DNS zone, it's a more general case and pretty + // easy to test.) + let external_dns_config = blueprint_external_dns_config( + &blueprint, + &[&my_silo], + &[String::from("oxide1.test"), String::from("oxide2.test")], + ); + assert_eq!( + external_dns_config.generation, + u64::from(initial_external_dns_generation.next()) + ); + assert_eq!(external_dns_config.zones.len(), 2); + assert_eq!( + external_dns_config.zones[0].records, + external_dns_config.zones[1].records + ); + assert_eq!( + external_dns_config.zones[0].zone_name, + String::from("oxide1.test"), + ); + assert_eq!( + external_dns_config.zones[1].zone_name, + String::from("oxide2.test"), + ); + let records = &external_dns_config.zones[0].records; + assert_eq!(records.len(), 1); + let silo_records = records + .get(&silo_dns_name(my_silo.name())) + .expect("missing silo DNS records"); + + // Here we're hardcoding the contents of the example blueprint. It + // currently puts one Nexus zone on each sled. If we change the example + // blueprint, change the expected set of IPs here. 
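+        // (With nsleds = 5 and one Nexus zone per sled, we expect five
+        // external IPs.  The example system hands out addresses sequentially
+        // from the IP pool, which is where 192.0.2.2 through 192.0.2.6 below
+        // come from.)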
+        let mut silo_record_ips: Vec<_> = silo_records
+            .into_iter()
+            .map(|record| match record {
+                DnsRecord::A(v) => IpAddr::V4(*v),
+                DnsRecord::Aaaa(v) => IpAddr::V6(*v),
+                DnsRecord::Srv(_) => panic!("unexpected SRV record"),
+            })
+            .collect();
+        silo_record_ips.sort();
+        assert_eq!(
+            silo_record_ips,
+            &[
+                "192.0.2.2".parse::<IpAddr>().unwrap(),
+                "192.0.2.3".parse::<IpAddr>().unwrap(),
+                "192.0.2.4".parse::<IpAddr>().unwrap(),
+                "192.0.2.5".parse::<IpAddr>().unwrap(),
+                "192.0.2.6".parse::<IpAddr>().unwrap(),
+            ]
+        );
+    }
+
     #[test]
     fn test_dns_compute_update() {
         let logctx = test_setup_log("dns_compute_update");
diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml
index 3ec616168c..086f189b09 100644
--- a/nexus/reconfigurator/planning/Cargo.toml
+++ b/nexus/reconfigurator/planning/Cargo.toml
@@ -17,8 +17,12 @@ slog.workspace = true
 thiserror.workspace = true
 uuid.workspace = true
 
+# XXX-dap this needs to move back to dev-dependencies.  I moved it so I can
+# expose the test module to another crate.
+sled-agent-client.workspace = true
+
 omicron-workspace-hack.workspace = true
 
 [dev-dependencies]
 omicron-test-utils.workspace = true
-sled-agent-client.workspace = true
+#sled-agent-client.workspace = true
diff --git a/nexus/reconfigurator/planning/src/blueprint_builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder.rs
index 9a82a3720a..f85672d54e 100644
--- a/nexus/reconfigurator/planning/src/blueprint_builder.rs
+++ b/nexus/reconfigurator/planning/src/blueprint_builder.rs
@@ -727,7 +727,8 @@ impl<'a> BlueprintZones<'a> {
     }
 }
 
-#[cfg(test)]
+// XXX-dap need this until my other PR lands
+// #[cfg(test)]
 pub mod test {
     use super::*;
     use nexus_types::external_api::views::SledPolicy;
From 453f851797c1a2b99e7f74ed469dd146e25973cc Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Thu, 7 Mar 2024 14:09:43 -0800
Subject: [PATCH 07/34] move blueprint example() out of [cfg(test)]

---
 dev-tools/reconfigurator-cli/src/main.rs     |   2 +
 nexus/reconfigurator/execution/src/dns.rs    |   6 +-
 .../planning/src/blueprint_builder.rs        | 120 +----------------
 nexus/reconfigurator/planning/src/example.rs | 119 +++++++++++++++++
 nexus/reconfigurator/planning/src/lib.rs     |   1 +
 nexus/reconfigurator/planning/src/planner.rs |   4 +-
 6 files changed, 132 insertions(+), 120 deletions(-)
 create mode 100644 nexus/reconfigurator/planning/src/example.rs

diff --git a/dev-tools/reconfigurator-cli/src/main.rs b/dev-tools/reconfigurator-cli/src/main.rs
index 9cdf09a6f9..e8069fe70c 100644
--- a/dev-tools/reconfigurator-cli/src/main.rs
+++ b/dev-tools/reconfigurator-cli/src/main.rs
@@ -454,6 +454,7 @@ fn cmd_blueprint_from_inventory(
     let blueprint = BlueprintBuilder::build_initial_from_collection(
         collection,
         dns_version,
+        dns_version,
         &policy,
         creator,
     )
@@ -487,6 +488,7 @@ fn cmd_blueprint_plan(
         sim.log.clone(),
         parent_blueprint,
         dns_version,
+        dns_version,
         &policy,
         creator,
         collection,
diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index f27aa73aa6..8aebdc3f21 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -432,8 +432,8 @@ mod test {
     use nexus_db_model::DnsGroup;
     use nexus_db_model::Silo;
     use nexus_inventory::CollectionBuilder;
-    use nexus_reconfigurator_planning::blueprint_builder::test::example;
     use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder;
+    use nexus_reconfigurator_planning::example::example;
     use nexus_types::deployment::Blueprint;
     use nexus_types::deployment::OmicronZoneConfig;
     use
nexus_types::deployment::OmicronZoneType; @@ -777,7 +777,8 @@ mod test { #[tokio::test] async fn test_blueprint_external_dns_basic() { - let (collection, policy) = example(5); + let logctx = test_setup_log("test_blueprint_external_dns_basic"); + let (collection, policy) = example(&logctx.log, 5); let initial_external_dns_generation = Generation::new(); let blueprint = BlueprintBuilder::build_initial_from_collection( &collection, @@ -879,6 +880,7 @@ mod test { "192.0.2.6".parse::().unwrap(), ] ); + logctx.cleanup_successful(); } #[test] diff --git a/nexus/reconfigurator/planning/src/blueprint_builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder.rs index 3b6d8fb915..51de7299dd 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder.rs @@ -753,130 +753,18 @@ impl<'a> BlueprintZones<'a> { } } -// XXX-dap need this until my other PR lands -// #[cfg(test)] +#[cfg(test)] pub mod test { use super::*; + use crate::example::example; + use crate::example::ExampleSystem; use crate::system::SledBuilder; - use crate::system::SystemDescription; use omicron_common::address::IpRange; use omicron_test_utils::dev::test_setup_log; - use sled_agent_client::types::{ - OmicronZoneConfig, OmicronZoneType, OmicronZonesConfig, - }; + use sled_agent_client::types::{OmicronZoneConfig, OmicronZoneType}; pub const DEFAULT_N_SLEDS: usize = 3; - pub struct ExampleSystem { - pub system: SystemDescription, - pub policy: Policy, - pub collection: Collection, - pub blueprint: Blueprint, - } - - impl ExampleSystem { - pub fn new(log: &slog::Logger, nsleds: usize) -> ExampleSystem { - let mut system = SystemDescription::new(); - let sled_ids: Vec<_> = - (0..nsleds).map(|_| Uuid::new_v4()).collect(); - for sled_id in &sled_ids { - let _ = system.sled(SledBuilder::new().id(*sled_id)).unwrap(); - } - - let policy = system.to_policy().expect("failed to make policy"); - let mut inventory_builder = system - .to_collection_builder() - .expect("failed to build collection"); - - // For each sled, have it report 0 zones in the initial inventory. - // This will enable us to build a blueprint from the initial - // inventory, which we can then use to build new blueprints. - for sled_id in &sled_ids { - inventory_builder - .found_sled_omicron_zones( - "fake sled agent", - *sled_id, - OmicronZonesConfig { - generation: Generation::new(), - zones: vec![], - }, - ) - .expect("recording Omicron zones"); - } - - let empty_zone_inventory = inventory_builder.build(); - let initial_blueprint = - BlueprintBuilder::build_initial_from_collection( - &empty_zone_inventory, - Generation::new(), - &policy, - "test suite", - ) - .unwrap(); - - // Now make a blueprint and collection with some zones on each sled. 
- let mut builder = BlueprintBuilder::new_based_on( - &log, - &initial_blueprint, - Generation::new(), - &policy, - "test suite", - ) - .unwrap(); - for (sled_id, sled_resources) in &policy.sleds { - let _ = builder.sled_ensure_zone_ntp(*sled_id).unwrap(); - let _ = builder - .sled_ensure_zone_multiple_nexus_with_config( - *sled_id, - 1, - false, - vec![], - ) - .unwrap(); - for pool_name in &sled_resources.zpools { - let _ = builder - .sled_ensure_zone_crucible(*sled_id, pool_name.clone()) - .unwrap(); - } - } - - let blueprint = builder.build(); - let mut builder = system - .to_collection_builder() - .expect("failed to build collection"); - - for sled_id in blueprint.sleds() { - let Some(zones) = blueprint.omicron_zones.get(&sled_id) else { - continue; - }; - builder - .found_sled_omicron_zones( - "fake sled agent", - sled_id, - zones.clone(), - ) - .unwrap(); - } - - ExampleSystem { - system, - policy, - collection: builder.build(), - blueprint, - } - } - } - - /// Returns a collection and policy describing a pretty simple system. - /// - /// `n_sleds` is the number of sleds supported. Currently, this value can - /// be anywhere between 0 and 5. (More can be added in the future if - /// necessary.) - pub fn example(log: &slog::Logger, nsleds: usize) -> (Collection, Policy) { - let example = ExampleSystem::new(log, nsleds); - (example.collection, example.policy) - } - /// Checks various conditions that should be true for all blueprints pub fn verify_blueprint(blueprint: &Blueprint) { let mut underlay_ips: BTreeMap = diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs new file mode 100644 index 0000000000..f99d2493ba --- /dev/null +++ b/nexus/reconfigurator/planning/src/example.rs @@ -0,0 +1,119 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Example blueprints + +use crate::blueprint_builder::BlueprintBuilder; +use crate::system::SledBuilder; +use crate::system::SystemDescription; +use nexus_types::deployment::Blueprint; +use nexus_types::deployment::Policy; +use nexus_types::inventory::Collection; +use omicron_common::api::external::Generation; +use sled_agent_client::types::OmicronZonesConfig; +use uuid::Uuid; + +pub struct ExampleSystem { + pub system: SystemDescription, + pub policy: Policy, + pub collection: Collection, + pub blueprint: Blueprint, +} + +impl ExampleSystem { + pub fn new(log: &slog::Logger, nsleds: usize) -> ExampleSystem { + let mut system = SystemDescription::new(); + let sled_ids: Vec<_> = (0..nsleds).map(|_| Uuid::new_v4()).collect(); + for sled_id in &sled_ids { + let _ = system.sled(SledBuilder::new().id(*sled_id)).unwrap(); + } + + let policy = system.to_policy().expect("failed to make policy"); + let mut inventory_builder = + system.to_collection_builder().expect("failed to build collection"); + + // For each sled, have it report 0 zones in the initial inventory. + // This will enable us to build a blueprint from the initial + // inventory, which we can then use to build new blueprints. 
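+        // (Each sled still has to report *something*: an empty-but-present
+        // OmicronZonesConfig is what lets build_initial_from_collection()
+        // below treat the sled as part of the system.)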
+ for sled_id in &sled_ids { + inventory_builder + .found_sled_omicron_zones( + "fake sled agent", + *sled_id, + OmicronZonesConfig { + generation: Generation::new(), + zones: vec![], + }, + ) + .expect("recording Omicron zones"); + } + + let empty_zone_inventory = inventory_builder.build(); + let initial_blueprint = + BlueprintBuilder::build_initial_from_collection( + &empty_zone_inventory, + Generation::new(), + Generation::new(), + &policy, + "test suite", + ) + .unwrap(); + + // Now make a blueprint and collection with some zones on each sled. + let mut builder = BlueprintBuilder::new_based_on( + &log, + &initial_blueprint, + Generation::new(), + Generation::new(), + &policy, + "test suite", + ) + .unwrap(); + for (sled_id, sled_resources) in &policy.sleds { + let _ = builder.sled_ensure_zone_ntp(*sled_id).unwrap(); + let _ = builder + .sled_ensure_zone_multiple_nexus_with_config( + *sled_id, + 1, + false, + vec![], + ) + .unwrap(); + for pool_name in &sled_resources.zpools { + let _ = builder + .sled_ensure_zone_crucible(*sled_id, pool_name.clone()) + .unwrap(); + } + } + + let blueprint = builder.build(); + let mut builder = + system.to_collection_builder().expect("failed to build collection"); + + for sled_id in blueprint.sleds() { + let Some(zones) = blueprint.omicron_zones.get(&sled_id) else { + continue; + }; + builder + .found_sled_omicron_zones( + "fake sled agent", + sled_id, + zones.clone(), + ) + .unwrap(); + } + + ExampleSystem { system, policy, collection: builder.build(), blueprint } + } +} + +/// Returns a collection and policy describing a pretty simple system. +/// +/// `n_sleds` is the number of sleds supported. Currently, this value can +/// be anywhere between 0 and 5. (More can be added in the future if +/// necessary.) +pub fn example(log: &slog::Logger, nsleds: usize) -> (Collection, Policy) { + let example = ExampleSystem::new(log, nsleds); + (example.collection, example.policy) +} diff --git a/nexus/reconfigurator/planning/src/lib.rs b/nexus/reconfigurator/planning/src/lib.rs index e0a61826f0..3d6cc6a778 100644 --- a/nexus/reconfigurator/planning/src/lib.rs +++ b/nexus/reconfigurator/planning/src/lib.rs @@ -116,6 +116,7 @@ //! updates, etc. 
 pub mod blueprint_builder;
+pub mod example;
 mod ip_allocator;
 pub mod planner;
 pub mod system;
diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs
index 447b3a573c..8b7385a75a 100644
--- a/nexus/reconfigurator/planning/src/planner.rs
+++ b/nexus/reconfigurator/planning/src/planner.rs
@@ -320,11 +320,11 @@ impl<'a> Planner<'a> {
 #[cfg(test)]
 mod test {
     use super::Planner;
-    use crate::blueprint_builder::test::example;
     use crate::blueprint_builder::test::verify_blueprint;
-    use crate::blueprint_builder::test::ExampleSystem;
     use crate::blueprint_builder::test::DEFAULT_N_SLEDS;
     use crate::blueprint_builder::BlueprintBuilder;
+    use crate::example::example;
+    use crate::example::ExampleSystem;
     use crate::system::SledBuilder;
     use nexus_inventory::now_db_precision;
     use nexus_types::external_api::views::SledPolicy;
From ca6959554c198872658505fe7bbf12d1727cc1f5 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Thu, 7 Mar 2024 17:00:07 -0800
Subject: [PATCH 08/34] local change to deploy one extra Nexus zone

---
 nexus/src/app/deployment.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs
index 82cb497078..c0d59f49b3 100644
--- a/nexus/src/app/deployment.rs
+++ b/nexus/src/app/deployment.rs
@@ -123,7 +123,7 @@ impl super::Nexus {
             &sled_rows,
             &zpool_rows,
             &ip_pool_range_rows,
-            NEXUS_REDUNDANCY,
+            NEXUS_REDUNDANCY + 1, // XXX-dap
         )?;
 
         // The choice of which inventory collection to use here is not
From 728537f1584990f31d125c9b5ec1cc6b1fe6b9f4 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Fri, 8 Mar 2024 02:36:50 +0000
Subject: [PATCH 09/34] fixes

---
 dev-tools/omdb/src/bin/omdb/nexus.rs      |  5 ++++-
 nexus/db-queries/src/db/datastore/silo.rs | 11 +++++++++--
 nexus/reconfigurator/execution/src/dns.rs |  3 ++-
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs
index 692931db51..969fbbdf5b 100644
--- a/dev-tools/omdb/src/bin/omdb/nexus.rs
+++ b/dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -947,7 +947,10 @@ async fn cmd_nexus_blueprints_generate_from_collection(
     )
     .await
     .context("creating blueprint from collection id")?;
-    eprintln!("created blueprint {} from collection id", blueprint.id);
+    eprintln!(
+        "created blueprint {} from collection id {}",
+        blueprint.id, args.collection_id
+    );
     Ok(())
 }
 
diff --git a/nexus/db-queries/src/db/datastore/silo.rs b/nexus/db-queries/src/db/datastore/silo.rs
index a9f8812a90..59c5e80232 100644
--- a/nexus/db-queries/src/db/datastore/silo.rs
+++ b/nexus/db-queries/src/db/datastore/silo.rs
@@ -47,6 +47,7 @@ use ref_cast::RefCast;
 use uuid::Uuid;
 
 /// Filter a "silo_list" query based on silos' discoverability
+#[derive(Clone, Copy)]
 pub enum Discoverability {
     /// Show all Silos
     All,
@@ -361,13 +362,19 @@ impl DataStore {
     pub async fn silo_list_all_batched(
         &self,
         opctx: &OpContext,
+        discoverability: Discoverability,
     ) -> ListResultVec<Silo> {
         opctx.check_complex_operations_allowed()?;
         let mut all_silos = Vec::new();
         let mut paginator = Paginator::new(SQL_BATCH_SIZE);
         while let Some(p) = paginator.next() {
-            let batch =
-                self.silos_list_by_id(opctx, &p.current_pagparams()).await?;
+            let batch = self
+                .silos_list(
+                    opctx,
+                    &PaginatedBy::Id(p.current_pagparams()),
+                    discoverability,
+                )
+                .await?;
             paginator =
                 p.found_batch(&batch, &|s: &nexus_db_model::Silo| s.id());
             all_silos.extend(batch);
diff --git a/nexus/reconfigurator/execution/src/dns.rs
b/nexus/reconfigurator/execution/src/dns.rs index 8aebdc3f21..dc15315aa8 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -11,6 +11,7 @@ use internal_dns::ServiceName; use nexus_db_model::DnsGroup; use nexus_db_model::Silo; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::datastore::Discoverability; use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; @@ -68,7 +69,7 @@ pub(crate) async fn deploy_dns( let internal_dns_config_blueprint = blueprint_internal_dns_config(blueprint, sleds_by_id); let silos = datastore - .silo_list_all_batched(opctx) + .silo_list_all_batched(opctx, Discoverability::All) .await .internal_context("listing Silos (for configuring external DNS)")?; From 139a28e3c6aebbdff9849b9b82518a79274fe925 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 8 Mar 2024 03:41:37 +0000 Subject: [PATCH 10/34] various fixes --- clients/dns-service-client/src/diff.rs | 7 ++-- clients/dns-service-client/src/lib.rs | 39 +++++++++++++++++++++++ nexus/reconfigurator/execution/src/dns.rs | 7 +++- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/clients/dns-service-client/src/diff.rs b/clients/dns-service-client/src/diff.rs index 39d51cc974..3a04bc6729 100644 --- a/clients/dns-service-client/src/diff.rs +++ b/clients/dns-service-client/src/diff.rs @@ -59,8 +59,11 @@ impl<'a> DnsDiff<'a> { &self, ) -> impl Iterator { self.left.iter().filter_map(|(k, v1)| match self.right.get(k) { - Some(v2) if v1 != v2 => { - Some((k.as_ref(), v1.as_ref(), v2.as_ref())) + Some(v2) => { + let v1_sorted = v1.clone().sort(); + let v2_sorted = v2.clone().sort(); + (v1_sorted != v2_sorted) + .then(|| (k.as_ref(), v1.as_ref(), v2.as_ref())) } _ => None, }) diff --git a/clients/dns-service-client/src/lib.rs b/clients/dns-service-client/src/lib.rs index cd17a1559c..316c4787b0 100644 --- a/clients/dns-service-client/src/lib.rs +++ b/clients/dns-service-client/src/lib.rs @@ -109,3 +109,42 @@ impl types::DnsConfigParams { Ok(&self.zones[0]) } } + +impl Ord for types::DnsRecord { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use types::DnsRecord; + match (self, other) { + // Same kinds: compare the items in them + (DnsRecord::A(addr1), DnsRecord::A(addr2)) => addr1.cmp(addr2), + (DnsRecord::Aaaa(addr1), DnsRecord::Aaaa(addr2)) => { + addr1.cmp(addr2) + } + (DnsRecord::Srv(srv1), DnsRecord::Srv(srv2)) => srv1 + .target + .cmp(&srv2.target) + .then_with(|| srv1.port.cmp(&srv2.port)), + + // Different kinds: define an arbitrary order among the kinds. + // We could use std::mem::discriminant() here but it'd be nice if + // this were stable over time. + // We define (arbitrarily): A < Aaaa < Srv + (DnsRecord::A(_), DnsRecord::Aaaa(_) | DnsRecord::Srv(_)) => { + std::cmp::Ordering::Less + } + (DnsRecord::Aaaa(_), DnsRecord::Srv(_)) => std::cmp::Ordering::Less, + + // Anything else will result in "Greater". But let's be explicit. 
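+            // (Being explicit also means that if a new DnsRecord variant is
+            // ever added, this match stops compiling and the ordering has to
+            // be revisited, rather than silently falling into a catch-all.)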
+            (DnsRecord::Aaaa(_), DnsRecord::A(_))
+            | (DnsRecord::Srv(_), DnsRecord::A(_))
+            | (DnsRecord::Srv(_), DnsRecord::Aaaa(_)) => {
+                std::cmp::Ordering::Greater
+            }
+        }
+    }
+}
+
+impl PartialOrd for types::DnsRecord {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index 8aebdc3f21..dc15315aa8 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -11,6 +11,7 @@ use internal_dns::ServiceName;
 use nexus_db_model::DnsGroup;
 use nexus_db_model::Silo;
 use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::datastore::Discoverability;
 use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder;
+use nexus_db_queries::db::fixed_data::silo::SILO_ID;
 use nexus_db_queries::db::DataStore;
 use nexus_types::deployment::Blueprint;
 use nexus_types::deployment::OmicronZoneType;
@@ -71,7 +72,11 @@ pub(crate) async fn deploy_dns(
     let silos = datastore
         .silo_list_all_batched(opctx, Discoverability::All)
         .await
-        .internal_context("listing Silos (for configuring external DNS)")?;
+        .internal_context("listing Silos (for configuring external DNS)")?
+        .into_iter()
+        // We do not generate a DNS name for the "default" Silo.
+        .filter(|silo| silo.id() != *SILO_ID)
+        .collect::<Vec<_>>();
 
     let (_, nexus_external_dns_zones) =
         datastore.nexus_external_addresses(opctx).await?;
From e88c2c4023b4cb7c388f68337cf30850f6bf43dc Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Fri, 8 Mar 2024 12:03:08 -0800
Subject: [PATCH 11/34] that is not how you do an order-insensitive comparison

---
 clients/dns-service-client/src/diff.rs    |  6 +-
 nexus/reconfigurator/execution/src/dns.rs | 83 +++++++++++++++++++++++
 2 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/clients/dns-service-client/src/diff.rs b/clients/dns-service-client/src/diff.rs
index 3a04bc6729..ce04319dff 100644
--- a/clients/dns-service-client/src/diff.rs
+++ b/clients/dns-service-client/src/diff.rs
@@ -60,8 +60,10 @@ impl<'a> DnsDiff<'a> {
     ) -> impl Iterator {
         self.left.iter().filter_map(|(k, v1)| match self.right.get(k) {
             Some(v2) => {
-                let v1_sorted = v1.clone().sort();
-                let v2_sorted = v2.clone().sort();
+                let mut v1_sorted = v1.clone();
+                let mut v2_sorted = v2.clone();
+                v1_sorted.sort();
+                v2_sorted.sort();
                 (v1_sorted != v2_sorted)
                     .then(|| (k.as_ref(), v1.as_ref(), v2.as_ref()))
             }
diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index c09a397721..dd191b17e8 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -454,6 +454,7 @@ mod test {
     use nexus_types::internal_api::params::DnsConfigParams;
     use nexus_types::internal_api::params::DnsConfigZone;
     use nexus_types::internal_api::params::DnsRecord;
+    use nexus_types::internal_api::params::Srv;
     use omicron_common::address::get_sled_address;
     use omicron_common::address::get_switch_zone_address;
     use omicron_common::address::Ipv6Subnet;
@@ -978,6 +979,88 @@ mod test {
             ]
         );
 
+        // Test the difference between two configs whose SRV records differ.
+        let mut dns_config1 = dns_config1.clone();
+        dns_config1.zones[0].records.insert(
+            String::from("_nexus._tcp"),
+            vec![
+                DnsRecord::Srv(Srv {
+                    port: 123,
+                    prio: 1,
+                    target: String::from("ex1.my-zone"),
+                    weight: 2,
+                }),
+                DnsRecord::Srv(Srv {
+                    port: 123,
+                    prio: 1,
+                    target: String::from("ex2.my-zone"),
+                    weight: 2,
+                }),
+            ],
+        );
+        // A clone of the same one should of course be the same as the original.
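+        // ("The same" here meaning dns_compute_update() returns None, i.e.,
+        // no new DNS generation would be written at all.)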
+        let mut dns_config2 = dns_config1.clone();
+        let update = dns_compute_update(
+            &logctx.log,
+            DnsGroup::Internal,
+            "test-suite".to_string(),
+            "test-suite".to_string(),
+            &dns_config1,
+            &dns_config2,
+        )
+        .expect("failed to compute update");
+        assert!(update.is_none());
+
+        // If we shift the order of the items, it should still reflect no
+        // changes.
+        let records =
+            dns_config2.zones[0].records.get_mut("_nexus._tcp").unwrap();
+        records.rotate_left(1);
+        assert!(
+            records != dns_config1.zones[0].records.get("_nexus._tcp").unwrap()
+        );
+        let update = dns_compute_update(
+            &logctx.log,
+            DnsGroup::Internal,
+            "test-suite".to_string(),
+            "test-suite".to_string(),
+            &dns_config1,
+            &dns_config2,
+        )
+        .expect("failed to compute update");
+        assert!(update.is_none());
+
+        // If we add another record, there should indeed be a new update.
+        let records =
+            dns_config2.zones[0].records.get_mut("_nexus._tcp").unwrap();
+        records.push(DnsRecord::Srv(Srv {
+            port: 123,
+            prio: 1,
+            target: String::from("ex3.my-zone"),
+            weight: 2,
+        }));
+        let final_records = records.clone();
+
+        let update = dns_compute_update(
+            &logctx.log,
+            DnsGroup::Internal,
+            "test-suite".to_string(),
+            "test-suite".to_string(),
+            &dns_config1,
+            &dns_config2,
+        )
+        .expect("failed to compute update")
+        .expect("expected an update");
+
+        assert_eq!(
+            update.names_removed().collect::<Vec<_>>(),
+            &["_nexus._tcp"]
+        );
+        assert_eq!(
+            update.names_added().collect::<Vec<_>>(),
+            &[("_nexus._tcp", final_records.as_slice())]
+        );
+
         logctx.cleanup_successful();
     }
 }
From ca1d630a0319862ae313665f6911b5cd29b2c7e5 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Fri, 8 Mar 2024 14:32:16 -0800
Subject: [PATCH 12/34] WIP: test runner should more faithfully fake inventory
 so we can write better tests

---
 Cargo.lock                                |   2 +
 nexus/reconfigurator/execution/src/dns.rs | 124 +++++++++++++++++++++-
 nexus/test-utils/Cargo.toml               |   2 +
 nexus/test-utils/src/lib.rs               |  41 +++++++
 4 files changed, 165 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index fd9155bb8e..59ec3de7a2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4581,6 +4581,7 @@ dependencies = [
 "headers",
 "http 0.2.12",
 "hyper 0.14.27",
+ "illumos-utils",
 "internal-dns",
 "nexus-config",
 "nexus-db-queries",
@@ -4597,6 +4598,7 @@ dependencies = [
 "serde",
 "serde_json",
 "serde_urlencoded",
+ "sled-agent-client",
 "slog",
 "tokio",
 "tokio-util",
diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index dd191b17e8..4beacd5fa9 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -24,7 +24,6 @@ use nexus_types::internal_api::params::DnsRecord;
 use omicron_common::address::get_switch_zone_address;
 use omicron_common::address::CLICKHOUSE_KEEPER_PORT;
 use omicron_common::address::CLICKHOUSE_PORT;
-use omicron_common::address::COCKROACH_PORT;
 use omicron_common::address::CRUCIBLE_PANTRY_PORT;
 use omicron_common::address::CRUCIBLE_PORT;
 use omicron_common::address::DENDRITE_PORT;
@@ -41,6 +40,7 @@ use omicron_common::api::external::Generation;
 use omicron_common::api::external::InternalContext;
 use slog::{debug, info, o};
 use std::collections::BTreeMap;
 use std::collections::HashMap;
 use std::net::IpAddr;
+use std::net::SocketAddrV6;
 use uuid::Uuid;
 
 pub(crate) async fn deploy_dns(
@@ -240,6 +240,11 @@ pub fn blueprint_internal_dns_config(
     // the details.
     let mut dns_builder = DnsConfigBuilder::new();
 
+    // XXX-dap don't panic
+    fn parse_port(address: &str) -> u16 {
+        address.parse::<SocketAddrV6>().unwrap().port()
+    }
+
     // The code below assumes that all zones are using the default port numbers.
     // That should be true, as those are the only ports ever used today.
     // In an ideal world, the correct port would be pulled out of the
@@ -253,7 +258,7 @@ pub fn blueprint_internal_dns_config(
             continue;
         }
 
-        let (service_name, port) = match omicron_zone.zone_type {
+        let (service_name, port) = match &omicron_zone.zone_type {
             OmicronZoneType::BoundaryNtp { .. } => {
                 (ServiceName::BoundaryNtp, NTP_PORT)
             }
@@ -266,8 +271,9 @@
             OmicronZoneType::ClickhouseKeeper { .. } => {
                 (ServiceName::ClickhouseKeeper, CLICKHOUSE_KEEPER_PORT)
             }
-            OmicronZoneType::CockroachDb { .. } => {
-                (ServiceName::Cockroach, COCKROACH_PORT)
+            OmicronZoneType::CockroachDb { address, .. } => {
+                let port = parse_port(&address);
+                (ServiceName::Cockroach, port)
             }
             OmicronZoneType::Nexus { .. } => {
                 (ServiceName::Nexus, NEXUS_INTERNAL_PORT)
@@ -437,9 +443,11 @@ mod test {
     use internal_dns::DNS_ZONE;
     use nexus_db_model::DnsGroup;
     use nexus_db_model::Silo;
+    use nexus_db_queries::context::OpContext;
     use nexus_inventory::CollectionBuilder;
     use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder;
     use nexus_reconfigurator_planning::example::example;
+    use nexus_test_utils_macros::nexus_test;
     use nexus_types::deployment::Blueprint;
     use nexus_types::deployment::OmicronZoneConfig;
     use nexus_types::deployment::OmicronZoneType;
@@ -460,9 +468,13 @@
     use omicron_common::address::Ipv6Subnet;
     use omicron_common::address::RACK_PREFIX;
     use omicron_common::address::SLED_PREFIX;
+    use omicron_common::api::external::Error;
     use omicron_common::api::external::Generation;
     use omicron_common::api::external::IdentityMetadataCreateParams;
+    use omicron_test_utils::dev::poll::wait_for_condition;
+    use omicron_test_utils::dev::poll::CondCheckError;
     use omicron_test_utils::dev::test_setup_log;
+    use slog::{debug, info};
     use std::collections::BTreeMap;
     use std::collections::BTreeSet;
     use std::collections::HashMap;
@@ -471,8 +483,12 @@
     use std::net::Ipv6Addr;
     use std::net::SocketAddrV6;
     use std::str::FromStr;
+    use std::time::Duration;
     use uuid::Uuid;
 
+    type ControlPlaneTestContext =
+        nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;
+
     fn blueprint_empty() -> Blueprint {
         let builder = CollectionBuilder::new("test-suite");
         let collection = builder.build();
@@ -1063,6 +1079,106 @@
         logctx.cleanup_successful();
     }
 
+    // Tests end-to-end DNS behavior:
+    //
+    // - If we create a blueprint matching the current system, and then apply
+    //   it, there are no changes to either internal or external DNS
+    //
+    // - If we then generate a blueprint with a Nexus zone and execute the DNS
+    //   part of that, then:
+    //
+    //   - internal DNS SRV record for _nexus._tcp is updated
+    //   - internal DNS AAAA record for the new zone is added
+    //   - external DNS gets an A record for the new zone's external IP
+    //
+    // - If we subsequently create a new Silo, the new Silo's DNS record
+    //   reflects the Nexus zone that was added.
+    // XXX-dap move to crate-level test since it uses realize_blueprint()?
+    #[nexus_test]
+    async fn test_silos_external_dns_end_to_end(
+        cptestctx: &ControlPlaneTestContext,
+    ) {
+        let nexus = &cptestctx.server.apictx().nexus;
+        let datastore = nexus.datastore();
+        let log = &cptestctx.logctx.log;
+        let opctx = OpContext::for_tests(log.clone(), datastore.clone());
+
+        // First, wait until Nexus has successfully completed an inventory
+        // collection.
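A note on the polling helper used for this step: `wait_for_condition` (from `omicron_test_utils::dev::poll`) takes a closure that distinguishes "keep waiting" (`CondCheckError::NotYet`) from hard failure, and retries it on an interval up to a deadline. A synchronous, self-contained sketch of the same shape, with illustrative names:

    use std::time::{Duration, Instant};

    enum CondCheckError<E> {
        NotYet,
        Failed(E),
    }

    fn wait_for<T, E>(
        mut check: impl FnMut() -> Result<T, CondCheckError<E>>,
        poll_interval: Duration,
        poll_max: Duration,
    ) -> Result<T, Option<E>> {
        let deadline = Instant::now() + poll_max;
        loop {
            match check() {
                Ok(value) => return Ok(value),
                Err(CondCheckError::Failed(error)) => return Err(Some(error)),
                // Condition not met yet: retry until the deadline passes.
                Err(CondCheckError::NotYet) if Instant::now() < deadline => {
                    std::thread::sleep(poll_interval)
                }
                // Timed out waiting for the condition.
                Err(CondCheckError::NotYet) => return Err(None),
            }
        }
    }

The real helper is async and takes the intervals by reference (as in the call below), but the control flow is the same.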
+ let collection = wait_for_condition( + || async { + let result = + datastore.inventory_get_latest_collection(&opctx).await; + let log_result = match &result { + Ok(Some(_)) => Ok("found"), + Ok(None) => Ok("not found"), + Err(error) => Err(error), + }; + debug!( + log, + "attempt to fetch latest inventory collection"; + "result" => ?log_result, + ); + + match result { + Ok(None) => Err(CondCheckError::NotYet), + Ok(Some(c)) => Ok(c), + Err(Error::ServiceUnavailable { .. }) => { + Err(CondCheckError::NotYet) + } + Err(error) => Err(CondCheckError::Failed(error)), + } + }, + &Duration::from_millis(50), + &Duration::from_secs(30), + ) + .await + .expect("expected to find inventory collection"); + + // Fetch the initial contents of internal and external DNS. + let dns_initial_internal = datastore + .dns_config_read(&opctx, DnsGroup::Internal) + .await + .expect("fetching initial internal DNS"); + let dns_initial_external = datastore + .dns_config_read(&opctx, DnsGroup::External) + .await + .expect("fetching initial external DNS"); + + // Now, use it to construct an initial blueprint. + info!(log, "using collection"; "collection_id" => %collection.id); + let blueprint = nexus + .blueprint_generate_from_collection(&opctx, collection.id) + .await + .expect("failed to generate initial blueprint"); + + // Now, execute the blueprint. + crate::realize_blueprint(&opctx, datastore, &blueprint, "test-suite") + .await + .expect("failed to execute initial blueprint"); + + // Now fetch DNS again. It ought not to have changed. + let dns_latest_internal = datastore + .dns_config_read(&opctx, DnsGroup::Internal) + .await + .expect("fetching latest internal DNS"); + let dns_latest_external = datastore + .dns_config_read(&opctx, DnsGroup::External) + .await + .expect("fetching latest external DNS"); + + assert_eq!( + dns_initial_internal.generation, + dns_latest_internal.generation + ); + assert_eq!( + dns_initial_external.generation, + dns_latest_external.generation + ); + + // XXX-dap continue writing the test. See above. 
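A related note: the `parse_port` helper added earlier in this patch series is marked XXX-dap because it unwraps, so a malformed address string would panic inside DNS config generation. One non-panicking shape it could take, returning the error to the caller instead (a sketch; the error type and message are illustrative, not the eventual fix):

    use std::net::SocketAddrV6;

    fn parse_port(address: &str) -> Result<u16, String> {
        address
            .parse::<SocketAddrV6>()
            .map(|sockaddr| sockaddr.port())
            // Keep the offending input in the error for debuggability.
            .map_err(|e| format!("parsing {:?} as a SocketAddrV6: {}", address, e))
    }

Callers in `blueprint_internal_dns_config` would then propagate the error rather than unwrapping.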
+ } } // XXX-dap duplicated -- figure out where to put this diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index e612547fa8..861527108b 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -20,6 +20,7 @@ gateway-test-utils.workspace = true headers.workspace = true http.workspace = true hyper.workspace = true +illumos-utils.workspace = true internal-dns.workspace = true nexus-config.workspace = true nexus-db-queries.workspace = true @@ -35,6 +36,7 @@ oximeter-producer.workspace = true serde.workspace = true serde_json.workspace = true serde_urlencoded.workspace = true +sled-agent-client.workspace = true slog.workspace = true tokio.workspace = true tokio-util.workspace = true diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 4ef77b3352..98a7f9ccd6 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -33,6 +33,10 @@ use nexus_types::internal_api::params::RecoverySiloConfig; use nexus_types::internal_api::params::ServiceKind; use nexus_types::internal_api::params::ServiceNic; use nexus_types::internal_api::params::ServicePutRequest; +use nexus_types::inventory::OmicronZoneConfig; +use nexus_types::inventory::OmicronZoneDataset; +use nexus_types::inventory::OmicronZoneType; +use nexus_types::inventory::OmicronZonesConfig; use omicron_common::address::DNS_OPTE_IPV4_SUBNET; use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET; use omicron_common::api::external::MacAddr; @@ -57,6 +61,7 @@ use trust_dns_resolver::config::ResolverOpts; use trust_dns_resolver::TokioAsyncResolver; use uuid::Uuid; +use omicron_common::api::external::Generation; pub use sim::TEST_HARDWARE_THREADS; pub use sim::TEST_RESERVOIR_RAM; @@ -260,6 +265,7 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { pub external_dns: Option, pub internal_dns: Option, dns_config: Option, + omicron_zones: Vec, pub silo_name: Option, pub user_name: Option, @@ -300,6 +306,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { external_dns: None, internal_dns: None, dns_config: None, + omicron_zones: Vec::new(), silo_name: None, user_name: None, } @@ -380,6 +387,18 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { DatasetKind::Cockroach, internal_dns::ServiceName::Cockroach, ); + let pool_name = illumos_utils::zpool::ZpoolName::new_external(zpool_id) + .to_string() + .parse() + .unwrap(); + self.omicron_zones.push(OmicronZoneConfig { + id: dataset_id, + underlay_address: *address.ip(), + zone_type: OmicronZoneType::CockroachDb { + address: address.to_string(), + dataset: OmicronZoneDataset { pool_name }, + }, + }); self.database = Some(database); } @@ -753,6 +772,24 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { self.sled_agent_storage = Some(tempdir); } + pub async fn configure_sled_agent(&mut self) { + // Tell our Sled Agent to report the zones that we configured. + let Some(sled_agent) = &self.sled_agent else { + panic!("no sled agent has been created"); + }; + let client = sled_agent_client::Client::new( + &format!("http://{}", sled_agent.http_server.local_addr()), + self.logctx.log.clone(), + ); + client + .omicron_zones_put(&OmicronZonesConfig { + zones: self.omicron_zones.clone(), + generation: Generation::new().next(), + }) + .await + .expect("Failed to configure sled agent with our zones"); + } + // Set up the Crucible Pantry on an existing Sled Agent. 
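Why `configure_sled_agent` above submits `Generation::new().next()`: the sled-agent tracks its zone configuration by generation number, a freshly started (simulated) sled-agent sits at the initial generation, and a submitted config must supersede what is already there. A small model of that rule, with illustrative types (the real acceptance check lives in the sled-agent):

    #[derive(Clone, Copy, PartialEq, PartialOrd)]
    struct Generation(u64);

    impl Generation {
        fn new() -> Self { Generation(1) }
        fn next(self) -> Self { Generation(self.0 + 1) }
    }

    struct ZoneConfigState {
        current: Generation,
    }

    impl ZoneConfigState {
        // Reject configs older than what has already been applied; the test
        // harness therefore submits Generation::new().next() so its one
        // config supersedes the initial (empty) one.
        fn put(&mut self, incoming: Generation) -> Result<(), String> {
            if incoming < self.current {
                return Err(format!("stale generation {}", incoming.0));
            }
            self.current = incoming;
            Ok(())
        }
    }

This also explains why each service-start step only records its zone in `self.omicron_zones` and a single late step pushes them all in one `omicron_zones_put` call: one versioned config, rather than one generation bump per service.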
pub async fn start_crucible_pantry(&mut self) { let sled_agent = self @@ -1043,6 +1080,10 @@ async fn setup_with_config_impl( "populate_internal_dns", Box::new(|builder| builder.populate_internal_dns().boxed()), ), + ( + "configure_sled_agent", + Box::new(|builder| builder.configure_sled_agent().boxed()), + ), ( "start_nexus_external", Box::new(|builder| { From ce06eee0f175818cde0ad80c5cb880a0cf3c5642 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 8 Mar 2024 16:21:07 -0800 Subject: [PATCH 13/34] simulate Clickhouse --- nexus/reconfigurator/execution/src/dns.rs | 6 +++--- nexus/test-utils/src/lib.rs | 13 +++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 4beacd5fa9..820e61c2da 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -23,7 +23,6 @@ use nexus_types::internal_api::params::DnsConfigZone; use nexus_types::internal_api::params::DnsRecord; use omicron_common::address::get_switch_zone_address; use omicron_common::address::CLICKHOUSE_KEEPER_PORT; -use omicron_common::address::CLICKHOUSE_PORT; use omicron_common::address::CRUCIBLE_PANTRY_PORT; use omicron_common::address::CRUCIBLE_PORT; use omicron_common::address::DENDRITE_PORT; @@ -265,8 +264,9 @@ pub fn blueprint_internal_dns_config( OmicronZoneType::InternalNtp { .. } => { (ServiceName::InternalNtp, NTP_PORT) } - OmicronZoneType::Clickhouse { .. } => { - (ServiceName::Clickhouse, CLICKHOUSE_PORT) + OmicronZoneType::Clickhouse { address, .. } => { + let port = parse_port(&address); + (ServiceName::Clickhouse, port) } OmicronZoneType::ClickhouseKeeper { .. } => { (ServiceName::ClickhouseKeeper, CLICKHOUSE_KEEPER_PORT) diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 98a7f9ccd6..4822279dc3 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -435,6 +435,19 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .as_mut() .expect("Tests expect to set a port of Clickhouse") .set_port(port); + + let pool_name = illumos_utils::zpool::ZpoolName::new_external(zpool_id) + .to_string() + .parse() + .unwrap(); + self.omicron_zones.push(OmicronZoneConfig { + id: dataset_id, + underlay_address: *address.ip(), + zone_type: OmicronZoneType::Clickhouse { + address: address.to_string(), + dataset: OmicronZoneDataset { pool_name }, + }, + }); } pub async fn start_gateway(&mut self) { From 54dd75b790bff7f64a514db2ffbc53898c4869c6 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 8 Mar 2024 16:21:12 -0800 Subject: [PATCH 14/34] simulate Nexus --- nexus/reconfigurator/execution/src/dns.rs | 6 +- .../execution/src/resource_allocation.rs | 12 ++++ nexus/test-utils/src/lib.rs | 58 ++++++++++++++++--- 3 files changed, 65 insertions(+), 11 deletions(-) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 820e61c2da..1c66425676 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -29,7 +29,6 @@ use omicron_common::address::DENDRITE_PORT; use omicron_common::address::DNS_HTTP_PORT; use omicron_common::address::MGD_PORT; use omicron_common::address::MGS_PORT; -use omicron_common::address::NEXUS_INTERNAL_PORT; use omicron_common::address::NTP_PORT; use omicron_common::address::OXIMETER_PORT; use omicron_common::api::external::Error; @@ -275,8 +274,9 @@ pub fn blueprint_internal_dns_config( let port = 
parse_port(&address); (ServiceName::Cockroach, port) } - OmicronZoneType::Nexus { .. } => { - (ServiceName::Nexus, NEXUS_INTERNAL_PORT) + OmicronZoneType::Nexus { internal_address, .. } => { + let port = parse_port(internal_address); + (ServiceName::Nexus, port) } OmicronZoneType::Crucible { .. } => { (ServiceName::Crucible(omicron_zone.id), CRUCIBLE_PORT) diff --git a/nexus/reconfigurator/execution/src/resource_allocation.rs b/nexus/reconfigurator/execution/src/resource_allocation.rs index 8ca44df39e..24e5046ed5 100644 --- a/nexus/reconfigurator/execution/src/resource_allocation.rs +++ b/nexus/reconfigurator/execution/src/resource_allocation.rs @@ -93,6 +93,12 @@ impl<'a> ResourceAllocator<'a> { external_ip: IpAddr, port_range: Option<(u16, u16)>, ) -> anyhow::Result { + // Treat localhost as always allocated. We only use this in the test + // suite. + if external_ip.is_loopback() { + return Ok(true); + } + let allocated_ips = self .datastore .service_lookup_external_ips(self.opctx, zone_id) @@ -157,6 +163,12 @@ impl<'a> ResourceAllocator<'a> { zone_id: Uuid, nic: &NetworkInterface, ) -> anyhow::Result { + // Treat localhost as always allocated. We only use this in the test + // suite. + if nic.ip.is_loopback() { + return Ok(true); + } + let allocated_nics = self .datastore .service_list_network_interfaces(self.opctx, zone_id) diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 4822279dc3..8444ade49b 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -62,8 +62,11 @@ use trust_dns_resolver::TokioAsyncResolver; use uuid::Uuid; use omicron_common::api::external::Generation; +use omicron_common::api::external::Vni; pub use sim::TEST_HARDWARE_THREADS; pub use sim::TEST_RESERVOIR_RAM; +use sled_agent_client::types::NetworkInterface; +use sled_agent_client::types::NetworkInterfaceKind; pub mod db; pub mod http_testing; @@ -184,6 +187,7 @@ impl RackInitRequestBuilder { // Keeps track of: // - The "ServicePutRequest" (for handoff to Nexus) // - The internal DNS configuration for this service + // XXX-dap remove me fn add_service( &mut self, address: SocketAddrV6, @@ -192,6 +196,17 @@ impl RackInitRequestBuilder { sled_id: Uuid, ) { let zone_id = Uuid::new_v4(); + self.add_service_with_id(zone_id, address, kind, service_name, sled_id); + } + + fn add_service_with_id( + &mut self, + zone_id: Uuid, + address: SocketAddrV6, + kind: ServiceKind, + service_name: internal_dns::ServiceName, + sled_id: Uuid, + ) { self.services.push(ServicePutRequest { address, kind, @@ -617,16 +632,14 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .mac_addrs .next() .expect("ran out of MAC addresses"); - self.rack_init_builder.add_service( + let external_address = + self.config.deployment.dropshot_external.dropshot.bind_address.ip(); + let nexus_id = self.config.deployment.id; + self.rack_init_builder.add_service_with_id( + nexus_id, address, ServiceKind::Nexus { - external_address: self - .config - .deployment - .dropshot_external - .dropshot - .bind_address - .ip(), + external_address, nic: ServiceNic { id: Uuid::new_v4(), name: "nexus".parse().unwrap(), @@ -642,6 +655,35 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { sled_id, ); + self.omicron_zones.push(OmicronZoneConfig { + id: nexus_id, + underlay_address: *address.ip(), + zone_type: OmicronZoneType::Nexus { + external_dns_servers: self + .config + .deployment + .external_dns_servers + .clone(), + external_ip: external_address, + external_tls: 
self.config.deployment.dropshot_external.tls, + internal_address: address.to_string(), + nic: NetworkInterface { + id: Uuid::new_v4(), + ip: external_address, + kind: NetworkInterfaceKind::Service(nexus_id), + mac, + name: format!("nexus-{}", nexus_id).parse().unwrap(), + primary: true, + slot: 0, + subnet: sled_agent_client::types::Ipv4Net::from( + *NEXUS_OPTE_IPV4_SUBNET, + ) + .into(), + vni: Vni::SERVICES_VNI, + }, + }, + }); + self.nexus_internal = Some(nexus_internal); self.nexus_internal_addr = Some(nexus_internal_addr); } From b05dacd1ddbe187c6791474d411e06193dba3260 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 8 Mar 2024 17:16:03 -0800 Subject: [PATCH 15/34] WIP: add dendrite and Crucible Pantry, but the dendrite IP/ports are confused --- dev-tools/omicron-dev/src/bin/omicron-dev.rs | 2 +- nexus/reconfigurator/execution/src/dns.rs | 6 +- nexus/test-utils/src/lib.rs | 193 ++++++++++++++----- sled-agent/src/sim/sled_agent.rs | 2 +- 4 files changed, 151 insertions(+), 52 deletions(-) diff --git a/dev-tools/omicron-dev/src/bin/omicron-dev.rs b/dev-tools/omicron-dev/src/bin/omicron-dev.rs index 5e0c6486d6..94c56b50f2 100644 --- a/dev-tools/omicron-dev/src/bin/omicron-dev.rs +++ b/dev-tools/omicron-dev/src/bin/omicron-dev.rs @@ -545,7 +545,7 @@ async fn cmd_run_all(args: &RunAllArgs) -> Result<(), anyhow::Error> { ); println!( "omicron-dev: management gateway: http://{}", - cptestctx.gateway.client.bind_address, + cptestctx.gateway.values().next().unwrap().client.bind_address, ); println!("omicron-dev: silo name: {}", cptestctx.silo_name,); println!( diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 1c66425676..03d0adef7b 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -23,7 +23,6 @@ use nexus_types::internal_api::params::DnsConfigZone; use nexus_types::internal_api::params::DnsRecord; use omicron_common::address::get_switch_zone_address; use omicron_common::address::CLICKHOUSE_KEEPER_PORT; -use omicron_common::address::CRUCIBLE_PANTRY_PORT; use omicron_common::address::CRUCIBLE_PORT; use omicron_common::address::DENDRITE_PORT; use omicron_common::address::DNS_HTTP_PORT; @@ -281,8 +280,9 @@ pub fn blueprint_internal_dns_config( OmicronZoneType::Crucible { .. } => { (ServiceName::Crucible(omicron_zone.id), CRUCIBLE_PORT) } - OmicronZoneType::CruciblePantry { .. } => { - (ServiceName::CruciblePantry, CRUCIBLE_PANTRY_PORT) + OmicronZoneType::CruciblePantry { address } => { + let port = parse_port(address); + (ServiceName::CruciblePantry, port) } OmicronZoneType::Oximeter { .. 
} => { (ServiceName::Oximeter, OXIMETER_PORT) diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 8444ade49b..0e6c00d918 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -73,6 +73,7 @@ pub mod http_testing; pub mod resource_helpers; pub const SLED_AGENT_UUID: &str = "b6d65341-167c-41df-9b5c-41cded99c229"; +pub const SLED_AGENT2_UUID: &str = "039be560-54cc-49e3-88df-1a29dadbf913"; pub const RACK_UUID: &str = "c19a698f-c6f9-4a17-ae30-20d711b8f7dc"; pub const SWITCH_UUID: &str = "dae4e1f1-410e-4314-bff1-fec0504be07e"; pub const OXIMETER_UUID: &str = "39e6175b-4df2-4730-b11d-cbc1e60a2e78"; @@ -96,9 +97,11 @@ pub struct ControlPlaneTestContext { pub logctx: LogContext, pub sled_agent_storage: camino_tempfile::Utf8TempDir, pub sled_agent: sim::Server, + pub sled_agent2_storage: camino_tempfile::Utf8TempDir, + pub sled_agent2: sim::Server, pub oximeter: Oximeter, pub producer: ProducerServer, - pub gateway: GatewayTestContext, + pub gateway: HashMap, pub dendrite: HashMap, pub mgd: HashMap, pub external_dns_zone_name: String, @@ -118,9 +121,12 @@ impl ControlPlaneTestContext { self.database.cleanup().await.unwrap(); self.clickhouse.cleanup().await.unwrap(); self.sled_agent.http_server.close().await.unwrap(); + self.sled_agent2.http_server.close().await.unwrap(); self.oximeter.close().await.unwrap(); self.producer.close().await.unwrap(); - self.gateway.teardown().await; + for (_, gateway) in self.gateway { + gateway.teardown().await; + } for (_, mut dendrite) in self.dendrite { dendrite.cleanup().await.unwrap(); } @@ -210,7 +216,7 @@ impl RackInitRequestBuilder { self.services.push(ServicePutRequest { address, kind, - service_id: Uuid::new_v4(), + service_id: zone_id, sled_id, zone_id: Some(zone_id), }); @@ -223,6 +229,22 @@ impl RackInitRequestBuilder { .expect("Failed to set up DNS for {kind}"); } + fn add_service_without_dns( + &mut self, + zone_id: Uuid, + address: SocketAddrV6, + kind: ServiceKind, + sled_id: Uuid, + ) { + self.services.push(ServicePutRequest { + address, + kind, + service_id: zone_id, + sled_id, + zone_id: Some(zone_id), + }); + } + // Keeps track of: // - The "DatasetPutRequest" (for handoff to Nexus) // - The internal DNS configuration for this service @@ -265,9 +287,11 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { pub clickhouse: Option, pub sled_agent_storage: Option, pub sled_agent: Option, + pub sled_agent2_storage: Option, + pub sled_agent2: Option, pub oximeter: Option, pub producer: Option, - pub gateway: Option, + pub gateway: HashMap, pub dendrite: HashMap, pub mgd: HashMap, @@ -281,6 +305,7 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { pub internal_dns: Option, dns_config: Option, omicron_zones: Vec, + omicron_zones2: Vec, pub silo_name: Option, pub user_name: Option, @@ -310,9 +335,11 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { clickhouse: None, sled_agent_storage: None, sled_agent: None, + sled_agent2_storage: None, + sled_agent2: None, oximeter: None, producer: None, - gateway: None, + gateway: HashMap::new(), dendrite: HashMap::new(), mgd: HashMap::new(), nexus_internal: None, @@ -322,6 +349,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { internal_dns: None, dns_config: None, omicron_zones: Vec::new(), + omicron_zones2: Vec::new(), silo_name: None, user_name: None, } @@ -465,7 +493,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { }); } - pub async fn start_gateway(&mut self) { + pub async fn 
start_gateway(&mut self, switch_location: SwitchLocation) { // For now, this MGS is not configured to match up in any way with // either the simulated sled agent or the Dendrite instances. It's // useful for testing stuff unrelated to that. But at some point we @@ -476,24 +504,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { gateway_messages::SpPort::One, ) .await; - let fake_mgs_zone_id = Uuid::new_v4(); - let SocketAddr::V6(v6addr) = gateway.client.bind_address else { - panic!("MGS unexpectedly listening on IPv4?"); - }; - let zone = self - .rack_init_builder - .internal_dns_config - .host_zone(fake_mgs_zone_id, *v6addr.ip()) - .expect("Failed to add DNS for MGS zone"); - self.rack_init_builder - .internal_dns_config - .service_backend_zone( - internal_dns::ServiceName::ManagementGatewayService, - &zone, - v6addr.port(), - ) - .expect("Failed to add DNS for MGS service"); - self.gateway = Some(gateway); + self.gateway.insert(switch_location, gateway); } pub async fn start_dendrite(&mut self, switch_location: SwitchLocation) { @@ -513,11 +524,16 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { let config = DpdConfig { address: std::net::SocketAddr::V6(address) }; self.config.pkg.dendrite.insert(switch_location, config); - let sled_id = Uuid::parse_str(SLED_AGENT_UUID).unwrap(); - self.rack_init_builder.add_service( + let sled_id = Uuid::parse_str(match switch_location { + SwitchLocation::Switch0 => SLED_AGENT_UUID, + SwitchLocation::Switch1 => SLED_AGENT2_UUID, + }) + .unwrap(); + + self.rack_init_builder.add_service_without_dns( + sled_id, address, ServiceKind::Dendrite, - internal_dns::ServiceName::Dendrite, sled_id, ); } @@ -537,15 +553,46 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { let config = MgdConfig { address: std::net::SocketAddr::V6(address) }; self.config.pkg.mgd.insert(switch_location, config); - let sled_id = Uuid::parse_str(SLED_AGENT_UUID).unwrap(); - self.rack_init_builder.add_service( + let sled_id = Uuid::parse_str(match switch_location { + SwitchLocation::Switch0 => SLED_AGENT_UUID, + SwitchLocation::Switch1 => SLED_AGENT2_UUID, + }) + .unwrap(); + + self.rack_init_builder.add_service_without_dns( + sled_id, address, ServiceKind::Mgd, - internal_dns::ServiceName::Mgd, sled_id, ); } + pub async fn record_switch_dns(&mut self) { + let log = &self.logctx.log; + debug!(log, "Recording DNS for the switch zones"); + for (sled_id, switch_location) in &[ + (SLED_AGENT_UUID, SwitchLocation::Switch0), + (SLED_AGENT2_UUID, SwitchLocation::Switch1), + ] { + let id = sled_id.parse().unwrap(); + self.rack_init_builder + .internal_dns_config + .host_zone_switch( + id, + Ipv6Addr::LOCALHOST, + self.dendrite.get(switch_location).unwrap().port, + self.gateway + .get(switch_location) + .unwrap() + .client + .bind_address + .port(), + self.mgd.get(switch_location).unwrap().port, + ) + .unwrap(); + } + } + pub async fn start_oximeter(&mut self) { let log = &self.logctx.log; debug!(log, "Starting Oximeter"); @@ -803,12 +850,15 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { self.server = Some(server); } - pub async fn start_sled(&mut self, sim_mode: sim::SimMode) { + pub async fn start_sled(&mut self, first: bool, sim_mode: sim::SimMode) { let nexus_address = self.nexus_internal_addr.expect("Must launch Nexus first"); // Set up a single sled agent. 
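The `SwitchLocation`-to-sled-UUID match now appears verbatim in both `start_dendrite` and `start_mgd` above, and `record_switch_dns` encodes the same pairing. A sketch of factoring the mapping into one helper so it lives in a single place (the helper name is illustrative; the UUIDs are the harness constants):

    use uuid::Uuid;

    const SLED_AGENT_UUID: &str = "b6d65341-167c-41df-9b5c-41cded99c229";
    const SLED_AGENT2_UUID: &str = "039be560-54cc-49e3-88df-1a29dadbf913";

    enum SwitchLocation { Switch0, Switch1 }

    // One scrimlet per simulated switch; all three call sites above could
    // share this.
    fn scrimlet_id(switch_location: SwitchLocation) -> Uuid {
        let s = match switch_location {
            SwitchLocation::Switch0 => SLED_AGENT_UUID,
            SwitchLocation::Switch1 => SLED_AGENT2_UUID,
        };
        s.parse().unwrap()
    }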
- let sa_id = Uuid::parse_str(SLED_AGENT_UUID).unwrap(); + let sa_id: Uuid = + if first { SLED_AGENT_UUID } else { SLED_AGENT2_UUID } + .parse() + .unwrap(); let tempdir = camino_tempfile::tempdir().unwrap(); let sled_agent = start_sled_agent( self.logctx.log.new(o!( @@ -823,14 +873,23 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .await .expect("Failed to start sled agent"); - self.sled_agent = Some(sled_agent); - self.sled_agent_storage = Some(tempdir); + if first { + self.sled_agent = Some(sled_agent); + self.sled_agent_storage = Some(tempdir); + } else { + self.sled_agent2 = Some(sled_agent); + self.sled_agent2_storage = Some(tempdir); + } } - pub async fn configure_sled_agent(&mut self) { + pub async fn configure_sled_agent(&mut self, first: bool) { + let field = if first { &self.sled_agent } else { &self.sled_agent2 }; + let zones = + if first { &self.omicron_zones } else { &self.omicron_zones2 }; + // Tell our Sled Agent to report the zones that we configured. - let Some(sled_agent) = &self.sled_agent else { - panic!("no sled agent has been created"); + let Some(sled_agent) = field else { + panic!("expected sled agent has not been created"); }; let client = sled_agent_client::Client::new( &format!("http://{}", sled_agent.http_server.local_addr()), @@ -838,7 +897,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { ); client .omicron_zones_put(&OmicronZonesConfig { - zones: self.omicron_zones.clone(), + zones: zones.clone(), generation: Generation::new().next(), }) .await @@ -860,12 +919,21 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { }; let sled_id = Uuid::parse_str(SLED_AGENT_UUID).unwrap(); - self.rack_init_builder.add_service( + let zone_id = Uuid::new_v4(); + self.rack_init_builder.add_service_with_id( + zone_id, address, ServiceKind::CruciblePantry, internal_dns::ServiceName::CruciblePantry, sled_id, ); + self.omicron_zones.push(OmicronZoneConfig { + id: zone_id, + underlay_address: *address.ip(), + zone_type: OmicronZoneType::CruciblePantry { + address: address.to_string(), + }, + }); } // Set up an external DNS server. 
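A small design note on the `first: bool` now threaded through `start_sled` and `configure_sled_agent` above: boolean selectors read poorly at call sites (`start_sled(false, sim_mode)`), and a two-variant enum would make the same calls self-describing. A sketch of that alternative, with illustrative names and a `String` standing in for the `sim::Server` handle:

    #[derive(Clone, Copy)]
    enum WhichSled {
        First,
        Second,
    }

    struct Builder {
        sled_agent: Option<String>,
        sled_agent2: Option<String>,
    }

    impl Builder {
        // Selecting state by enum keeps the pairing of fields explicit.
        fn sled_agent(&self, which: WhichSled) -> &Option<String> {
            match which {
                WhichSled::First => &self.sled_agent,
                WhichSled::Second => &self.sled_agent2,
            }
        }
    }

The setup-step names ("start_sled1"/"start_sled2") would then line up with `WhichSled::First`/`Second` rather than `true`/`false`.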
@@ -938,10 +1006,12 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { clickhouse: self.clickhouse.unwrap(), sled_agent_storage: self.sled_agent_storage.unwrap(), sled_agent: self.sled_agent.unwrap(), + sled_agent2_storage: self.sled_agent2_storage.unwrap(), + sled_agent2: self.sled_agent2.unwrap(), oximeter: self.oximeter.unwrap(), producer: self.producer.unwrap(), logctx: self.logctx, - gateway: self.gateway.unwrap(), + gateway: self.gateway, dendrite: self.dendrite, mgd: self.mgd, external_dns_zone_name: self.external_dns_zone_name.unwrap(), @@ -965,13 +1035,16 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { if let Some(sled_agent) = self.sled_agent { sled_agent.http_server.close().await.unwrap(); } + if let Some(sled_agent2) = self.sled_agent2 { + sled_agent2.http_server.close().await.unwrap(); + } if let Some(oximeter) = self.oximeter { oximeter.close().await.unwrap(); } if let Some(producer) = self.producer { producer.close().await.unwrap(); } - if let Some(gateway) = self.gateway { + for (_, gateway) in self.gateway { gateway.teardown().await; } for (_, mut dendrite) in self.dendrite { @@ -1082,8 +1155,16 @@ async fn setup_with_config_impl( Box::new(|builder| builder.start_clickhouse().boxed()), ), ( - "start_gateway", - Box::new(|builder| builder.start_gateway().boxed()), + "start_gateway_switch0", + Box::new(|builder| { + builder.start_gateway(SwitchLocation::Switch0).boxed() + }), + ), + ( + "start_gateway_switch1", + Box::new(|builder| { + builder.start_gateway(SwitchLocation::Switch1).boxed() + }), ), ( "start_dendrite_switch0", @@ -1109,6 +1190,10 @@ async fn setup_with_config_impl( builder.start_mgd(SwitchLocation::Switch1).boxed() }), ), + ( + "record_switch_dns", + Box::new(|builder| builder.record_switch_dns().boxed()), + ), ( "start_internal_dns", Box::new(|builder| builder.start_internal_dns().boxed()), @@ -1122,9 +1207,15 @@ async fn setup_with_config_impl( Box::new(|builder| builder.start_nexus_internal().boxed()), ), ( - "start_sled", + "start_sled1", + Box::new(move |builder| { + builder.start_sled(true, sim_mode).boxed() + }), + ), + ( + "start_sled2", Box::new(move |builder| { - builder.start_sled(sim_mode).boxed() + builder.start_sled(false, sim_mode).boxed() }), ), ( @@ -1136,8 +1227,16 @@ async fn setup_with_config_impl( Box::new(|builder| builder.populate_internal_dns().boxed()), ), ( - "configure_sled_agent", - Box::new(|builder| builder.configure_sled_agent().boxed()), + "configure_sled_agent1", + Box::new(|builder| { + builder.configure_sled_agent(true).boxed() + }), + ), + ( + "configure_sled_agent2", + Box::new(|builder| { + builder.configure_sled_agent(false).boxed() + }), ), ( "start_nexus_external", diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 483b2d6aa8..0b90bef590 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -740,7 +740,7 @@ impl SledAgent { Ok(Inventory { sled_id: self.id, sled_agent_address, - sled_role: SledRole::Gimlet, + sled_role: SledRole::Scrimlet, baseboard: self.config.hardware.baseboard.clone(), usable_hardware_threads: self.config.hardware.hardware_threads, usable_physical_ram: ByteCount::try_from( From 0a153eaf72bb7237a2015b87a764cc61c9c4ef46 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 8 Mar 2024 19:15:35 -0800 Subject: [PATCH 16/34] that'll fix it --- nexus/reconfigurator/execution/src/dns.rs | 71 ++++++++++++++----- nexus/reconfigurator/execution/src/lib.rs | 57 +++++++++++++++ 
.../src/app/background/blueprint_execution.rs | 1 + 3 files changed, 113 insertions(+), 16 deletions(-) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 03d0adef7b..8d0f46e006 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -4,6 +4,7 @@ //! Propagates internal DNS changes in a given blueprint +use crate::ExecutionOverrides; use crate::Sled; use dns_service_client::DnsDiff; use internal_dns::DnsConfigBuilder; @@ -21,13 +22,9 @@ use nexus_types::identity::Resource; use nexus_types::internal_api::params::DnsConfigParams; use nexus_types::internal_api::params::DnsConfigZone; use nexus_types::internal_api::params::DnsRecord; -use omicron_common::address::get_switch_zone_address; use omicron_common::address::CLICKHOUSE_KEEPER_PORT; use omicron_common::address::CRUCIBLE_PORT; -use omicron_common::address::DENDRITE_PORT; use omicron_common::address::DNS_HTTP_PORT; -use omicron_common::address::MGD_PORT; -use omicron_common::address::MGS_PORT; use omicron_common::address::NTP_PORT; use omicron_common::address::OXIMETER_PORT; use omicron_common::api::external::Error; @@ -46,6 +43,7 @@ pub(crate) async fn deploy_dns( creator: String, blueprint: &Blueprint, sleds_by_id: &BTreeMap, + overrides: &ExecutionOverrides, ) -> Result<(), Error> { // First, fetch the current DNS configs. let internal_dns_config_current = datastore @@ -65,7 +63,7 @@ pub(crate) async fn deploy_dns( // Next, construct the DNS config represented by the blueprint. let internal_dns_config_blueprint = - blueprint_internal_dns_config(blueprint, sleds_by_id); + blueprint_internal_dns_config(blueprint, sleds_by_id, overrides); let silos = datastore .silo_list_all_batched(opctx, Discoverability::All) .await @@ -228,6 +226,7 @@ pub(crate) async fn deploy_dns_one( pub fn blueprint_internal_dns_config( blueprint: &Blueprint, sleds_by_id: &BTreeMap, + overrides: &ExecutionOverrides, ) -> DnsConfigParams { // The DNS names configured here should match what RSS configures for the // same zones. It's tricky to have RSS share the same code because it uses @@ -310,15 +309,15 @@ pub fn blueprint_internal_dns_config( let scrimlets = sleds_by_id.values().filter(|sled| sled.is_scrimlet); for scrimlet in scrimlets { let sled_subnet = scrimlet.subnet(); - let switch_zone_ip = get_switch_zone_address(sled_subnet); + let switch_zone_ip = overrides.switch_zone_ip(scrimlet.id, sled_subnet); // unwrap(): see above. 
dns_builder .host_zone_switch( scrimlet.id, switch_zone_ip, - DENDRITE_PORT, - MGS_PORT, - MGD_PORT, + overrides.dendrite_port(scrimlet.id), + overrides.mgs_port(scrimlet.id), + overrides.mgd_port(scrimlet.id), ) .unwrap(); } @@ -438,6 +437,7 @@ mod test { use super::dns_compute_update; use crate::dns::blueprint_external_dns_config; use crate::dns::silo_dns_name; + use crate::ExecutionOverrides; use crate::Sled; use internal_dns::ServiceName; use internal_dns::DNS_ZONE; @@ -447,6 +447,8 @@ mod test { use nexus_inventory::CollectionBuilder; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_reconfigurator_planning::example::example; + use nexus_test_utils::SLED_AGENT2_UUID; + use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::Blueprint; use nexus_types::deployment::OmicronZoneConfig; @@ -471,6 +473,7 @@ mod test { use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::external::IdentityMetadataCreateParams; + use omicron_common::api::external::SwitchLocation; use omicron_test_utils::dev::poll::wait_for_condition; use omicron_test_utils::dev::poll::CondCheckError; use omicron_test_utils::dev::test_setup_log; @@ -522,8 +525,11 @@ mod test { #[test] fn test_blueprint_internal_dns_empty() { let blueprint = blueprint_empty(); - let blueprint_dns = - blueprint_internal_dns_config(&blueprint, &BTreeMap::new()); + let blueprint_dns = blueprint_internal_dns_config( + &blueprint, + &BTreeMap::new(), + &Default::default(), + ); assert!(blueprint_dns.sole_zone().unwrap().records.is_empty()); } @@ -620,8 +626,11 @@ mod test { }) .collect(); - let dns_config_blueprint = - blueprint_internal_dns_config(&blueprint, &sleds_by_id); + let dns_config_blueprint = blueprint_internal_dns_config( + &blueprint, + &sleds_by_id, + &Default::default(), + ); assert_eq!( dns_config_blueprint.generation, u64::from(initial_dns_generation.next()) @@ -1154,9 +1163,39 @@ mod test { .expect("failed to generate initial blueprint"); // Now, execute the blueprint. - crate::realize_blueprint(&opctx, datastore, &blueprint, "test-suite") - .await - .expect("failed to execute initial blueprint"); + // XXX-dap doc/cleanup + let mut overrides = ExecutionOverrides::default(); + let scrimlets = [ + (SLED_AGENT_UUID, SwitchLocation::Switch0), + (SLED_AGENT2_UUID, SwitchLocation::Switch1), + ]; + for (id_str, switch_location) in scrimlets { + let sled_id = id_str.parse().unwrap(); + let ip = Ipv6Addr::LOCALHOST; + let mgs_port = cptestctx + .gateway + .get(&switch_location) + .unwrap() + .client + .bind_address + .port(); + let dendrite_port = + cptestctx.dendrite.get(&switch_location).unwrap().port; + let mgd_port = cptestctx.mgd.get(&switch_location).unwrap().port; + overrides.override_switch_zone_ip(sled_id, ip); + overrides.override_dendrite_port(sled_id, dendrite_port); + overrides.override_mgs_port(sled_id, mgs_port); + overrides.override_mgd_port(sled_id, mgd_port); + } + crate::realize_blueprint( + &opctx, + datastore, + &blueprint, + "test-suite", + &overrides, + ) + .await + .expect("failed to execute initial blueprint"); // Now fetch DNS again. It ought not to have changed. 
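The `ExecutionOverrides` built in the test above follow a per-sled override-with-fallback pattern: each accessor consults a map keyed by sled id and falls back to the production constant, so production callers, which never insert anything, always get the constant, while the test gets its ephemeral ports back. A distilled sketch of the pattern with one field shown (the port value is illustrative here):

    use std::collections::BTreeMap;
    use uuid::Uuid;

    const DENDRITE_PORT: u16 = 12224; // stand-in for the production default

    #[derive(Debug, Default)]
    struct Overrides {
        dendrite_ports: BTreeMap<Uuid, u16>,
    }

    impl Overrides {
        fn override_dendrite_port(&mut self, sled_id: Uuid, port: u16) {
            self.dendrite_ports.insert(sled_id, port);
        }

        // Fall back to the constant for any sled without an override.
        fn dendrite_port(&self, sled_id: Uuid) -> u16 {
            self.dendrite_ports.get(&sled_id).copied().unwrap_or(DENDRITE_PORT)
        }
    }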
let dns_latest_internal = datastore diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 74b4764d7a..db28052e65 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -11,11 +11,16 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; use nexus_types::identity::Asset; +use omicron_common::address::get_switch_zone_address; use omicron_common::address::Ipv6Subnet; +use omicron_common::address::DENDRITE_PORT; +use omicron_common::address::MGD_PORT; +use omicron_common::address::MGS_PORT; use omicron_common::address::SLED_PREFIX; use slog::info; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; +use std::net::Ipv6Addr; use std::net::SocketAddrV6; use uuid::Uuid; @@ -24,6 +29,56 @@ mod dns; mod omicron_zones; mod resource_allocation; +// XXX-dap +#[derive(Debug, Default)] +pub struct ExecutionOverrides { + pub dendrite_ports: BTreeMap, + pub mgs_ports: BTreeMap, + pub mgd_ports: BTreeMap, + pub switch_zone_ips: BTreeMap, +} + +impl ExecutionOverrides { + pub fn override_dendrite_port(&mut self, sled_id: Uuid, port: u16) { + self.dendrite_ports.insert(sled_id, port); + } + + fn dendrite_port(&self, sled_id: Uuid) -> u16 { + self.dendrite_ports.get(&sled_id).copied().unwrap_or(DENDRITE_PORT) + } + + pub fn override_mgs_port(&mut self, sled_id: Uuid, port: u16) { + self.mgs_ports.insert(sled_id, port); + } + + fn mgs_port(&self, sled_id: Uuid) -> u16 { + self.mgs_ports.get(&sled_id).copied().unwrap_or(MGS_PORT) + } + + pub fn override_mgd_port(&mut self, sled_id: Uuid, port: u16) { + self.mgd_ports.insert(sled_id, port); + } + + fn mgd_port(&self, sled_id: Uuid) -> u16 { + self.mgd_ports.get(&sled_id).copied().unwrap_or(MGD_PORT) + } + + pub fn override_switch_zone_ip(&mut self, sled_id: Uuid, addr: Ipv6Addr) { + self.switch_zone_ips.insert(sled_id, addr); + } + + fn switch_zone_ip( + &self, + sled_id: Uuid, + sled_subnet: Ipv6Subnet, + ) -> Ipv6Addr { + self.switch_zone_ips + .get(&sled_id) + .copied() + .unwrap_or_else(|| get_switch_zone_address(sled_subnet)) + } +} + struct Sled { id: Uuid, sled_agent_address: SocketAddrV6, @@ -56,6 +111,7 @@ pub async fn realize_blueprint( datastore: &DataStore, blueprint: &Blueprint, nexus_label: S, + overrides: &ExecutionOverrides, ) -> Result<(), Vec> where String: From, @@ -104,6 +160,7 @@ where String::from(nexus_label), blueprint, &sleds_by_id, + &overrides, ) .await .map_err(|e| vec![anyhow!("{}", InlineErrorChain::new(&e))])?; diff --git a/nexus/src/app/background/blueprint_execution.rs b/nexus/src/app/background/blueprint_execution.rs index 3cfc6d4e7f..f22d08a589 100644 --- a/nexus/src/app/background/blueprint_execution.rs +++ b/nexus/src/app/background/blueprint_execution.rs @@ -74,6 +74,7 @@ impl BackgroundTask for BlueprintExecutor { &self.datastore, blueprint, &self.nexus_label, + &Default::default(), ) .await; From 757018ee039d4bf1a005ed4c7c682c1c8c6c51d9 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 8 Mar 2024 21:45:26 -0800 Subject: [PATCH 17/34] fix more fallout in terrible ways --- dev-tools/omdb/tests/env.out | 3 +++ dev-tools/omdb/tests/successes.out | 13 ++++++----- .../tests/input/complex.json | 5 ++++- .../tests/output/collector_basic.txt | 4 ++-- .../output/collector_sled_agent_errors.txt | 2 +- .../app/background/inventory_collection.rs | 2 +- nexus/src/lib.rs | 22 ++++++++++++++++++- nexus/test-interface/src/lib.rs | 1 + 
nexus/test-utils/src/lib.rs | 1 + .../tests/integration_tests/initialization.rs | 22 +++++++------------ nexus/tests/integration_tests/sleds.rs | 14 ++++++------ 11 files changed, 56 insertions(+), 33 deletions(-) diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 3e6e89d508..ef8cf1631e 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -3,6 +3,7 @@ termination: Exited(0) --------------------------------------------- stdout: SERIAL IP ROLE ID +sim-039be560 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED sim-b6d65341 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED --------------------------------------------- stderr: @@ -268,6 +269,7 @@ termination: Exited(0) --------------------------------------------- stdout: SERIAL IP ROLE ID +sim-039be560 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED sim-b6d65341 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED --------------------------------------------- stderr: @@ -281,6 +283,7 @@ termination: Exited(0) --------------------------------------------- stdout: SERIAL IP ROLE ID +sim-039be560 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED sim-b6d65341 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED --------------------------------------------- stderr: diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index fe590acf55..2da6e4dceb 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -77,12 +77,10 @@ termination: Exited(0) stdout: SERVICE INSTANCE_ID ADDR SLED_SERIAL CruciblePantry REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 -Dendrite REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 -Dendrite REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 ExternalDns REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 InternalDns REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 Nexus REDACTED_UUID_REDACTED_UUID_REDACTED [::ffff:127.0.0.1]:REDACTED_PORT sim-b6d65341 -Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 +Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-039be560 Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 --------------------------------------------- stderr: @@ -93,17 +91,19 @@ EXECUTING COMMAND: omdb ["db", "services", "list-by-sled"] termination: Exited(0) --------------------------------------------- stdout: +sled: sim-039be560 (id REDACTED_UUID_REDACTED_UUID_REDACTED) + + SERVICE INSTANCE_ID ADDR + Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT + sled: sim-b6d65341 (id REDACTED_UUID_REDACTED_UUID_REDACTED) SERVICE INSTANCE_ID ADDR CruciblePantry REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT - Dendrite REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT - Dendrite REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT ExternalDns REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT InternalDns REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT Nexus REDACTED_UUID_REDACTED_UUID_REDACTED [::ffff:127.0.0.1]:REDACTED_PORT Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT - Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT --------------------------------------------- stderr: @@ -115,6 +115,7 @@ termination: Exited(0) --------------------------------------------- stdout: SERIAL IP ROLE ID 
+sim-039be560 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED sim-b6d65341 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED --------------------------------------------- stderr: diff --git a/dev-tools/reconfigurator-cli/tests/input/complex.json b/dev-tools/reconfigurator-cli/tests/input/complex.json index c168e153c1..8e3e9bc485 100644 --- a/dev-tools/reconfigurator-cli/tests/input/complex.json +++ b/dev-tools/reconfigurator-cli/tests/input/complex.json @@ -9948,6 +9948,7 @@ ], "parent_blueprint_id": null, "internal_dns_version": 1, + "external_dns_version": 1, "time_created": "2024-03-01T19:06:56.467313Z", "creator": "54c947d2-6355-453c-80fc-8f49cc2129ee", "comment": "from collection df8caafd-c444-4f65-a304-b9ceb62a96c2" @@ -10701,6 +10702,7 @@ ], "parent_blueprint_id": "486de160-c8f3-4600-acca-b0c78e33aca4", "internal_dns_version": 1, + "external_dns_version": 1, "time_created": "2024-03-01T19:07:58.105708Z", "creator": "54c947d2-6355-453c-80fc-8f49cc2129ee", "comment": "sled a243c1d0-9051-4b94-ab3e-f2a93fd0ae4f: add NTP zone" @@ -11550,9 +11552,10 @@ ], "parent_blueprint_id": "6c127695-ba15-408d-a992-325a1a888380", "internal_dns_version": 2, + "external_dns_version": 1, "time_created": "2024-03-01T19:08:52.730520Z", "creator": "54c947d2-6355-453c-80fc-8f49cc2129ee", "comment": "sled a243c1d0-9051-4b94-ab3e-f2a93fd0ae4f: add zones" } ] -} \ No newline at end of file +} diff --git a/nexus/inventory/tests/output/collector_basic.txt b/nexus/inventory/tests/output/collector_basic.txt index 4a05f09e1c..0fc1c552ab 100644 --- a/nexus/inventory/tests/output/collector_basic.txt +++ b/nexus/inventory/tests/output/collector_basic.txt @@ -71,12 +71,12 @@ rot pages found: CfpaScratch baseboard part "i86pc" serial "SimGimlet01": data_base64 "Z2ltbGV0LWNmcGEtc2NyYXRjaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=" sled agents found: - sled 03265caf-da7d-46c7-b1c2-39fa90ce5c65 (Gimlet) + sled 03265caf-da7d-46c7-b1c2-39fa90ce5c65 (Scrimlet) baseboard Some(BaseboardId { part_number: "sim-gimlet", serial_number: "sim-03265caf-da7d-46c7-b1c2-39fa90ce5c65" }) zone generation: Generation(3) zones found: zone 8b88a56f-3eb6-4d80-ba42-75d867bc427d type oximeter - sled 9cb9b78f-5614-440c-b66d-e8e81fab69b0 (Gimlet) + sled 9cb9b78f-5614-440c-b66d-e8e81fab69b0 (Scrimlet) baseboard Some(BaseboardId { part_number: "sim-gimlet", serial_number: "sim-9cb9b78f-5614-440c-b66d-e8e81fab69b0" }) zone generation: Generation(3) zones found: diff --git a/nexus/inventory/tests/output/collector_sled_agent_errors.txt b/nexus/inventory/tests/output/collector_sled_agent_errors.txt index aaa31fd1bb..7b9bbce84e 100644 --- a/nexus/inventory/tests/output/collector_sled_agent_errors.txt +++ b/nexus/inventory/tests/output/collector_sled_agent_errors.txt @@ -70,7 +70,7 @@ rot pages found: CfpaScratch baseboard part "i86pc" serial "SimGimlet01": data_base64 
"Z2ltbGV0LWNmcGEtc2NyYXRjaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=" sled agents found: - sled 9cb9b78f-5614-440c-b66d-e8e81fab69b0 (Gimlet) + sled 9cb9b78f-5614-440c-b66d-e8e81fab69b0 (Scrimlet) baseboard Some(BaseboardId { part_number: "sim-gimlet", serial_number: "sim-9cb9b78f-5614-440c-b66d-e8e81fab69b0" }) zone generation: Generation(3) zones found: diff --git a/nexus/src/app/background/inventory_collection.rs b/nexus/src/app/background/inventory_collection.rs index c0d64d554a..c5a95b411c 100644 --- a/nexus/src/app/background/inventory_collection.rs +++ b/nexus/src/app/background/inventory_collection.rs @@ -315,7 +315,7 @@ mod test { // There will be one sled agent set up as part of the test context. let found_urls = db_enum.list_sled_agents().await.unwrap(); - assert_eq!(found_urls.len(), 1); + assert_eq!(found_urls.len(), 2); // Insert some sleds. let rack_id = Uuid::new_v4(); diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index 771a78f0b1..0c8a306fa0 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -27,6 +27,7 @@ use dropshot::ConfigDropshot; use external_api::http_entrypoints::external_api; use internal_api::http_entrypoints::internal_api; use nexus_config::NexusConfig; +use nexus_types::external_api::views::SledProvisionPolicy; use nexus_types::internal_api::params::ServiceKind; use omicron_common::address::IpRange; use omicron_common::api::internal::shared::{ @@ -237,6 +238,7 @@ impl nexus_test_interface::NexusServer for Server { external_dns_zone_name: &str, recovery_silo: nexus_types::internal_api::params::RecoverySiloConfig, certs: Vec, + disable_sled_id: Uuid, ) -> Self { // Perform the "handoff from RSS". // @@ -302,7 +304,25 @@ impl nexus_test_interface::NexusServer for Server { .expect("Could not initialize rack"); // Start the Nexus external API. - Server::start(internal_server).await.unwrap() + let rv = Server::start(internal_server).await.unwrap(); + + // It's convenient for tests to assume that there's only one + // provisionable sled. 
+ // XXX-dap + rv.apictx() + .nexus + .sled_set_provision_policy( + &opctx, + &nexus_db_queries::db::lookup::LookupPath::new( + &opctx, + rv.apictx().nexus.datastore(), + ) + .sled_id(disable_sled_id), + SledProvisionPolicy::NonProvisionable, + ) + .await + .unwrap(); + rv } async fn get_http_server_external_address(&self) -> SocketAddr { diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs index 0f53ac6445..10bc9e63f0 100644 --- a/nexus/test-interface/src/lib.rs +++ b/nexus/test-interface/src/lib.rs @@ -56,6 +56,7 @@ pub trait NexusServer: Send + Sync + 'static { external_dns_zone_name: &str, recovery_silo: nexus_types::internal_api::params::RecoverySiloConfig, tls_certificates: Vec, + disable_sled_id: Uuid, ) -> Self; async fn get_http_server_external_address(&self) -> SocketAddr; diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 0e6c00d918..1967e66f05 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -822,6 +822,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { &external_dns_zone_name, recovery_silo, tls_certificates, + SLED_AGENT2_UUID.parse().unwrap(), ) .await; diff --git a/nexus/tests/integration_tests/initialization.rs b/nexus/tests/integration_tests/initialization.rs index b77a121080..b21cc37f4f 100644 --- a/nexus/tests/integration_tests/initialization.rs +++ b/nexus/tests/integration_tests/initialization.rs @@ -78,19 +78,6 @@ async fn test_nexus_boots_before_cockroach() { #[tokio::test] async fn test_nexus_boots_before_dendrite() { - // Start MGS + Sim SP. This is needed for the Dendrite client initialization - // inside of Nexus initialization - let (mgs_config, sp_sim_config) = mgs_setup::load_test_config(); - let mgs_addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, MGS_PORT, 0, 0); - let mgs = mgs_setup::test_setup_with_config( - "test_nexus_boots_before_dendrite", - SpPort::One, - mgs_config, - &sp_sim_config, - Some(mgs_addr), - ) - .await; - let mut config = load_test_config(); let mut builder = @@ -101,6 +88,13 @@ async fn test_nexus_boots_before_dendrite() { let log = builder.logctx.log.new(o!("component" => "test")); + // Start MGS + Sim SP. This is needed for the Dendrite client initialization + // inside of Nexus initialization + info!(&log, "Starting MGS"); + builder.start_gateway(SwitchLocation::Switch0).await; + builder.start_gateway(SwitchLocation::Switch1).await; + info!(&log, "Started MGS"); + let populate = true; builder.start_crdb(populate).await; builder.start_internal_dns().await; @@ -150,6 +144,7 @@ async fn test_nexus_boots_before_dendrite() { info!(log, "Started mgd"); info!(log, "Populating internal DNS records"); + builder.record_switch_dns().await; builder.populate_internal_dns().await; info!(log, "Populated internal DNS records"); @@ -157,7 +152,6 @@ async fn test_nexus_boots_before_dendrite() { nexus_handle.await.expect("Test: Task starting Nexus has failed"); builder.teardown().await; - mgs.teardown().await; } // Helper to ensure we perform the same setup for the positive and negative test diff --git a/nexus/tests/integration_tests/sleds.rs b/nexus/tests/integration_tests/sleds.rs index b551cf51b5..743a76be17 100644 --- a/nexus/tests/integration_tests/sleds.rs +++ b/nexus/tests/integration_tests/sleds.rs @@ -48,9 +48,9 @@ async fn sled_instance_list( async fn test_sleds_list(cptestctx: &ControlPlaneTestContext) { let client = &cptestctx.external_client; - // Verify that there is one sled to begin with. + // Verify that there are two sleds to begin with. 
let sleds_url = "/v1/system/hardware/sleds"; - assert_eq!(sleds_list(&client, &sleds_url).await.len(), 1); + assert_eq!(sleds_list(&client, &sleds_url).await.len(), 2); // Now start a few more sled agents. let nsleds = 3; @@ -76,7 +76,7 @@ async fn test_sleds_list(cptestctx: &ControlPlaneTestContext) { // List sleds again. let sleds_found = sleds_list(&client, &sleds_url).await; - assert_eq!(sleds_found.len(), nsleds + 1); + assert_eq!(sleds_found.len(), nsleds + 2); let sledids_found = sleds_found.iter().map(|sv| sv.identity.id).collect::>(); @@ -97,9 +97,9 @@ async fn test_physical_disk_create_list_delete( let external_client = &cptestctx.external_client; let internal_client = &cptestctx.internal_client; - // Verify that there is one sled to begin with. + // Verify that there are two sleds to begin with. let sleds_url = "/v1/system/hardware/sleds"; - assert_eq!(sleds_list(&external_client, &sleds_url).await.len(), 1); + assert_eq!(sleds_list(&external_client, &sleds_url).await.len(), 2); // The test framework may set up some disks initially. let disks_url = @@ -140,9 +140,9 @@ async fn test_physical_disk_create_list_delete( async fn test_sled_instance_list(cptestctx: &ControlPlaneTestContext) { let external_client = &cptestctx.external_client; - // Verify that there is one sled to begin with. + // Verify that there are two sleds to begin with. let sleds_url = "/v1/system/hardware/sleds"; - assert_eq!(sleds_list(&external_client, &sleds_url).await.len(), 1); + assert_eq!(sleds_list(&external_client, &sleds_url).await.len(), 2); // Verify that there are no instances. let instances_url = From 50394febbb946c978dacaa9561becdfaf4bcdfba Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 8 Mar 2024 22:06:46 -0800 Subject: [PATCH 18/34] fix more of my new test --- nexus/reconfigurator/execution/src/dns.rs | 49 ++++++++-------- nexus/test-utils/src/lib.rs | 68 ++++++++++++++++++----- 2 files changed, 76 insertions(+), 41 deletions(-) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 8d0f46e006..ab0b9b17ef 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -22,11 +22,6 @@ use nexus_types::identity::Resource; use nexus_types::internal_api::params::DnsConfigParams; use nexus_types::internal_api::params::DnsConfigZone; use nexus_types::internal_api::params::DnsRecord; -use omicron_common::address::CLICKHOUSE_KEEPER_PORT; -use omicron_common::address::CRUCIBLE_PORT; -use omicron_common::address::DNS_HTTP_PORT; -use omicron_common::address::NTP_PORT; -use omicron_common::address::OXIMETER_PORT; use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::external::InternalContext; @@ -237,36 +232,32 @@ pub fn blueprint_internal_dns_config( let mut dns_builder = DnsConfigBuilder::new(); // XXX-dap don't panic + // See oxidecomputer/omicron#4988. fn parse_port(address: &str) -> u16 { address.parse::().unwrap().port() } - // The code below assumes that all zones are using the default port numbers. - // That should be true, as those are the only ports ever used today. - // In an ideal world, the correct port would be pulled out of the - // `OmicronZoneType` variant instead. Although that information is present, - // it's irritatingly non-trivial to do right now because SocketAddrs are - // represented as strings, so we'd need to parse all of them and handle all - // the errors, even though they should never happen. 
- // See oxidecomputer/omicron#4988. for (_, omicron_zone) in blueprint.all_omicron_zones() { if !blueprint.zones_in_service.contains(&omicron_zone.id) { continue; } let (service_name, port) = match &omicron_zone.zone_type { - OmicronZoneType::BoundaryNtp { .. } => { - (ServiceName::BoundaryNtp, NTP_PORT) + OmicronZoneType::BoundaryNtp { address, .. } => { + let port = parse_port(&address); + (ServiceName::BoundaryNtp, port) } - OmicronZoneType::InternalNtp { .. } => { - (ServiceName::InternalNtp, NTP_PORT) + OmicronZoneType::InternalNtp { address, .. } => { + let port = parse_port(&address); + (ServiceName::InternalNtp, port) } OmicronZoneType::Clickhouse { address, .. } => { let port = parse_port(&address); (ServiceName::Clickhouse, port) } - OmicronZoneType::ClickhouseKeeper { .. } => { - (ServiceName::ClickhouseKeeper, CLICKHOUSE_KEEPER_PORT) + OmicronZoneType::ClickhouseKeeper { address, .. } => { + let port = parse_port(&address); + (ServiceName::ClickhouseKeeper, port) } OmicronZoneType::CockroachDb { address, .. } => { let port = parse_port(&address); @@ -276,21 +267,25 @@ pub fn blueprint_internal_dns_config( let port = parse_port(internal_address); (ServiceName::Nexus, port) } - OmicronZoneType::Crucible { .. } => { - (ServiceName::Crucible(omicron_zone.id), CRUCIBLE_PORT) + OmicronZoneType::Crucible { address, .. } => { + let port = parse_port(address); + (ServiceName::Crucible(omicron_zone.id), port) } OmicronZoneType::CruciblePantry { address } => { let port = parse_port(address); (ServiceName::CruciblePantry, port) } - OmicronZoneType::Oximeter { .. } => { - (ServiceName::Oximeter, OXIMETER_PORT) + OmicronZoneType::Oximeter { address } => { + let port = parse_port(address); + (ServiceName::Oximeter, port) } - OmicronZoneType::ExternalDns { .. } => { - (ServiceName::ExternalDns, DNS_HTTP_PORT) + OmicronZoneType::ExternalDns { http_address, .. } => { + let port = parse_port(http_address); + (ServiceName::ExternalDns, port) } - OmicronZoneType::InternalDns { .. } => { - (ServiceName::InternalDns, DNS_HTTP_PORT) + OmicronZoneType::InternalDns { http_address, .. 
} => { + let port = parse_port(http_address); + (ServiceName::InternalDns, port) } }; diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 1967e66f05..3bb2508739 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -61,6 +61,7 @@ use trust_dns_resolver::config::ResolverOpts; use trust_dns_resolver::TokioAsyncResolver; use uuid::Uuid; +use chrono::SecondsFormat; use omicron_common::api::external::Generation; use omicron_common::api::external::Vni; pub use sim::TEST_HARDWARE_THREADS; @@ -193,18 +194,6 @@ impl RackInitRequestBuilder { // Keeps track of: // - The "ServicePutRequest" (for handoff to Nexus) // - The internal DNS configuration for this service - // XXX-dap remove me - fn add_service( - &mut self, - address: SocketAddrV6, - kind: ServiceKind, - service_name: internal_dns::ServiceName, - sled_id: Uuid, - ) { - let zone_id = Uuid::new_v4(); - self.add_service_with_id(zone_id, address, kind, service_name, sled_id); - } - fn add_service_with_id( &mut self, zone_id: Uuid, @@ -957,7 +946,9 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .mac_addrs .next() .expect("ran out of MAC addresses"); - self.rack_init_builder.add_service( + let zone_id = Uuid::new_v4(); + self.rack_init_builder.add_service_with_id( + zone_id, dropshot_address, ServiceKind::ExternalDns { external_address: (*dns_address.ip()).into(), @@ -975,6 +966,36 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { internal_dns::ServiceName::ExternalDns, sled_id, ); + + let zpool_id = Uuid::new_v4(); + let pool_name = illumos_utils::zpool::ZpoolName::new_external(zpool_id) + .to_string() + .parse() + .unwrap(); + self.omicron_zones.push(OmicronZoneConfig { + id: zone_id, + underlay_address: *dropshot_address.ip(), + zone_type: OmicronZoneType::ExternalDns { + dataset: OmicronZoneDataset { pool_name }, + dns_address: dns_address.to_string(), + http_address: dropshot_address.to_string(), + nic: NetworkInterface { + id: Uuid::new_v4(), + ip: (*dns_address.ip()).into(), + kind: NetworkInterfaceKind::Service(zone_id), + mac, + name: format!("external-dns-{}", zone_id).parse().unwrap(), + primary: true, + slot: 0, + subnet: sled_agent_client::types::Ipv4Net::from( + *DNS_OPTE_IPV4_SUBNET, + ) + .into(), + vni: Vni::SERVICES_VNI, + }, + }, + }); + self.external_dns = Some(dns); } @@ -987,13 +1008,32 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { let SocketAddr::V6(address) = dns.dropshot_server.local_addr() else { panic!("Unsupported IPv4 DNS address"); }; - self.rack_init_builder.add_service( + let zone_id = Uuid::new_v4(); + self.rack_init_builder.add_service_with_id( + zone_id, address, ServiceKind::InternalDns, internal_dns::ServiceName::InternalDns, sled_id, ); + let zpool_id = Uuid::new_v4(); + let pool_name = illumos_utils::zpool::ZpoolName::new_external(zpool_id) + .to_string() + .parse() + .unwrap(); + self.omicron_zones.push(OmicronZoneConfig { + id: zone_id, + underlay_address: *address.ip(), + zone_type: OmicronZoneType::InternalDns { + dataset: OmicronZoneDataset { pool_name }, + dns_address: dns.dns_server.local_address().to_string(), + http_address: address.to_string(), + gz_address: Ipv6Addr::LOCALHOST, + gz_address_index: 0, + }, + }); + self.internal_dns = Some(dns); } From 6fa022e18544bd47f9f2f6fee7d7f8a4839edb76 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Sat, 9 Mar 2024 13:16:44 -0800 Subject: [PATCH 19/34] fix another test --- nexus/src/context.rs | 1 + nexus/test-utils/src/lib.rs | 27 
+++++++++++++------ .../tests/integration_tests/initialization.rs | 7 ++--- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/nexus/src/context.rs b/nexus/src/context.rs index cf2b9d6f17..d8be5f3ab0 100644 --- a/nexus/src/context.rs +++ b/nexus/src/context.rs @@ -195,6 +195,7 @@ impl ServerContext { // This means a new cockroachdb instance won't picked up until // Nexus restarts. let addrs = loop { + debug!(log, "Looking up cockroach addresses"); match resolver .lookup_all_socket_v6(ServiceName::Cockroach) .await diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 3bb2508739..0a03016e1d 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -482,15 +482,22 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { }); } - pub async fn start_gateway(&mut self, switch_location: SwitchLocation) { - // For now, this MGS is not configured to match up in any way with - // either the simulated sled agent or the Dendrite instances. It's - // useful for testing stuff unrelated to that. But at some point we - // will probably want the reported data to match up better. + pub async fn start_gateway( + &mut self, + switch_location: SwitchLocation, + port: Option, + ) { debug!(&self.logctx.log, "Starting Management Gateway"); - let gateway = gateway_test_utils::setup::test_setup( + let (mgs_config, sp_sim_config) = + gateway_test_utils::setup::load_test_config(); + let mgs_addr = + port.map(|port| SocketAddrV6::new(Ipv6Addr::LOCALHOST, port, 0, 0)); + let gateway = gateway_test_utils::setup::test_setup_with_config( self.test_name, gateway_messages::SpPort::One, + mgs_config, + &sp_sim_config, + mgs_addr, ) .await; self.gateway.insert(switch_location, gateway); @@ -1198,13 +1205,17 @@ async fn setup_with_config_impl( ( "start_gateway_switch0", Box::new(|builder| { - builder.start_gateway(SwitchLocation::Switch0).boxed() + builder + .start_gateway(SwitchLocation::Switch0, None) + .boxed() }), ), ( "start_gateway_switch1", Box::new(|builder| { - builder.start_gateway(SwitchLocation::Switch1).boxed() + builder + .start_gateway(SwitchLocation::Switch1, None) + .boxed() }), ), ( diff --git a/nexus/tests/integration_tests/initialization.rs b/nexus/tests/integration_tests/initialization.rs index b21cc37f4f..0ff7906056 100644 --- a/nexus/tests/integration_tests/initialization.rs +++ b/nexus/tests/integration_tests/initialization.rs @@ -89,10 +89,11 @@ async fn test_nexus_boots_before_dendrite() { let log = builder.logctx.log.new(o!("component" => "test")); // Start MGS + Sim SP. This is needed for the Dendrite client initialization - // inside of Nexus initialization + // inside of Nexus initialization. We must use MGS_PORT here because Nexus + // hardcodes it. 
     info!(&log, "Starting MGS");
-    builder.start_gateway(SwitchLocation::Switch0).await;
-    builder.start_gateway(SwitchLocation::Switch1).await;
+    builder.start_gateway(SwitchLocation::Switch0, Some(MGS_PORT)).await;
+    builder.start_gateway(SwitchLocation::Switch1, None).await;
     info!(&log, "Started MGS");
 
     let populate = true;

From d339979bded50044a33048975212a126a324adbf Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Sat, 9 Mar 2024 13:20:07 -0800
Subject: [PATCH 20/34] fix warnings

---
 nexus/test-utils/src/lib.rs                     | 1 -
 nexus/tests/integration_tests/initialization.rs | 6 +-----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs
index 0a03016e1d..887abe0c35 100644
--- a/nexus/test-utils/src/lib.rs
+++ b/nexus/test-utils/src/lib.rs
@@ -61,7 +61,6 @@ use trust_dns_resolver::config::ResolverOpts;
 use trust_dns_resolver::TokioAsyncResolver;
 use uuid::Uuid;
 
-use chrono::SecondsFormat;
 use omicron_common::api::external::Generation;
 use omicron_common::api::external::Vni;
 pub use sim::TEST_HARDWARE_THREADS;
diff --git a/nexus/tests/integration_tests/initialization.rs b/nexus/tests/integration_tests/initialization.rs
index 0ff7906056..a76aef832e 100644
--- a/nexus/tests/integration_tests/initialization.rs
+++ b/nexus/tests/integration_tests/initialization.rs
@@ -2,17 +2,13 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
-use std::collections::HashMap;
-use std::net::{Ipv6Addr, SocketAddrV6};
-
-use gateway_messages::SpPort;
-use gateway_test_utils::setup as mgs_setup;
 use nexus_config::Database;
 use nexus_config::InternalDns;
 use nexus_test_interface::NexusServer;
 use nexus_test_utils::{load_test_config, ControlPlaneTestContextBuilder};
 use omicron_common::address::MGS_PORT;
 use omicron_common::api::internal::shared::SwitchLocation;
+use std::collections::HashMap;
 use tokio::time::sleep;
 use tokio::time::timeout;
 use tokio::time::Duration;

From f00e7e0ca33832ce5aea93be296ae57a877bf214 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Sat, 9 Mar 2024 14:40:24 -0800
Subject: [PATCH 21/34] fix mismerged Cargo.lock that caused spurious chrono
 warnings
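
A merge appears to have bumped several lockfile entries that the rest of
the workspace had not moved to. Revert them (chrono 0.4.35 back to
0.4.34, clap 4.5.2 to 4.5.1, anyhow 1.0.80 to 1.0.79, plus ahash, bstr,
and regex-automata) to quiet the spurious chrono warnings.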
--- Cargo.lock | 102 ++++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5679e52a67..db48902f75 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -54,9 +54,9 @@ dependencies = [ [[package]] name = "ahash" -version = "0.8.11" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +checksum = "42cd52102d3df161c77a887b608d7a4897d7cc112886a9537b738a887a03aaff" dependencies = [ "cfg-if", "getrandom 0.2.12", @@ -160,9 +160,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.80" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" +checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" dependencies = [ "backtrace", ] @@ -241,7 +241,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed72493ac66d5804837f480ab3766c72bdfab91a65e565fc54fa9e42db0073a8" dependencies = [ "anstyle", - "bstr 1.9.1", + "bstr 1.9.0", "doc-comment", "predicates", "predicates-core", @@ -678,12 +678,12 @@ dependencies = [ [[package]] name = "bstr" -version = "1.9.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", - "regex-automata 0.4.6", + "regex-automata 0.4.5", "serde", ] @@ -909,9 +909,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.35" +version = "0.4.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eaf5903dcbc0a39312feb77df2ff4c76387d591b9fc7b04a238dcf8bb62639a" +checksum = "5bc015644b92d5890fab7489e49d21f879d5c990186827d42ec511919404f38b" dependencies = [ "android-tzdata", "iana-time-zone", @@ -988,9 +988,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.2" +version = "4.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" +checksum = "c918d541ef2913577a0f9566e9ce27cb35b6df072075769e0b26cb5a554520da" dependencies = [ "clap_builder", "clap_derive", @@ -998,9 +998,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.2" +version = "4.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" +checksum = "9f3e7391dad68afb0c2ede1bf619f579a3dc9c2ec67f089baa397123a2f3d1eb" dependencies = [ "anstream", "anstyle", @@ -1228,7 +1228,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.2", + "clap 4.5.1", "criterion-plot", "futures", "is-terminal", @@ -1905,7 +1905,7 @@ dependencies = [ "anyhow", "camino", "chrono", - "clap 4.5.2", + "clap 4.5.1", "dns-service-client", "dropshot", "expectorate", @@ -2212,7 +2212,7 @@ dependencies = [ "async-trait", "base64", "chrono", - "clap 4.5.2", + "clap 4.5.1", "colored", "dhcproto", "http 0.2.12", @@ -2620,7 +2620,7 @@ name = "gateway-cli" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.5.2", + "clap 4.5.1", "futures", "gateway-client", "gateway-messages", @@ -2802,7 +2802,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "759c97c1e17c55525b57192c06a267cda0ac5210b222d6b82189a2338fa1c13d" 
dependencies = [ "aho-corasick", - "bstr 1.9.1", + "bstr 1.9.0", "fnv", "log", "regex", @@ -3535,7 +3535,7 @@ dependencies = [ "bytes", "camino", "cancel-safe-futures", - "clap 4.5.2", + "clap 4.5.1", "ddm-admin-client", "display-error-chain", "futures", @@ -3595,7 +3595,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", - "clap 4.5.2", + "clap 4.5.1", "dropshot", "expectorate", "hyper 0.14.28", @@ -3676,7 +3676,7 @@ name = "internal-dns-cli" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.5.2", + "clap 4.5.1", "dropshot", "internal-dns", "omicron-common", @@ -3959,7 +3959,7 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d8de370f98a6cb8a4606618e53e802f93b094ddec0f96988eaec2c27e6e9ce7" dependencies = [ - "clap 4.5.2", + "clap 4.5.1", "termcolor", "threadpool", ] @@ -4015,7 +4015,7 @@ version = "0.2.4" source = "git+https://github.com/oxidecomputer/lpc55_support#96f064eaae5e95930efaab6c29fd1b2e22225dac" dependencies = [ "bitfield", - "clap 4.5.2", + "clap 4.5.1", "packed_struct", "serde", ] @@ -5043,7 +5043,7 @@ dependencies = [ "anyhow", "camino", "camino-tempfile", - "clap 4.5.2", + "clap 4.5.1", "dropshot", "expectorate", "futures", @@ -5077,7 +5077,7 @@ dependencies = [ "anyhow", "base64", "camino", - "clap 4.5.2", + "clap 4.5.1", "dropshot", "expectorate", "futures", @@ -5128,7 +5128,7 @@ dependencies = [ "camino-tempfile", "cancel-safe-futures", "chrono", - "clap 4.5.2", + "clap 4.5.1", "criterion", "crucible-agent-client", "crucible-pantry-client", @@ -5242,7 +5242,7 @@ dependencies = [ "camino", "camino-tempfile", "chrono", - "clap 4.5.2", + "clap 4.5.1", "crossterm", "crucible-agent-client", "csv", @@ -5294,7 +5294,7 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", - "clap 4.5.2", + "clap 4.5.1", "expectorate", "futures", "hex", @@ -5361,7 +5361,7 @@ dependencies = [ "cancel-safe-futures", "cfg-if", "chrono", - "clap 4.5.2", + "clap 4.5.1", "crucible-agent-client", "ddm-admin-client", "derive_more", @@ -5495,12 +5495,12 @@ dependencies = [ "bitflags 1.3.2", "bitflags 2.4.2", "bstr 0.2.17", - "bstr 1.9.1", + "bstr 1.9.0", "byteorder", "bytes", "chrono", "cipher", - "clap 4.5.2", + "clap 4.5.1", "clap_builder", "console", "const-oid", @@ -5558,7 +5558,7 @@ dependencies = [ "rand 0.8.5", "rand_chacha 0.3.1", "regex", - "regex-automata 0.4.6", + "regex-automata 0.4.5", "regex-syntax 0.8.2", "reqwest", "ring 0.17.8", @@ -5878,7 +5878,7 @@ dependencies = [ "anyhow", "camino", "chrono", - "clap 4.5.2", + "clap 4.5.1", "dropshot", "expectorate", "futures", @@ -5921,7 +5921,7 @@ dependencies = [ "bytes", "camino", "chrono", - "clap 4.5.2", + "clap 4.5.1", "dropshot", "expectorate", "futures", @@ -5991,7 +5991,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.5.2", + "clap 4.5.1", "dropshot", "nexus-client", "omicron-common", @@ -6013,7 +6013,7 @@ dependencies = [ "anyhow", "camino", "chrono", - "clap 4.5.2", + "clap 4.5.1", "omicron-workspace-hack", "uuid", ] @@ -6773,7 +6773,7 @@ dependencies = [ "anyhow", "atty", "base64", - "clap 4.5.2", + "clap 4.5.1", "dropshot", "futures", "hyper 0.14.28", @@ -7068,7 +7068,7 @@ dependencies = [ "anyhow", "camino", "camino-tempfile", - "clap 4.5.2", + "clap 4.5.1", "dropshot", "expectorate", "humantime", @@ -7176,7 +7176,7 @@ checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.6", + "regex-automata 0.4.5", "regex-syntax 0.8.2", ] @@ -7188,9 +7188,9 @@ checksum = 
"6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" dependencies = [ "aho-corasick", "memchr", @@ -8580,7 +8580,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", - "clap 4.5.2", + "clap 4.5.1", "dropshot", "futures", "gateway-messages", @@ -9766,7 +9766,7 @@ dependencies = [ "assert_cmd", "camino", "chrono", - "clap 4.5.2", + "clap 4.5.1", "console", "datatest-stable", "fs-err", @@ -10012,7 +10012,7 @@ dependencies = [ "camino", "camino-tempfile", "chrono", - "clap 4.5.2", + "clap 4.5.1", "debug-ignore", "display-error-chain", "dropshot", @@ -10043,7 +10043,7 @@ dependencies = [ "camino", "camino-tempfile", "cancel-safe-futures", - "clap 4.5.2", + "clap 4.5.1", "debug-ignore", "derive-where", "either", @@ -10464,7 +10464,7 @@ dependencies = [ "buf-list", "camino", "ciborium", - "clap 4.5.2", + "clap 4.5.1", "crossterm", "futures", "humantime", @@ -10525,7 +10525,7 @@ dependencies = [ "bytes", "camino", "ciborium", - "clap 4.5.2", + "clap 4.5.1", "crossterm", "omicron-workspace-hack", "reedline", @@ -10550,7 +10550,7 @@ dependencies = [ "bytes", "camino", "camino-tempfile", - "clap 4.5.2", + "clap 4.5.1", "ddm-admin-client", "debug-ignore", "display-error-chain", @@ -10876,7 +10876,7 @@ dependencies = [ "camino", "cargo_metadata", "cargo_toml", - "clap 4.5.2", + "clap 4.5.1", ] [[package]] @@ -11008,7 +11008,7 @@ name = "zone-network-setup" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.5.2", + "clap 4.5.1", "dropshot", "illumos-utils", "omicron-common", From a259938a498f2948945f3e8905d4349df84a25ed Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Sat, 9 Mar 2024 15:03:53 -0800 Subject: [PATCH 22/34] test runner could look more realistic --- Cargo.lock | 2 + dev-tools/omdb/tests/env.out | 3 + dev-tools/omdb/tests/successes.out | 13 +- dev-tools/omicron-dev/src/bin/omicron-dev.rs | 2 +- .../tests/output/collector_basic.txt | 4 +- .../output/collector_sled_agent_errors.txt | 2 +- .../app/background/inventory_collection.rs | 4 +- nexus/src/lib.rs | 22 +- nexus/test-interface/src/lib.rs | 1 + nexus/test-utils/Cargo.toml | 2 + nexus/test-utils/src/lib.rs | 376 +++++++++++++++--- .../tests/integration_tests/initialization.rs | 29 +- nexus/tests/integration_tests/sleds.rs | 14 +- sled-agent/src/sim/sled_agent.rs | 2 +- 14 files changed, 378 insertions(+), 98 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0cfc7b4500..db48902f75 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4648,6 +4648,7 @@ dependencies = [ "headers", "http 0.2.12", "hyper 0.14.28", + "illumos-utils", "internal-dns", "nexus-config", "nexus-db-queries", @@ -4664,6 +4665,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", + "sled-agent-client", "slog", "tokio", "tokio-util", diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 3e6e89d508..ef8cf1631e 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -3,6 +3,7 @@ termination: Exited(0) --------------------------------------------- stdout: SERIAL IP ROLE ID +sim-039be560 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED sim-b6d65341 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED --------------------------------------------- stderr: @@ -268,6 
+269,7 @@ termination: Exited(0) --------------------------------------------- stdout: SERIAL IP ROLE ID +sim-039be560 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED sim-b6d65341 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED --------------------------------------------- stderr: @@ -281,6 +283,7 @@ termination: Exited(0) --------------------------------------------- stdout: SERIAL IP ROLE ID +sim-039be560 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED sim-b6d65341 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED --------------------------------------------- stderr: diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index fe590acf55..2da6e4dceb 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -77,12 +77,10 @@ termination: Exited(0) stdout: SERVICE INSTANCE_ID ADDR SLED_SERIAL CruciblePantry REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 -Dendrite REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 -Dendrite REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 ExternalDns REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 InternalDns REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 Nexus REDACTED_UUID_REDACTED_UUID_REDACTED [::ffff:127.0.0.1]:REDACTED_PORT sim-b6d65341 -Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 +Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-039be560 Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT sim-b6d65341 --------------------------------------------- stderr: @@ -93,17 +91,19 @@ EXECUTING COMMAND: omdb ["db", "services", "list-by-sled"] termination: Exited(0) --------------------------------------------- stdout: +sled: sim-039be560 (id REDACTED_UUID_REDACTED_UUID_REDACTED) + + SERVICE INSTANCE_ID ADDR + Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT + sled: sim-b6d65341 (id REDACTED_UUID_REDACTED_UUID_REDACTED) SERVICE INSTANCE_ID ADDR CruciblePantry REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT - Dendrite REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT - Dendrite REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT ExternalDns REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT InternalDns REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT Nexus REDACTED_UUID_REDACTED_UUID_REDACTED [::ffff:127.0.0.1]:REDACTED_PORT Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT - Mgd REDACTED_UUID_REDACTED_UUID_REDACTED [::1]:REDACTED_PORT --------------------------------------------- stderr: @@ -115,6 +115,7 @@ termination: Exited(0) --------------------------------------------- stdout: SERIAL IP ROLE ID +sim-039be560 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED sim-b6d65341 [::1]:REDACTED_PORT scrimlet REDACTED_UUID_REDACTED_UUID_REDACTED --------------------------------------------- stderr: diff --git a/dev-tools/omicron-dev/src/bin/omicron-dev.rs b/dev-tools/omicron-dev/src/bin/omicron-dev.rs index 5e0c6486d6..94c56b50f2 100644 --- a/dev-tools/omicron-dev/src/bin/omicron-dev.rs +++ b/dev-tools/omicron-dev/src/bin/omicron-dev.rs @@ -545,7 +545,7 @@ async fn cmd_run_all(args: &RunAllArgs) -> Result<(), anyhow::Error> { ); println!( "omicron-dev: management gateway: http://{}", - cptestctx.gateway.client.bind_address, + cptestctx.gateway.values().next().unwrap().client.bind_address, ); 
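+    // (`gateway` is now a map keyed by switch location; the banner above
+    // just picks an arbitrary entry.)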
println!("omicron-dev: silo name: {}", cptestctx.silo_name,); println!( diff --git a/nexus/inventory/tests/output/collector_basic.txt b/nexus/inventory/tests/output/collector_basic.txt index 4a05f09e1c..0fc1c552ab 100644 --- a/nexus/inventory/tests/output/collector_basic.txt +++ b/nexus/inventory/tests/output/collector_basic.txt @@ -71,12 +71,12 @@ rot pages found: CfpaScratch baseboard part "i86pc" serial "SimGimlet01": data_base64 "Z2ltbGV0LWNmcGEtc2NyYXRjaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=" sled agents found: - sled 03265caf-da7d-46c7-b1c2-39fa90ce5c65 (Gimlet) + sled 03265caf-da7d-46c7-b1c2-39fa90ce5c65 (Scrimlet) baseboard Some(BaseboardId { part_number: "sim-gimlet", serial_number: "sim-03265caf-da7d-46c7-b1c2-39fa90ce5c65" }) zone generation: Generation(3) zones found: zone 8b88a56f-3eb6-4d80-ba42-75d867bc427d type oximeter - sled 9cb9b78f-5614-440c-b66d-e8e81fab69b0 (Gimlet) + sled 9cb9b78f-5614-440c-b66d-e8e81fab69b0 (Scrimlet) baseboard Some(BaseboardId { part_number: "sim-gimlet", serial_number: "sim-9cb9b78f-5614-440c-b66d-e8e81fab69b0" }) zone generation: Generation(3) zones found: diff --git a/nexus/inventory/tests/output/collector_sled_agent_errors.txt b/nexus/inventory/tests/output/collector_sled_agent_errors.txt index aaa31fd1bb..7b9bbce84e 100644 --- a/nexus/inventory/tests/output/collector_sled_agent_errors.txt +++ b/nexus/inventory/tests/output/collector_sled_agent_errors.txt @@ -70,7 +70,7 @@ rot pages found: CfpaScratch baseboard part "i86pc" serial "SimGimlet01": data_base64 "Z2ltbGV0LWNmcGEtc2NyYXRjaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=" sled agents found: - sled 9cb9b78f-5614-440c-b66d-e8e81fab69b0 (Gimlet) + sled 9cb9b78f-5614-440c-b66d-e8e81fab69b0 (Scrimlet) baseboard Some(BaseboardId { part_number: "sim-gimlet", serial_number: "sim-9cb9b78f-5614-440c-b66d-e8e81fab69b0" }) zone generation: Generation(3) zones found: diff --git a/nexus/src/app/background/inventory_collection.rs b/nexus/src/app/background/inventory_collection.rs index 27f08ec738..0666c136fc 100644 --- a/nexus/src/app/background/inventory_collection.rs +++ b/nexus/src/app/background/inventory_collection.rs @@ -314,9 +314,9 @@ mod test { page_size: NonZeroU32::new(3).unwrap(), }; - // There will be one sled agent set up as part of the test context. 
+ // There will be two sled agents set up as part of the test context. let found_urls = db_enum.list_sled_agents().await.unwrap(); - assert_eq!(found_urls.len(), 1); + assert_eq!(found_urls.len(), 2); // Insert some sleds. let rack_id = Uuid::new_v4(); diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index 771a78f0b1..c0fba31afb 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -27,6 +27,7 @@ use dropshot::ConfigDropshot; use external_api::http_entrypoints::external_api; use internal_api::http_entrypoints::internal_api; use nexus_config::NexusConfig; +use nexus_types::external_api::views::SledProvisionPolicy; use nexus_types::internal_api::params::ServiceKind; use omicron_common::address::IpRange; use omicron_common::api::internal::shared::{ @@ -237,6 +238,7 @@ impl nexus_test_interface::NexusServer for Server { external_dns_zone_name: &str, recovery_silo: nexus_types::internal_api::params::RecoverySiloConfig, certs: Vec, + disable_sled_id: Uuid, ) -> Self { // Perform the "handoff from RSS". // @@ -302,7 +304,25 @@ impl nexus_test_interface::NexusServer for Server { .expect("Could not initialize rack"); // Start the Nexus external API. - Server::start(internal_server).await.unwrap() + let rv = Server::start(internal_server).await.unwrap(); + + // Historically, tests have assumed that there's only one provisionable + // sled, and that's convenient for a lot of purposes. Mark our second + // sled non-provisionable. + let nexus = &rv.apictx().nexus; + nexus + .sled_set_provision_policy( + &opctx, + &nexus_db_queries::db::lookup::LookupPath::new( + &opctx, + nexus.datastore(), + ) + .sled_id(disable_sled_id), + SledProvisionPolicy::NonProvisionable, + ) + .await + .unwrap(); + rv } async fn get_http_server_external_address(&self) -> SocketAddr { diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs index 0f53ac6445..10bc9e63f0 100644 --- a/nexus/test-interface/src/lib.rs +++ b/nexus/test-interface/src/lib.rs @@ -56,6 +56,7 @@ pub trait NexusServer: Send + Sync + 'static { external_dns_zone_name: &str, recovery_silo: nexus_types::internal_api::params::RecoverySiloConfig, tls_certificates: Vec, + disable_sled_id: Uuid, ) -> Self; async fn get_http_server_external_address(&self) -> SocketAddr; diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index e612547fa8..861527108b 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -20,6 +20,7 @@ gateway-test-utils.workspace = true headers.workspace = true http.workspace = true hyper.workspace = true +illumos-utils.workspace = true internal-dns.workspace = true nexus-config.workspace = true nexus-db-queries.workspace = true @@ -35,6 +36,7 @@ oximeter-producer.workspace = true serde.workspace = true serde_json.workspace = true serde_urlencoded.workspace = true +sled-agent-client.workspace = true slog.workspace = true tokio.workspace = true tokio-util.workspace = true diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 4ef77b3352..9681d9ff97 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -33,12 +33,20 @@ use nexus_types::internal_api::params::RecoverySiloConfig; use nexus_types::internal_api::params::ServiceKind; use nexus_types::internal_api::params::ServiceNic; use nexus_types::internal_api::params::ServicePutRequest; +use nexus_types::inventory::OmicronZoneConfig; +use nexus_types::inventory::OmicronZoneDataset; +use nexus_types::inventory::OmicronZoneType; +use nexus_types::inventory::OmicronZonesConfig; use 
omicron_common::address::DNS_OPTE_IPV4_SUBNET; use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET; +use omicron_common::api::external::Generation; use omicron_common::api::external::MacAddr; +use omicron_common::api::external::Vni; use omicron_common::api::external::{IdentityMetadata, Name}; use omicron_common::api::internal::nexus::ProducerEndpoint; use omicron_common::api::internal::nexus::ProducerKind; +use omicron_common::api::internal::shared::NetworkInterface; +use omicron_common::api::internal::shared::NetworkInterfaceKind; use omicron_common::api::internal::shared::SwitchLocation; use omicron_sled_agent::sim; use omicron_test_utils::dev; @@ -65,6 +73,7 @@ pub mod http_testing; pub mod resource_helpers; pub const SLED_AGENT_UUID: &str = "b6d65341-167c-41df-9b5c-41cded99c229"; +pub const SLED_AGENT2_UUID: &str = "039be560-54cc-49e3-88df-1a29dadbf913"; pub const RACK_UUID: &str = "c19a698f-c6f9-4a17-ae30-20d711b8f7dc"; pub const SWITCH_UUID: &str = "dae4e1f1-410e-4314-bff1-fec0504be07e"; pub const OXIMETER_UUID: &str = "39e6175b-4df2-4730-b11d-cbc1e60a2e78"; @@ -88,9 +97,11 @@ pub struct ControlPlaneTestContext { pub logctx: LogContext, pub sled_agent_storage: camino_tempfile::Utf8TempDir, pub sled_agent: sim::Server, + pub sled_agent2_storage: camino_tempfile::Utf8TempDir, + pub sled_agent2: sim::Server, pub oximeter: Oximeter, pub producer: ProducerServer, - pub gateway: GatewayTestContext, + pub gateway: HashMap, pub dendrite: HashMap, pub mgd: HashMap, pub external_dns_zone_name: String, @@ -110,9 +121,12 @@ impl ControlPlaneTestContext { self.database.cleanup().await.unwrap(); self.clickhouse.cleanup().await.unwrap(); self.sled_agent.http_server.close().await.unwrap(); + self.sled_agent2.http_server.close().await.unwrap(); self.oximeter.close().await.unwrap(); self.producer.close().await.unwrap(); - self.gateway.teardown().await; + for (_, gateway) in self.gateway { + gateway.teardown().await; + } for (_, mut dendrite) in self.dendrite { dendrite.cleanup().await.unwrap(); } @@ -179,18 +193,18 @@ impl RackInitRequestBuilder { // Keeps track of: // - The "ServicePutRequest" (for handoff to Nexus) // - The internal DNS configuration for this service - fn add_service( + fn add_service_with_id( &mut self, + zone_id: Uuid, address: SocketAddrV6, kind: ServiceKind, service_name: internal_dns::ServiceName, sled_id: Uuid, ) { - let zone_id = Uuid::new_v4(); self.services.push(ServicePutRequest { address, kind, - service_id: Uuid::new_v4(), + service_id: zone_id, sled_id, zone_id: Some(zone_id), }); @@ -203,6 +217,22 @@ impl RackInitRequestBuilder { .expect("Failed to set up DNS for {kind}"); } + fn add_service_without_dns( + &mut self, + zone_id: Uuid, + address: SocketAddrV6, + kind: ServiceKind, + sled_id: Uuid, + ) { + self.services.push(ServicePutRequest { + address, + kind, + service_id: zone_id, + sled_id, + zone_id: Some(zone_id), + }); + } + // Keeps track of: // - The "DatasetPutRequest" (for handoff to Nexus) // - The internal DNS configuration for this service @@ -245,9 +275,11 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { pub clickhouse: Option, pub sled_agent_storage: Option, pub sled_agent: Option, + pub sled_agent2_storage: Option, + pub sled_agent2: Option, pub oximeter: Option, pub producer: Option, - pub gateway: Option, + pub gateway: HashMap, pub dendrite: HashMap, pub mgd: HashMap, @@ -260,6 +292,8 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { pub external_dns: Option, pub internal_dns: Option, dns_config: Option, + 
omicron_zones: Vec, + omicron_zones2: Vec, pub silo_name: Option, pub user_name: Option, @@ -289,9 +323,11 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { clickhouse: None, sled_agent_storage: None, sled_agent: None, + sled_agent2_storage: None, + sled_agent2: None, oximeter: None, producer: None, - gateway: None, + gateway: HashMap::new(), dendrite: HashMap::new(), mgd: HashMap::new(), nexus_internal: None, @@ -300,6 +336,8 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { external_dns: None, internal_dns: None, dns_config: None, + omicron_zones: Vec::new(), + omicron_zones2: Vec::new(), silo_name: None, user_name: None, } @@ -380,6 +418,18 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { DatasetKind::Cockroach, internal_dns::ServiceName::Cockroach, ); + let pool_name = illumos_utils::zpool::ZpoolName::new_external(zpool_id) + .to_string() + .parse() + .unwrap(); + self.omicron_zones.push(OmicronZoneConfig { + id: dataset_id, + underlay_address: *address.ip(), + zone_type: OmicronZoneType::CockroachDb { + address: address.to_string(), + dataset: OmicronZoneDataset { pool_name }, + }, + }); self.database = Some(database); } @@ -416,37 +466,40 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .as_mut() .expect("Tests expect to set a port of Clickhouse") .set_port(port); + + let pool_name = illumos_utils::zpool::ZpoolName::new_external(zpool_id) + .to_string() + .parse() + .unwrap(); + self.omicron_zones.push(OmicronZoneConfig { + id: dataset_id, + underlay_address: *address.ip(), + zone_type: OmicronZoneType::Clickhouse { + address: address.to_string(), + dataset: OmicronZoneDataset { pool_name }, + }, + }); } - pub async fn start_gateway(&mut self) { - // For now, this MGS is not configured to match up in any way with - // either the simulated sled agent or the Dendrite instances. It's - // useful for testing stuff unrelated to that. But at some point we - // will probably want the reported data to match up better. 
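+    // Start a simulated MGS (and SP) for the given switch. `port` lets a
+    // caller pin the listen port when a component expects a well-known one
+    // (e.g. Nexus, which hardcodes MGS_PORT).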
+ pub async fn start_gateway( + &mut self, + switch_location: SwitchLocation, + port: Option, + ) { debug!(&self.logctx.log, "Starting Management Gateway"); - let gateway = gateway_test_utils::setup::test_setup( + let (mgs_config, sp_sim_config) = + gateway_test_utils::setup::load_test_config(); + let mgs_addr = + port.map(|port| SocketAddrV6::new(Ipv6Addr::LOCALHOST, port, 0, 0)); + let gateway = gateway_test_utils::setup::test_setup_with_config( self.test_name, gateway_messages::SpPort::One, + mgs_config, + &sp_sim_config, + mgs_addr, ) .await; - let fake_mgs_zone_id = Uuid::new_v4(); - let SocketAddr::V6(v6addr) = gateway.client.bind_address else { - panic!("MGS unexpectedly listening on IPv4?"); - }; - let zone = self - .rack_init_builder - .internal_dns_config - .host_zone(fake_mgs_zone_id, *v6addr.ip()) - .expect("Failed to add DNS for MGS zone"); - self.rack_init_builder - .internal_dns_config - .service_backend_zone( - internal_dns::ServiceName::ManagementGatewayService, - &zone, - v6addr.port(), - ) - .expect("Failed to add DNS for MGS service"); - self.gateway = Some(gateway); + self.gateway.insert(switch_location, gateway); } pub async fn start_dendrite(&mut self, switch_location: SwitchLocation) { @@ -466,11 +519,16 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { let config = DpdConfig { address: std::net::SocketAddr::V6(address) }; self.config.pkg.dendrite.insert(switch_location, config); - let sled_id = Uuid::parse_str(SLED_AGENT_UUID).unwrap(); - self.rack_init_builder.add_service( + let sled_id = Uuid::parse_str(match switch_location { + SwitchLocation::Switch0 => SLED_AGENT_UUID, + SwitchLocation::Switch1 => SLED_AGENT2_UUID, + }) + .unwrap(); + + self.rack_init_builder.add_service_without_dns( + sled_id, address, ServiceKind::Dendrite, - internal_dns::ServiceName::Dendrite, sled_id, ); } @@ -490,15 +548,46 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { let config = MgdConfig { address: std::net::SocketAddr::V6(address) }; self.config.pkg.mgd.insert(switch_location, config); - let sled_id = Uuid::parse_str(SLED_AGENT_UUID).unwrap(); - self.rack_init_builder.add_service( + let sled_id = Uuid::parse_str(match switch_location { + SwitchLocation::Switch0 => SLED_AGENT_UUID, + SwitchLocation::Switch1 => SLED_AGENT2_UUID, + }) + .unwrap(); + + self.rack_init_builder.add_service_without_dns( + sled_id, address, ServiceKind::Mgd, - internal_dns::ServiceName::Mgd, sled_id, ); } + pub async fn record_switch_dns(&mut self) { + let log = &self.logctx.log; + debug!(log, "Recording DNS for the switch zones"); + for (sled_id, switch_location) in &[ + (SLED_AGENT_UUID, SwitchLocation::Switch0), + (SLED_AGENT2_UUID, SwitchLocation::Switch1), + ] { + let id = sled_id.parse().unwrap(); + self.rack_init_builder + .internal_dns_config + .host_zone_switch( + id, + Ipv6Addr::LOCALHOST, + self.dendrite.get(switch_location).unwrap().port, + self.gateway + .get(switch_location) + .unwrap() + .client + .bind_address + .port(), + self.mgd.get(switch_location).unwrap().port, + ) + .unwrap(); + } + } + pub async fn start_oximeter(&mut self) { let log = &self.logctx.log; debug!(log, "Starting Oximeter"); @@ -585,16 +674,14 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .mac_addrs .next() .expect("ran out of MAC addresses"); - self.rack_init_builder.add_service( + let external_address = + self.config.deployment.dropshot_external.dropshot.bind_address.ip(); + let nexus_id = self.config.deployment.id; + 
self.rack_init_builder.add_service_with_id( + nexus_id, address, ServiceKind::Nexus { - external_address: self - .config - .deployment - .dropshot_external - .dropshot - .bind_address - .ip(), + external_address, nic: ServiceNic { id: Uuid::new_v4(), name: "nexus".parse().unwrap(), @@ -610,6 +697,32 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { sled_id, ); + self.omicron_zones.push(OmicronZoneConfig { + id: nexus_id, + underlay_address: *address.ip(), + zone_type: OmicronZoneType::Nexus { + external_dns_servers: self + .config + .deployment + .external_dns_servers + .clone(), + external_ip: external_address, + external_tls: self.config.deployment.dropshot_external.tls, + internal_address: address.to_string(), + nic: NetworkInterface { + id: Uuid::new_v4(), + ip: external_address, + kind: NetworkInterfaceKind::Service { id: nexus_id }, + mac, + name: format!("nexus-{}", nexus_id).parse().unwrap(), + primary: true, + slot: 0, + subnet: (*NEXUS_OPTE_IPV4_SUBNET).into(), + vni: Vni::SERVICES_VNI, + }, + }, + }); + self.nexus_internal = Some(nexus_internal); self.nexus_internal_addr = Some(nexus_internal_addr); } @@ -701,6 +814,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { &external_dns_zone_name, recovery_silo, tls_certificates, + SLED_AGENT2_UUID.parse().unwrap(), ) .await; @@ -729,12 +843,22 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { self.server = Some(server); } - pub async fn start_sled(&mut self, sim_mode: sim::SimMode) { + pub async fn start_sled( + &mut self, + switch_location: SwitchLocation, + sim_mode: sim::SimMode, + ) { let nexus_address = self.nexus_internal_addr.expect("Must launch Nexus first"); // Set up a single sled agent. - let sa_id = Uuid::parse_str(SLED_AGENT_UUID).unwrap(); + let sa_id: Uuid = if switch_location == SwitchLocation::Switch0 { + SLED_AGENT_UUID + } else { + SLED_AGENT2_UUID + } + .parse() + .unwrap(); let tempdir = camino_tempfile::tempdir().unwrap(); let sled_agent = start_sled_agent( self.logctx.log.new(o!( @@ -749,8 +873,40 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .await .expect("Failed to start sled agent"); - self.sled_agent = Some(sled_agent); - self.sled_agent_storage = Some(tempdir); + if switch_location == SwitchLocation::Switch0 { + self.sled_agent = Some(sled_agent); + self.sled_agent_storage = Some(tempdir); + } else { + self.sled_agent2 = Some(sled_agent); + self.sled_agent2_storage = Some(tempdir); + } + } + + pub async fn configure_sled_agent( + &mut self, + switch_location: SwitchLocation, + ) { + let (field, zones) = if switch_location == SwitchLocation::Switch0 { + (&self.sled_agent, &self.omicron_zones) + } else { + (&self.sled_agent2, &self.omicron_zones2) + }; + + // Tell our Sled Agent to report the zones that we configured. + let Some(sled_agent) = field else { + panic!("expected sled agent has not been created"); + }; + let client = sled_agent_client::Client::new( + &format!("http://{}", sled_agent.http_server.local_addr()), + self.logctx.log.clone(), + ); + client + .omicron_zones_put(&OmicronZonesConfig { + zones: zones.clone(), + generation: Generation::new().next(), + }) + .await + .expect("Failed to configure sled agent with our zones"); } // Set up the Crucible Pantry on an existing Sled Agent. 
@@ -768,12 +924,21 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { }; let sled_id = Uuid::parse_str(SLED_AGENT_UUID).unwrap(); - self.rack_init_builder.add_service( + let zone_id = Uuid::new_v4(); + self.rack_init_builder.add_service_with_id( + zone_id, address, ServiceKind::CruciblePantry, internal_dns::ServiceName::CruciblePantry, sled_id, ); + self.omicron_zones.push(OmicronZoneConfig { + id: zone_id, + underlay_address: *address.ip(), + zone_type: OmicronZoneType::CruciblePantry { + address: address.to_string(), + }, + }); } // Set up an external DNS server. @@ -796,7 +961,9 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .mac_addrs .next() .expect("ran out of MAC addresses"); - self.rack_init_builder.add_service( + let zone_id = Uuid::new_v4(); + self.rack_init_builder.add_service_with_id( + zone_id, dropshot_address, ServiceKind::ExternalDns { external_address: (*dns_address.ip()).into(), @@ -814,6 +981,33 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { internal_dns::ServiceName::ExternalDns, sled_id, ); + + let zpool_id = Uuid::new_v4(); + let pool_name = illumos_utils::zpool::ZpoolName::new_external(zpool_id) + .to_string() + .parse() + .unwrap(); + self.omicron_zones.push(OmicronZoneConfig { + id: zone_id, + underlay_address: *dropshot_address.ip(), + zone_type: OmicronZoneType::ExternalDns { + dataset: OmicronZoneDataset { pool_name }, + dns_address: dns_address.to_string(), + http_address: dropshot_address.to_string(), + nic: NetworkInterface { + id: Uuid::new_v4(), + ip: (*dns_address.ip()).into(), + kind: NetworkInterfaceKind::Service { id: zone_id }, + mac, + name: format!("external-dns-{}", zone_id).parse().unwrap(), + primary: true, + slot: 0, + subnet: (*DNS_OPTE_IPV4_SUBNET).into(), + vni: Vni::SERVICES_VNI, + }, + }, + }); + self.external_dns = Some(dns); } @@ -826,13 +1020,32 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { let SocketAddr::V6(address) = dns.dropshot_server.local_addr() else { panic!("Unsupported IPv4 DNS address"); }; - self.rack_init_builder.add_service( + let zone_id = Uuid::new_v4(); + self.rack_init_builder.add_service_with_id( + zone_id, address, ServiceKind::InternalDns, internal_dns::ServiceName::InternalDns, sled_id, ); + let zpool_id = Uuid::new_v4(); + let pool_name = illumos_utils::zpool::ZpoolName::new_external(zpool_id) + .to_string() + .parse() + .unwrap(); + self.omicron_zones.push(OmicronZoneConfig { + id: zone_id, + underlay_address: *address.ip(), + zone_type: OmicronZoneType::InternalDns { + dataset: OmicronZoneDataset { pool_name }, + dns_address: dns.dns_server.local_address().to_string(), + http_address: address.to_string(), + gz_address: Ipv6Addr::LOCALHOST, + gz_address_index: 0, + }, + }); + self.internal_dns = Some(dns); } @@ -846,10 +1059,12 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { clickhouse: self.clickhouse.unwrap(), sled_agent_storage: self.sled_agent_storage.unwrap(), sled_agent: self.sled_agent.unwrap(), + sled_agent2_storage: self.sled_agent2_storage.unwrap(), + sled_agent2: self.sled_agent2.unwrap(), oximeter: self.oximeter.unwrap(), producer: self.producer.unwrap(), logctx: self.logctx, - gateway: self.gateway.unwrap(), + gateway: self.gateway, dendrite: self.dendrite, mgd: self.mgd, external_dns_zone_name: self.external_dns_zone_name.unwrap(), @@ -873,13 +1088,16 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { if let Some(sled_agent) = self.sled_agent { 
sled_agent.http_server.close().await.unwrap(); } + if let Some(sled_agent2) = self.sled_agent2 { + sled_agent2.http_server.close().await.unwrap(); + } if let Some(oximeter) = self.oximeter { oximeter.close().await.unwrap(); } if let Some(producer) = self.producer { producer.close().await.unwrap(); } - if let Some(gateway) = self.gateway { + for (_, gateway) in self.gateway { gateway.teardown().await; } for (_, mut dendrite) in self.dendrite { @@ -990,8 +1208,20 @@ async fn setup_with_config_impl( Box::new(|builder| builder.start_clickhouse().boxed()), ), ( - "start_gateway", - Box::new(|builder| builder.start_gateway().boxed()), + "start_gateway_switch0", + Box::new(|builder| { + builder + .start_gateway(SwitchLocation::Switch0, None) + .boxed() + }), + ), + ( + "start_gateway_switch1", + Box::new(|builder| { + builder + .start_gateway(SwitchLocation::Switch1, None) + .boxed() + }), ), ( "start_dendrite_switch0", @@ -1017,6 +1247,10 @@ async fn setup_with_config_impl( builder.start_mgd(SwitchLocation::Switch1).boxed() }), ), + ( + "record_switch_dns", + Box::new(|builder| builder.record_switch_dns().boxed()), + ), ( "start_internal_dns", Box::new(|builder| builder.start_internal_dns().boxed()), @@ -1030,9 +1264,19 @@ async fn setup_with_config_impl( Box::new(|builder| builder.start_nexus_internal().boxed()), ), ( - "start_sled", + "start_sled1", Box::new(move |builder| { - builder.start_sled(sim_mode).boxed() + builder + .start_sled(SwitchLocation::Switch0, sim_mode) + .boxed() + }), + ), + ( + "start_sled2", + Box::new(move |builder| { + builder + .start_sled(SwitchLocation::Switch1, sim_mode) + .boxed() }), ), ( @@ -1043,6 +1287,22 @@ async fn setup_with_config_impl( "populate_internal_dns", Box::new(|builder| builder.populate_internal_dns().boxed()), ), + ( + "configure_sled_agent1", + Box::new(|builder| { + builder + .configure_sled_agent(SwitchLocation::Switch0) + .boxed() + }), + ), + ( + "configure_sled_agent2", + Box::new(|builder| { + builder + .configure_sled_agent(SwitchLocation::Switch1) + .boxed() + }), + ), ( "start_nexus_external", Box::new(|builder| { diff --git a/nexus/tests/integration_tests/initialization.rs b/nexus/tests/integration_tests/initialization.rs index b77a121080..a76aef832e 100644 --- a/nexus/tests/integration_tests/initialization.rs +++ b/nexus/tests/integration_tests/initialization.rs @@ -2,17 +2,13 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use std::collections::HashMap; -use std::net::{Ipv6Addr, SocketAddrV6}; - -use gateway_messages::SpPort; -use gateway_test_utils::setup as mgs_setup; use nexus_config::Database; use nexus_config::InternalDns; use nexus_test_interface::NexusServer; use nexus_test_utils::{load_test_config, ControlPlaneTestContextBuilder}; use omicron_common::address::MGS_PORT; use omicron_common::api::internal::shared::SwitchLocation; +use std::collections::HashMap; use tokio::time::sleep; use tokio::time::timeout; use tokio::time::Duration; @@ -78,19 +74,6 @@ async fn test_nexus_boots_before_cockroach() { #[tokio::test] async fn test_nexus_boots_before_dendrite() { - // Start MGS + Sim SP. 
This is needed for the Dendrite client initialization - // inside of Nexus initialization - let (mgs_config, sp_sim_config) = mgs_setup::load_test_config(); - let mgs_addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, MGS_PORT, 0, 0); - let mgs = mgs_setup::test_setup_with_config( - "test_nexus_boots_before_dendrite", - SpPort::One, - mgs_config, - &sp_sim_config, - Some(mgs_addr), - ) - .await; - let mut config = load_test_config(); let mut builder = @@ -101,6 +84,14 @@ async fn test_nexus_boots_before_dendrite() { let log = builder.logctx.log.new(o!("component" => "test")); + // Start MGS + Sim SP. This is needed for the Dendrite client initialization + // inside of Nexus initialization. We must use MGS_PORT here because Nexus + // hardcodes it. + info!(&log, "Starting MGS"); + builder.start_gateway(SwitchLocation::Switch0, Some(MGS_PORT)).await; + builder.start_gateway(SwitchLocation::Switch1, None).await; + info!(&log, "Started MGS"); + let populate = true; builder.start_crdb(populate).await; builder.start_internal_dns().await; @@ -150,6 +141,7 @@ async fn test_nexus_boots_before_dendrite() { info!(log, "Started mgd"); info!(log, "Populating internal DNS records"); + builder.record_switch_dns().await; builder.populate_internal_dns().await; info!(log, "Populated internal DNS records"); @@ -157,7 +149,6 @@ async fn test_nexus_boots_before_dendrite() { nexus_handle.await.expect("Test: Task starting Nexus has failed"); builder.teardown().await; - mgs.teardown().await; } // Helper to ensure we perform the same setup for the positive and negative test diff --git a/nexus/tests/integration_tests/sleds.rs b/nexus/tests/integration_tests/sleds.rs index b551cf51b5..743a76be17 100644 --- a/nexus/tests/integration_tests/sleds.rs +++ b/nexus/tests/integration_tests/sleds.rs @@ -48,9 +48,9 @@ async fn sled_instance_list( async fn test_sleds_list(cptestctx: &ControlPlaneTestContext) { let client = &cptestctx.external_client; - // Verify that there is one sled to begin with. + // Verify that there are two sleds to begin with. let sleds_url = "/v1/system/hardware/sleds"; - assert_eq!(sleds_list(&client, &sleds_url).await.len(), 1); + assert_eq!(sleds_list(&client, &sleds_url).await.len(), 2); // Now start a few more sled agents. let nsleds = 3; @@ -76,7 +76,7 @@ async fn test_sleds_list(cptestctx: &ControlPlaneTestContext) { // List sleds again. let sleds_found = sleds_list(&client, &sleds_url).await; - assert_eq!(sleds_found.len(), nsleds + 1); + assert_eq!(sleds_found.len(), nsleds + 2); let sledids_found = sleds_found.iter().map(|sv| sv.identity.id).collect::>(); @@ -97,9 +97,9 @@ async fn test_physical_disk_create_list_delete( let external_client = &cptestctx.external_client; let internal_client = &cptestctx.internal_client; - // Verify that there is one sled to begin with. + // Verify that there are two sleds to begin with. let sleds_url = "/v1/system/hardware/sleds"; - assert_eq!(sleds_list(&external_client, &sleds_url).await.len(), 1); + assert_eq!(sleds_list(&external_client, &sleds_url).await.len(), 2); // The test framework may set up some disks initially. let disks_url = @@ -140,9 +140,9 @@ async fn test_physical_disk_create_list_delete( async fn test_sled_instance_list(cptestctx: &ControlPlaneTestContext) { let external_client = &cptestctx.external_client; - // Verify that there is one sled to begin with. + // Verify that there are two sleds to begin with. 
let sleds_url = "/v1/system/hardware/sleds"; - assert_eq!(sleds_list(&external_client, &sleds_url).await.len(), 1); + assert_eq!(sleds_list(&external_client, &sleds_url).await.len(), 2); // Verify that there are no instances. let instances_url = diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 483b2d6aa8..0b90bef590 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -740,7 +740,7 @@ impl SledAgent { Ok(Inventory { sled_id: self.id, sled_agent_address, - sled_role: SledRole::Gimlet, + sled_role: SledRole::Scrimlet, baseboard: self.config.hardware.baseboard.clone(), usable_hardware_threads: self.config.hardware.hardware_threads, usable_physical_ram: ByteCount::try_from( From 1b7679255d2af0a8062ab1f3c644169761a83c68 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Sat, 9 Mar 2024 15:27:18 -0800 Subject: [PATCH 23/34] remove spurious change --- nexus/src/context.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/nexus/src/context.rs b/nexus/src/context.rs index d8be5f3ab0..cf2b9d6f17 100644 --- a/nexus/src/context.rs +++ b/nexus/src/context.rs @@ -195,7 +195,6 @@ impl ServerContext { // This means a new cockroachdb instance won't picked up until // Nexus restarts. let addrs = loop { - debug!(log, "Looking up cockroach addresses"); match resolver .lookup_all_socket_v6(ServiceName::Cockroach) .await From 1c22f48910e8ef84de6502eabd60a07167e93a27 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Sat, 9 Mar 2024 17:49:20 -0800 Subject: [PATCH 24/34] progress on the test --- Cargo.lock | 1 + nexus/reconfigurator/execution/Cargo.toml | 1 + nexus/reconfigurator/execution/src/dns.rs | 108 ++++++++++++++++++ .../planning/src/blueprint_builder.rs | 6 +- 4 files changed, 115 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index db48902f75..88ddfcecc9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4565,6 +4565,7 @@ dependencies = [ "nexus-db-queries", "nexus-inventory", "nexus-reconfigurator-planning", + "nexus-reconfigurator-preparation", "nexus-test-utils", "nexus-test-utils-macros", "nexus-types", diff --git a/nexus/reconfigurator/execution/Cargo.toml b/nexus/reconfigurator/execution/Cargo.toml index 72fed3044e..b3ae83313f 100644 --- a/nexus/reconfigurator/execution/Cargo.toml +++ b/nexus/reconfigurator/execution/Cargo.toml @@ -35,6 +35,7 @@ omicron-workspace-hack.workspace = true httptest.workspace = true ipnet.workspace = true nexus-reconfigurator-planning.workspace = true +nexus-reconfigurator-preparation.workspace = true nexus-inventory.workspace = true nexus-test-utils.workspace = true nexus-test-utils-macros.workspace = true diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index ab0b9b17ef..e67c535afe 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -441,7 +441,9 @@ mod test { use nexus_db_queries::context::OpContext; use nexus_inventory::CollectionBuilder; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; + use nexus_reconfigurator_planning::blueprint_builder::EnsureMultiple; use nexus_reconfigurator_planning::example::example; + use nexus_reconfigurator_preparation::policy_from_db; use nexus_test_utils::SLED_AGENT2_UUID; use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; @@ -462,7 +464,9 @@ mod test { use nexus_types::internal_api::params::Srv; use omicron_common::address::get_sled_address; use omicron_common::address::get_switch_zone_address; + 
use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; + use omicron_common::address::NEXUS_REDUNDANCY; use omicron_common::address::RACK_PREFIX; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::Error; @@ -1211,6 +1215,110 @@ mod test { dns_latest_external.generation ); + // Now, go through the motions of provisioning a new Nexus zone. + // We do this directly with BlueprintBuilder to avoid the planner + // deciding to make other unrelated changes. + let sled_rows = datastore.sled_list_all_batched(&opctx).await.unwrap(); + let zpool_rows = + datastore.zpool_list_all_external_batched(&opctx).await.unwrap(); + let ip_pool_range_rows = { + let (authz_service_ip_pool, _) = + datastore.ip_pools_service_lookup(&opctx).await.unwrap(); + datastore + .ip_pool_list_ranges_batched(&opctx, &authz_service_ip_pool) + .await + .unwrap() + }; + let mut policy = policy_from_db( + &sled_rows, + &zpool_rows, + &ip_pool_range_rows, + // This is not used because we're not actually going through the + // planner. + NEXUS_REDUNDANCY, + ) + .unwrap(); + // We'll need another (fake) external IP for this new Nexus. + policy + .service_ip_pool_ranges + .push(IpRange::from(IpAddr::V4(Ipv4Addr::LOCALHOST))); + let mut builder = BlueprintBuilder::new_based_on( + &log, + &blueprint, + Generation::from( + u32::try_from(dns_latest_internal.generation).unwrap(), + ), + Generation::from( + u32::try_from(dns_latest_external.generation).unwrap(), + ), + &policy, + "test suite", + ) + .unwrap(); + let sled_id = + blueprint.sleds().next().expect("expected at least one sled"); + let nalready = builder.sled_num_nexus_zones(sled_id); + let rv = builder + .sled_ensure_zone_multiple_nexus(sled_id, nalready + 1) + .unwrap(); + assert_eq!(rv, EnsureMultiple::Added(1)); + let blueprint2 = builder.build(); + + crate::realize_blueprint( + &opctx, + datastore, + &blueprint2, + "test-suite", + &overrides, + ) + .await + .expect("failed to execute second blueprint"); + + // Now fetch DNS again. Both should have changed this time. + let dns_latest_internal = datastore + .dns_config_read(&opctx, DnsGroup::Internal) + .await + .expect("fetching latest internal DNS"); + let dns_latest_external = datastore + .dns_config_read(&opctx, DnsGroup::External) + .await + .expect("fetching latest external DNS"); + + assert_eq!( + dns_latest_internal.generation, + dns_initial_internal.generation + 1, + ); + assert_eq!( + dns_latest_external.generation, + dns_initial_external.generation + 1, + ); + + // XXX-dap examine the specific changes to both DNS configs + + // If we execute it again, we should see no more changes. + crate::realize_blueprint( + &opctx, + datastore, + &blueprint2, + "test-suite", + &overrides, + ) + .await + .expect("failed to execute second blueprint again"); + + // Now fetch DNS again. Both should have changed this time. + let dns_internal2 = datastore + .dns_config_read(&opctx, DnsGroup::Internal) + .await + .expect("fetching latest internal DNS"); + let dns_external2 = datastore + .dns_config_read(&opctx, DnsGroup::External) + .await + .expect("fetching latest external DNS"); + + assert_eq!(dns_latest_internal.generation, dns_internal2.generation); + assert_eq!(dns_latest_external.generation, dns_external2.generation); + // XXX-dap continue writing the test. See above. 
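+
+        // A minimal sketch of the record-level check deferred above, reusing
+        // the DnsDiff helper from dns_service_client (already used by the
+        // non-test code in this module); illustrative only, not the final
+        // assertions. Re-executing an unchanged blueprint must not touch
+        // DNS, so the diff of the two reads should be empty.
+        let diff = dns_service_client::DnsDiff::new(
+            &dns_latest_internal,
+            &dns_internal2,
+        )
+        .unwrap();
+        assert!(diff.names_added().next().is_none());
+        assert!(diff.names_removed().next().is_none());
+        assert!(diff.names_changed().next().is_none());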
} } diff --git a/nexus/reconfigurator/planning/src/blueprint_builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder.rs index 7dd0619e1f..0433280b09 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder.rs @@ -259,7 +259,11 @@ impl<'a> BlueprintBuilder<'a> { } } if let Some(external_ip) = z.zone_type.external_ip()? { - if !used_external_ips.insert(external_ip) { + // Ignore localhost. This is used in the test suite, and it + // gets reused many times. + if !external_ip.is_loopback() + && !used_external_ips.insert(external_ip) + { bail!("duplicate external IP: {external_ip}"); } } From 2b5c89b691996a4f6fe67ff02dfcfd16fe713eb3 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Sat, 9 Mar 2024 20:37:36 -0800 Subject: [PATCH 25/34] flesh out more of the test --- nexus/reconfigurator/execution/src/dns.rs | 101 +++++++++++++++++++--- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index e67c535afe..56430f96ed 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -389,7 +389,7 @@ fn dns_compute_update( debug!( log, "adding name"; - "name" => name, + "dns_name" => name, "new_records" => ?new_records, ); update.add_name( @@ -402,7 +402,7 @@ fn dns_compute_update( debug!( log, "removing name"; - "name" => name, + "dns_name" => name, "old_records" => ?old_records, ); update.remove_name(name.to_string())?; @@ -412,7 +412,7 @@ fn dns_compute_update( debug!( log, "updating name"; - "name" => name, + "dns_name" => name, "old_records" => ?old_records, "new_records" => ?new_records, ); @@ -434,6 +434,7 @@ mod test { use crate::dns::silo_dns_name; use crate::ExecutionOverrides; use crate::Sled; + use dns_service_client::DnsDiff; use internal_dns::ServiceName; use internal_dns::DNS_ZONE; use nexus_db_model::DnsGroup; @@ -1160,6 +1161,7 @@ mod test { .blueprint_generate_from_collection(&opctx, collection.id) .await .expect("failed to generate initial blueprint"); + eprintln!("blueprint: {:?}", blueprint); // Now, execute the blueprint. // XXX-dap doc/cleanup @@ -1263,6 +1265,19 @@ mod test { .unwrap(); assert_eq!(rv, EnsureMultiple::Added(1)); let blueprint2 = builder.build(); + eprintln!("blueprint2: {:?}", blueprint2); + // Figure out the id of the new zone. + let zones_before = blueprint + .all_omicron_zones() + .filter_map(|(_, z)| z.zone_type.is_nexus().then_some(z.id)) + .collect::>(); + let zones_after = blueprint2 + .all_omicron_zones() + .filter_map(|(_, z)| z.zone_type.is_nexus().then_some(z.id)) + .collect::>(); + let new_zones: Vec<_> = zones_after.difference(&zones_before).collect(); + assert_eq!(new_zones.len(), 1); + let new_zone_id = *new_zones[0]; crate::realize_blueprint( &opctx, @@ -1279,21 +1294,63 @@ mod test { .dns_config_read(&opctx, DnsGroup::Internal) .await .expect("fetching latest internal DNS"); - let dns_latest_external = datastore - .dns_config_read(&opctx, DnsGroup::External) - .await - .expect("fetching latest external DNS"); assert_eq!( dns_latest_internal.generation, dns_initial_internal.generation + 1, ); + + let diff = + DnsDiff::new(&dns_initial_internal, &dns_latest_internal).unwrap(); + // There should be one new AAAA record for the zone itself. 
+ let new_records: Vec<_> = diff.names_added().collect(); + let (new_name, &[DnsRecord::Aaaa(_)]) = new_records[0] else { + panic!("did not find expected AAAA record for new Nexus zone"); + }; + let new_zone_host = internal_dns::config::Host::for_zone( + new_zone_id, + internal_dns::config::ZoneVariant::Other, + ); + assert!(new_zone_host.fqdn().starts_with(new_name)); + + // Nothing was removed. + assert!(diff.names_removed().next().is_none()); + + // The SRV record for Nexus itself ought to have changed, growing one + // more record -- for the new AAAA record above. + let changed: Vec<_> = diff.names_changed().collect(); + assert_eq!(changed.len(), 1); + let (name, old_records, new_records) = changed[0]; + assert_eq!(name, ServiceName::Nexus.dns_name()); + let new_srv = subset_plus_one(old_records, new_records); + let DnsRecord::Srv(new_srv) = new_srv else { + panic!("expected SRV record, found {:?}", new_srv); + }; + assert_eq!(new_srv.target, new_zone_host.fqdn()); + + // As for external DNS: all existing names ought to have been changed, + // gaining a new AAAA record for the new host. + let dns_latest_external = datastore + .dns_config_read(&opctx, DnsGroup::External) + .await + .expect("fetching latest external DNS"); assert_eq!( dns_latest_external.generation, dns_initial_external.generation + 1, ); - - // XXX-dap examine the specific changes to both DNS configs + let diff = + DnsDiff::new(&dns_initial_external, &dns_latest_external).unwrap(); + assert!(diff.names_added().next().is_none()); + assert!(diff.names_removed().next().is_none()); + let changed: Vec<_> = diff.names_changed().collect(); + for (name, old_records, new_records) in changed { + // These are Silo names and end with ".sys". + assert!(name.ends_with(".sys")); + // We can't really tell which one points to what, especially in the + // test suite where all Nexus zones use localhost for their external + // IP. All we can tell is that there's one new one. + assert_eq!(old_records.len() + 1, new_records.len()); + } // If we execute it again, we should see no more changes. crate::realize_blueprint( @@ -1306,7 +1363,6 @@ mod test { .await .expect("failed to execute second blueprint again"); - // Now fetch DNS again. Both should have changed this time. let dns_internal2 = datastore .dns_config_read(&opctx, DnsGroup::Internal) .await @@ -1321,6 +1377,31 @@ mod test { // XXX-dap continue writing the test. See above. 
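+
+        // One possible shape for the follow-up deferred above -- a sketch
+        // only, assuming the create_silo test helper and a hypothetical Silo
+        // name: a Silo created after the new Nexus zone exists should
+        // immediately get the expected external DNS name.
+        let silo = nexus_test_utils::resource_helpers::create_silo(
+            &cptestctx.external_client,
+            "example-silo",
+            false,
+            nexus_types::external_api::shared::SiloIdentityMode::SamlJit,
+        )
+        .await;
+        assert_eq!(silo_dns_name(&silo.identity.name), "example-silo.sys");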
} + + fn subset_plus_one<'a, T: std::fmt::Debug + Ord + Eq>( + list1: &'a [T], + list2: &'a [T], + ) -> &'a T { + let set: BTreeSet<_> = list1.into_iter().collect(); + let mut extra = Vec::with_capacity(1); + for item in list2 { + if !set.contains(&item) { + extra.push(item); + } + } + + if extra.len() != 1 { + panic!( + "expected list2 to have one extra element:\n\ + list1: {:?}\n\ + list2: {:?}\n + extra: {:?}\n", + list1, list2, extra + ); + } + + extra.into_iter().next().unwrap() + } } // XXX-dap duplicated -- figure out where to put this From 27810132d99987a2b9dd615186b3794208478adc Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Sat, 9 Mar 2024 20:49:46 -0800 Subject: [PATCH 26/34] pull out overridables --- nexus/reconfigurator/execution/src/dns.rs | 10 +-- nexus/reconfigurator/execution/src/lib.rs | 59 +------------ .../execution/src/overridables.rs | 82 +++++++++++++++++++ 3 files changed, 90 insertions(+), 61 deletions(-) create mode 100644 nexus/reconfigurator/execution/src/overridables.rs diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 56430f96ed..1aa675a198 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -4,7 +4,7 @@ //! Propagates internal DNS changes in a given blueprint -use crate::ExecutionOverrides; +use crate::overridables::Overridables; use crate::Sled; use dns_service_client::DnsDiff; use internal_dns::DnsConfigBuilder; @@ -38,7 +38,7 @@ pub(crate) async fn deploy_dns( creator: String, blueprint: &Blueprint, sleds_by_id: &BTreeMap, - overrides: &ExecutionOverrides, + overrides: &Overridables, ) -> Result<(), Error> { // First, fetch the current DNS configs. let internal_dns_config_current = datastore @@ -221,7 +221,7 @@ pub(crate) async fn deploy_dns_one( pub fn blueprint_internal_dns_config( blueprint: &Blueprint, sleds_by_id: &BTreeMap, - overrides: &ExecutionOverrides, + overrides: &Overridables, ) -> DnsConfigParams { // The DNS names configured here should match what RSS configures for the // same zones. It's tricky to have RSS share the same code because it uses @@ -432,7 +432,7 @@ mod test { use super::dns_compute_update; use crate::dns::blueprint_external_dns_config; use crate::dns::silo_dns_name; - use crate::ExecutionOverrides; + use crate::overridables::Overridables; use crate::Sled; use dns_service_client::DnsDiff; use internal_dns::ServiceName; @@ -1165,7 +1165,7 @@ mod test { // Now, execute the blueprint. 
// XXX-dap doc/cleanup - let mut overrides = ExecutionOverrides::default(); + let mut overrides = Overridables::default(); let scrimlets = [ (SLED_AGENT_UUID, SwitchLocation::Switch0), (SLED_AGENT2_UUID, SwitchLocation::Switch1), diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index db28052e65..dac7abd5dd 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -11,73 +11,20 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; use nexus_types::identity::Asset; -use omicron_common::address::get_switch_zone_address; use omicron_common::address::Ipv6Subnet; -use omicron_common::address::DENDRITE_PORT; -use omicron_common::address::MGD_PORT; -use omicron_common::address::MGS_PORT; use omicron_common::address::SLED_PREFIX; use slog::info; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; -use std::net::Ipv6Addr; use std::net::SocketAddrV6; +use overridables::Overridables; use uuid::Uuid; mod datasets; mod dns; mod omicron_zones; mod resource_allocation; - -// XXX-dap -#[derive(Debug, Default)] -pub struct ExecutionOverrides { - pub dendrite_ports: BTreeMap, - pub mgs_ports: BTreeMap, - pub mgd_ports: BTreeMap, - pub switch_zone_ips: BTreeMap, -} - -impl ExecutionOverrides { - pub fn override_dendrite_port(&mut self, sled_id: Uuid, port: u16) { - self.dendrite_ports.insert(sled_id, port); - } - - fn dendrite_port(&self, sled_id: Uuid) -> u16 { - self.dendrite_ports.get(&sled_id).copied().unwrap_or(DENDRITE_PORT) - } - - pub fn override_mgs_port(&mut self, sled_id: Uuid, port: u16) { - self.mgs_ports.insert(sled_id, port); - } - - fn mgs_port(&self, sled_id: Uuid) -> u16 { - self.mgs_ports.get(&sled_id).copied().unwrap_or(MGS_PORT) - } - - pub fn override_mgd_port(&mut self, sled_id: Uuid, port: u16) { - self.mgd_ports.insert(sled_id, port); - } - - fn mgd_port(&self, sled_id: Uuid) -> u16 { - self.mgd_ports.get(&sled_id).copied().unwrap_or(MGD_PORT) - } - - pub fn override_switch_zone_ip(&mut self, sled_id: Uuid, addr: Ipv6Addr) { - self.switch_zone_ips.insert(sled_id, addr); - } - - fn switch_zone_ip( - &self, - sled_id: Uuid, - sled_subnet: Ipv6Subnet, - ) -> Ipv6Addr { - self.switch_zone_ips - .get(&sled_id) - .copied() - .unwrap_or_else(|| get_switch_zone_address(sled_subnet)) - } -} +mod overridables; struct Sled { id: Uuid, @@ -111,7 +58,7 @@ pub async fn realize_blueprint( datastore: &DataStore, blueprint: &Blueprint, nexus_label: S, - overrides: &ExecutionOverrides, + overrides: &Overridables, ) -> Result<(), Vec> where String: From, diff --git a/nexus/reconfigurator/execution/src/overridables.rs b/nexus/reconfigurator/execution/src/overridables.rs new file mode 100644 index 0000000000..80a14d1fed --- /dev/null +++ b/nexus/reconfigurator/execution/src/overridables.rs @@ -0,0 +1,82 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
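+
+// Usage sketch (illustrative; `sled_id` is a placeholder): a test overrides
+// only the values that differ in its environment, and everything else falls
+// through to the production defaults:
+//
+//     let mut overrides = Overridables::default();
+//     overrides.override_dendrite_port(sled_id, 12345);
+//     assert_eq!(overrides.dendrite_port(sled_id), 12345);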
+ +use omicron_common::address::get_switch_zone_address; +use omicron_common::address::Ipv6Subnet; +use omicron_common::address::DENDRITE_PORT; +use omicron_common::address::MGD_PORT; +use omicron_common::address::MGS_PORT; +use omicron_common::address::SLED_PREFIX; +use std::collections::BTreeMap; +use std::net::Ipv6Addr; +use uuid::Uuid; + +/// Override values used during blueprint execution +/// +/// Blueprint execution assumes certain values about production systems that +/// differ in the simulated testing environment and cannot be easily derived +/// from anything else in the environment. To accommodate this, this structure +/// provides access to these values. Everywhere except the test suite, this +/// structure is empty and returns the default (production) values. The test +/// suite overrides these values. +#[derive(Debug, Default)] +pub struct Overridables { + /// map: sled id -> TCP port on which that sled's Dendrite is listening + pub dendrite_ports: BTreeMap, + /// map: sled id -> TCP port on which that sled's MGS is listening + pub mgs_ports: BTreeMap, + /// map: sled id -> TCP port on which that sled's MGD is listening + pub mgd_ports: BTreeMap, + /// map: sled id -> IP address of the sled's switch zone + pub switch_zone_ips: BTreeMap, +} + +impl Overridables { + /// Specify the TCP port on which this sled's Dendrite is listening + pub fn override_dendrite_port(&mut self, sled_id: Uuid, port: u16) { + self.dendrite_ports.insert(sled_id, port); + } + + /// Returns the TCP port on which this sled's Dendrite is listening + pub fn dendrite_port(&self, sled_id: Uuid) -> u16 { + self.dendrite_ports.get(&sled_id).copied().unwrap_or(DENDRITE_PORT) + } + + /// Specify the TCP port on which this sled's MGS is listening + pub fn override_mgs_port(&mut self, sled_id: Uuid, port: u16) { + self.mgs_ports.insert(sled_id, port); + } + + /// Returns the TCP port on which this sled's MGS is listening + pub fn mgs_port(&self, sled_id: Uuid) -> u16 { + self.mgs_ports.get(&sled_id).copied().unwrap_or(MGS_PORT) + } + + /// Specify the TCP port on which this sled's MGD is listening + pub fn override_mgd_port(&mut self, sled_id: Uuid, port: u16) { + self.mgd_ports.insert(sled_id, port); + } + + /// Returns the TCP port on which this sled's MGD is listening + pub fn mgd_port(&self, sled_id: Uuid) -> u16 { + self.mgd_ports.get(&sled_id).copied().unwrap_or(MGD_PORT) + } + + /// Specify the IP address of this switch zone + pub fn override_switch_zone_ip(&mut self, sled_id: Uuid, addr: Ipv6Addr) { + self.switch_zone_ips.insert(sled_id, addr); + } + + /// Returns the IP address of this sled's switch zone + pub fn switch_zone_ip( + &self, + sled_id: Uuid, + sled_subnet: Ipv6Subnet, + ) -> Ipv6Addr { + self.switch_zone_ips + .get(&sled_id) + .copied() + .unwrap_or_else(|| get_switch_zone_address(sled_subnet)) + } +} From d322a96109f2f04245611a46497dd69bb7383074 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Sat, 9 Mar 2024 20:56:48 -0800 Subject: [PATCH 27/34] rustfmt --- nexus/reconfigurator/execution/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index dac7abd5dd..a1f7d133e6 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -13,18 +13,18 @@ use nexus_types::deployment::Blueprint; use nexus_types::identity::Asset; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; +use overridables::Overridables; 
use slog::info; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::net::SocketAddrV6; -use overridables::Overridables; use uuid::Uuid; mod datasets; mod dns; mod omicron_zones; -mod resource_allocation; mod overridables; +mod resource_allocation; struct Sled { id: Uuid, From b2cb116bef7c2a952d60f59fb0eea96cc0d33ff0 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 11 Mar 2024 15:57:52 -0700 Subject: [PATCH 28/34] get more tests working --- nexus/db-queries/src/db/datastore/rack.rs | 97 ++++---- nexus/reconfigurator/execution/src/dns.rs | 286 ++++++++++++++++++---- nexus/src/app/silo.rs | 7 +- 3 files changed, 282 insertions(+), 108 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index d6537ec622..fa4d6a4210 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -27,7 +27,6 @@ use crate::db::model::Rack; use crate::db::model::Zpool; use crate::db::pagination::paginated; use crate::db::pool::DbConnection; -use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -44,6 +43,8 @@ use nexus_db_model::PasswordHashString; use nexus_db_model::SiloUser; use nexus_db_model::SiloUserPasswordHash; use nexus_db_model::SledUnderlaySubnetAllocation; +use nexus_types::deployment::Blueprint; +use nexus_types::deployment::OmicronZoneType; use nexus_types::external_api::params as external_params; use nexus_types::external_api::shared; use nexus_types::external_api::shared::IdentityType; @@ -54,6 +55,7 @@ use nexus_types::internal_api::params as internal_params; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::IdentityMetadataCreateParams; +use omicron_common::api::external::InternalContext; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; @@ -793,64 +795,49 @@ impl DataStore { pub async fn nexus_external_addresses( &self, opctx: &OpContext, + blueprint: Option<&Blueprint>, ) -> Result<(Vec, Vec), Error> { opctx.authorize(authz::Action::Read, &authz::DNS_CONFIG).await?; - // XXX-dap use the current target blueprint here? - // Or maybe do that in the caller, since the other caller is doing - // blueprint execution for what might not be the current target. - - use crate::db::schema::external_ip::dsl as extip_dsl; - use crate::db::schema::service::dsl as service_dsl; - - let err = OptionalError::new(); - let conn = self.pool_connection_authorized(opctx).await?; - self.transaction_retry_wrapper("nexus_external_addresses") - .transaction(&conn, |conn| { - let err = err.clone(); - async move { - let ips = extip_dsl::external_ip - .inner_join( - service_dsl::service.on(service_dsl::id - .eq(extip_dsl::parent_id.assume_not_null())), - ) - .filter(extip_dsl::parent_id.is_not_null()) - .filter(extip_dsl::time_deleted.is_null()) - .filter(extip_dsl::is_service) - .filter( - service_dsl::kind.eq(db::model::ServiceKind::Nexus), - ) - .select(ExternalIp::as_select()) - .get_results_async(&conn) - .await? 
- .into_iter() - .map(|external_ip| external_ip.ip.ip()) - .collect(); - - let dns_zones = self - .dns_zones_list_all_on_connection( - opctx, - &conn, - DnsGroup::External, - ) - .await - .map_err(|e| match e.retryable() { - NotRetryable(not_retryable_err) => { - err.bail(not_retryable_err) - } - Retryable(retryable_err) => retryable_err, - })?; - - Ok((ips, dns_zones)) - } - }) + let dns_zones = self + .dns_zones_list_all(opctx, DnsGroup::External) .await - .map_err(|e| { - if let Some(err) = err.take() { - return err.into(); - } - public_error_from_diesel(e, ErrorHandler::Server) - }) + .internal_context("listing DNS zones to list external addresses")?; + + let nexus_external_ips = if let Some(blueprint) = blueprint { + blueprint + .all_omicron_zones() + .filter_map(|(_, z)| match z.zone_type { + OmicronZoneType::Nexus { external_ip, .. } => { + Some(external_ip) + } + _ => None, + }) + .collect() + } else { + use crate::db::schema::external_ip::dsl as extip_dsl; + use crate::db::schema::service::dsl as service_dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + + extip_dsl::external_ip + .inner_join(service_dsl::service.on( + service_dsl::id.eq(extip_dsl::parent_id.assume_not_null()), + )) + .filter(extip_dsl::parent_id.is_not_null()) + .filter(extip_dsl::time_deleted.is_null()) + .filter(extip_dsl::is_service) + .filter(service_dsl::kind.eq(db::model::ServiceKind::Nexus)) + .select(ExternalIp::as_select()) + .get_results_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? + .into_iter() + .map(|external_ip| external_ip.ip.ip()) + .collect() + }; + + Ok((nexus_external_ips, dns_zones)) } } diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 1aa675a198..109503a3d7 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -68,14 +68,15 @@ pub(crate) async fn deploy_dns( .filter(|silo| silo.id() != *SILO_ID) .collect::>(); - let (_, nexus_external_dns_zones) = - datastore.nexus_external_addresses(opctx).await?; + let (nexus_external_ips, nexus_external_dns_zones) = + datastore.nexus_external_addresses(opctx, Some(blueprint)).await?; let nexus_external_dns_zone_names = nexus_external_dns_zones .into_iter() .map(|z| z.zone_name) .collect::>(); let external_dns_config_blueprint = blueprint_external_dns_config( blueprint, + &nexus_external_ips, &silos.iter().collect::>(), &nexus_external_dns_zone_names, ); @@ -326,25 +327,15 @@ pub fn blueprint_internal_dns_config( pub fn blueprint_external_dns_config( blueprint: &Blueprint, + nexus_external_ips: &[IpAddr], silos: &[&Silo], external_dns_zone_names: &[String], ) -> DnsConfigParams { - let nexus_external_ips = - blueprint.all_omicron_zones().filter_map(|(_, z)| { - if blueprint.zones_in_service.contains(&z.id) { - if let OmicronZoneType::Nexus { external_ip, .. 
} = &z.zone_type - { - return Some(*external_ip); - } - } - - None - }); let dns_records: Vec = nexus_external_ips .into_iter() .map(|addr| match addr { - IpAddr::V4(addr) => DnsRecord::A(addr), - IpAddr::V6(addr) => DnsRecord::Aaaa(addr), + IpAddr::V4(addr) => DnsRecord::A(*addr), + IpAddr::V6(addr) => DnsRecord::Aaaa(*addr), }) .collect(); @@ -440,15 +431,18 @@ mod test { use nexus_db_model::DnsGroup; use nexus_db_model::Silo; use nexus_db_queries::context::OpContext; + use nexus_db_queries::db::DataStore; use nexus_inventory::CollectionBuilder; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_reconfigurator_planning::blueprint_builder::EnsureMultiple; use nexus_reconfigurator_planning::example::example; use nexus_reconfigurator_preparation::policy_from_db; + use nexus_test_utils::resource_helpers::create_silo; use nexus_test_utils::SLED_AGENT2_UUID; use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::Blueprint; + use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::OmicronZoneConfig; use nexus_types::deployment::OmicronZoneType; use nexus_types::deployment::Policy; @@ -835,10 +829,21 @@ mod test { }) .unwrap(); + let nexus_external_ips: Vec<_> = blueprint + .all_omicron_zones() + .filter_map(|(_, z)| match &z.zone_type { + OmicronZoneType::Nexus { external_ip, .. } => { + Some(*external_ip) + } + _ => None, + }) + .collect(); + // It shouldn't ever be possible to have no Silos at all, but at least // make sure we don't panic. let external_dns_config = blueprint_external_dns_config( &blueprint, + &nexus_external_ips, &[], &[String::from("oxide.test")], ); @@ -851,19 +856,36 @@ mod test { assert!(external_dns_config.zones[0].records.is_empty()); // Same with external DNS zones. - let external_dns_config = - blueprint_external_dns_config(&blueprint, &[&my_silo], &[]); + let external_dns_config = blueprint_external_dns_config( + &blueprint, + &nexus_external_ips, + &[&my_silo], + &[], + ); assert_eq!( external_dns_config.generation, u64::from(initial_external_dns_generation.next()) ); assert!(external_dns_config.zones.is_empty()); + // Same with external IPs. + let external_dns_config = blueprint_external_dns_config( + &blueprint, + &[], + &[&my_silo], + &[String::from("oxide.test")], + ); + assert_eq!( + external_dns_config.generation, + u64::from(initial_external_dns_generation.next()) + ); + // Now check a more typical case. (Although we wouldn't normally have // more than one external DNS zone, it's a more general case and pretty // easy to test.) let external_dns_config = blueprint_external_dns_config( &blueprint, + &nexus_external_ips, &[&my_silo], &[String::from("oxide1.test"), String::from("oxide2.test")], ); @@ -1094,15 +1116,19 @@ mod test { // - If we create a blueprint matching the current system, and then apply // it, there are no changes to either internal or external DNS // + // - If we create a Silo, DNS will be updated. If we then re-execute the + // previous blueprint, again, there will be no new changes to DNS. + // // - If we then generate a blueprint with a Nexus zone and execute the DNS // part of that, then: // - // - internal DNS SRV record for _nexus._tcp is updated + // - internal DNS SRV record for _nexus._tcp is added // - internal DNS AAAA record for the new zone is added // - external DNS gets a A record for the new zone's external IP // // - If we subsequently create a new Silo, the new Silo's DNS record // reflects the Nexus zone that was added. 
+ // // XXX-dap move to crate-level test since it uses realize_blueprint()? #[nexus_test] async fn test_silos_external_dns_end_to_end( @@ -1198,24 +1224,30 @@ mod test { .await .expect("failed to execute initial blueprint"); - // Now fetch DNS again. It ought not to have changed. - let dns_latest_internal = datastore - .dns_config_read(&opctx, DnsGroup::Internal) - .await - .expect("fetching latest internal DNS"); - let dns_latest_external = datastore - .dns_config_read(&opctx, DnsGroup::External) - .await - .expect("fetching latest external DNS"); - - assert_eq!( - dns_initial_internal.generation, - dns_latest_internal.generation - ); - assert_eq!( - dns_initial_external.generation, - dns_latest_external.generation - ); + // DNS ought not to have changed. + verify_dns_unchanged( + &opctx, + datastore, + &dns_initial_internal, + &dns_initial_external, + ) + .await; + + // Create a Silo. Make sure that external DNS is updated (and that + // internal DNS is not). Then make sure that if we execute the same + // blueprint again, DNS does not change again (i.e., that it does not + // revert somehow). + let dns_latest_external = create_silo_and_verify_dns( + cptestctx, + &opctx, + datastore, + &blueprint, + &overrides, + "squidport", + &dns_initial_internal, + &dns_initial_external, + ) + .await; // Now, go through the motions of provisioning a new Nexus zone. // We do this directly with BlueprintBuilder to avoid the planner @@ -1248,7 +1280,7 @@ mod test { &log, &blueprint, Generation::from( - u32::try_from(dns_latest_internal.generation).unwrap(), + u32::try_from(dns_initial_internal.generation).unwrap(), ), Generation::from( u32::try_from(dns_latest_external.generation).unwrap(), @@ -1279,6 +1311,38 @@ mod test { assert_eq!(new_zones.len(), 1); let new_zone_id = *new_zones[0]; + // Set this blueprint as the current target. We set it to disabled + // because we're controlling the execution directly here. But we need + // to do this so that silo creation sees the change. + // + // Doing this requires writing the whole history of this blueprint. + datastore + .blueprint_target_set_current( + &opctx, + BlueprintTarget { + target_id: blueprint.id, + enabled: false, + time_made_target: chrono::Utc::now(), + }, + ) + .await + .expect("failed to set blueprint as target"); + datastore + .blueprint_insert(&opctx, &blueprint2) + .await + .expect("failed to save blueprint to database"); + datastore + .blueprint_target_set_current( + &opctx, + BlueprintTarget { + target_id: blueprint2.id, + enabled: false, + time_made_target: chrono::Utc::now(), + }, + ) + .await + .expect("failed to set blueprint as target"); + crate::realize_blueprint( &opctx, datastore, @@ -1329,17 +1393,18 @@ mod test { assert_eq!(new_srv.target, new_zone_host.fqdn()); // As for external DNS: all existing names ought to have been changed, - // gaining a new AAAA record for the new host. + // gaining a new A record for the new host. 
+ let dns_previous_external = dns_latest_external; let dns_latest_external = datastore .dns_config_read(&opctx, DnsGroup::External) .await .expect("fetching latest external DNS"); assert_eq!( dns_latest_external.generation, - dns_initial_external.generation + 1, + dns_previous_external.generation + 1, ); let diff = - DnsDiff::new(&dns_initial_external, &dns_latest_external).unwrap(); + DnsDiff::new(&dns_previous_external, &dns_latest_external).unwrap(); assert!(diff.names_added().next().is_none()); assert!(diff.names_removed().next().is_none()); let changed: Vec<_> = diff.names_changed().collect(); @@ -1362,20 +1427,47 @@ mod test { ) .await .expect("failed to execute second blueprint again"); + verify_dns_unchanged( + &opctx, + datastore, + &dns_latest_internal, + &dns_latest_external, + ) + .await; - let dns_internal2 = datastore - .dns_config_read(&opctx, DnsGroup::Internal) - .await - .expect("fetching latest internal DNS"); - let dns_external2 = datastore - .dns_config_read(&opctx, DnsGroup::External) - .await - .expect("fetching latest external DNS"); - - assert_eq!(dns_latest_internal.generation, dns_internal2.generation); - assert_eq!(dns_latest_external.generation, dns_external2.generation); + // Now create another Silo and verify the changes to DNS. + // This ensures that the "create Silo" path picks up Nexus instances + // that exist only in Reconfigurator, not the services table. + let dns_latest_external = create_silo_and_verify_dns( + &cptestctx, + &opctx, + datastore, + &blueprint2, + &overrides, + "tickety-boo", + &dns_latest_internal, + &dns_latest_external, + ) + .await; - // XXX-dap continue writing the test. See above. + // One more time, make sure that executing the blueprint does not do + // anything. + crate::realize_blueprint( + &opctx, + datastore, + &blueprint2, + "test-suite", + &overrides, + ) + .await + .expect("failed to execute second blueprint again"); + verify_dns_unchanged( + &opctx, + datastore, + &dns_latest_internal, + &dns_latest_external, + ) + .await; } fn subset_plus_one<'a, T: std::fmt::Debug + Ord + Eq>( @@ -1402,6 +1494,96 @@ mod test { extra.into_iter().next().unwrap() } + + async fn create_silo_and_verify_dns( + cptestctx: &ControlPlaneTestContext, + opctx: &OpContext, + datastore: &DataStore, + blueprint: &Blueprint, + overrides: &Overridables, + silo_name: &str, + old_internal: &DnsConfigParams, + old_external: &DnsConfigParams, + ) -> DnsConfigParams { + // Create a Silo. Make sure that external DNS is updated (and that + // internal DNS is not). This is tested elsewhere already but really we + // want to make sure that if we then execute the blueprint again, DNS + // does not change _again_ (i.e., does not somehow revert). + let silo = create_silo( + &cptestctx.external_client, + silo_name, + false, + shared::SiloIdentityMode::SamlJit, + ) + .await; + + let dns_latest_internal = datastore + .dns_config_read(&opctx, DnsGroup::Internal) + .await + .expect("fetching latest internal DNS"); + assert_eq!(old_internal.generation, dns_latest_internal.generation); + let dns_latest_external = datastore + .dns_config_read(&opctx, DnsGroup::External) + .await + .expect("fetching latest external DNS"); + assert_eq!(old_external.generation + 1, dns_latest_external.generation); + + // Specifically, there should be one new name (for the new Silo). 
+ let diff = DnsDiff::new(&old_external, &dns_latest_external).unwrap(); + assert!(diff.names_removed().next().is_none()); + assert!(diff.names_changed().next().is_none()); + let added = diff.names_added().collect::>(); + assert_eq!(added.len(), 1); + let (new_name, new_records) = added[0]; + assert_eq!(new_name, silo_dns_name(&silo.identity.name)); + // And it should have the same IP addresses as all of the other Silos. + assert_eq!( + new_records, + old_external.zones[0].records.values().next().unwrap() + ); + + // If we execute the blueprint, DNS should not be changed. + crate::realize_blueprint( + &opctx, + datastore, + &blueprint, + "test-suite", + &overrides, + ) + .await + .expect("failed to execute blueprint"); + let dns_latest_internal = datastore + .dns_config_read(&opctx, DnsGroup::Internal) + .await + .expect("fetching latest internal DNS"); + let dns_latest_external = datastore + .dns_config_read(&opctx, DnsGroup::External) + .await + .expect("fetching latest external DNS"); + assert_eq!(old_internal.generation, dns_latest_internal.generation); + assert_eq!(old_external.generation + 1, dns_latest_external.generation); + + dns_latest_external + } + + async fn verify_dns_unchanged( + opctx: &OpContext, + datastore: &DataStore, + old_internal: &DnsConfigParams, + old_external: &DnsConfigParams, + ) { + let dns_latest_internal = datastore + .dns_config_read(&opctx, DnsGroup::Internal) + .await + .expect("fetching latest internal DNS"); + let dns_latest_external = datastore + .dns_config_read(&opctx, DnsGroup::External) + .await + .expect("fetching latest external DNS"); + + assert_eq!(old_internal.generation, dns_latest_internal.generation); + assert_eq!(old_external.generation, dns_latest_external.generation); + } } // XXX-dap duplicated -- figure out where to put this diff --git a/nexus/src/app/silo.rs b/nexus/src/app/silo.rs index 8461be015a..07c3cc5b6b 100644 --- a/nexus/src/app/silo.rs +++ b/nexus/src/app/silo.rs @@ -95,8 +95,13 @@ impl super::Nexus { // Set up an external DNS name for this Silo's API and console // endpoints (which are the same endpoint). 
+ let target_blueprint = datastore + .blueprint_target_get_current_full(opctx) + .await + .internal_context("loading target blueprint")?; + let target = target_blueprint.as_ref().map(|(_, blueprint)| blueprint); let (nexus_external_ips, nexus_external_dns_zones) = - datastore.nexus_external_addresses(nexus_opctx).await?; + datastore.nexus_external_addresses(nexus_opctx, target).await?; let dns_records: Vec = nexus_external_ips .into_iter() .map(|addr| match addr { From 2e8d018479669899428842fd7f78432d2f74fc5f Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 11 Mar 2024 16:49:25 -0700 Subject: [PATCH 29/34] cleanup --- nexus/reconfigurator/execution/src/dns.rs | 30 +--------------- .../execution/src/overridables.rs | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 109503a3d7..47afc9ff57 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -438,8 +438,6 @@ mod test { use nexus_reconfigurator_planning::example::example; use nexus_reconfigurator_preparation::policy_from_db; use nexus_test_utils::resource_helpers::create_silo; - use nexus_test_utils::SLED_AGENT2_UUID; - use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintTarget; @@ -467,7 +465,6 @@ mod test { use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::external::IdentityMetadataCreateParams; - use omicron_common::api::external::SwitchLocation; use omicron_test_utils::dev::poll::wait_for_condition; use omicron_test_utils::dev::poll::CondCheckError; use omicron_test_utils::dev::test_setup_log; @@ -1128,8 +1125,6 @@ mod test { // // - If we subsequently create a new Silo, the new Silo's DNS record // reflects the Nexus zone that was added. - // - // XXX-dap move to crate-level test since it uses realize_blueprint()? #[nexus_test] async fn test_silos_external_dns_end_to_end( cptestctx: &ControlPlaneTestContext, @@ -1190,30 +1185,7 @@ mod test { eprintln!("blueprint: {:?}", blueprint); // Now, execute the blueprint. 
- // XXX-dap doc/cleanup - let mut overrides = Overridables::default(); - let scrimlets = [ - (SLED_AGENT_UUID, SwitchLocation::Switch0), - (SLED_AGENT2_UUID, SwitchLocation::Switch1), - ]; - for (id_str, switch_location) in scrimlets { - let sled_id = id_str.parse().unwrap(); - let ip = Ipv6Addr::LOCALHOST; - let mgs_port = cptestctx - .gateway - .get(&switch_location) - .unwrap() - .client - .bind_address - .port(); - let dendrite_port = - cptestctx.dendrite.get(&switch_location).unwrap().port; - let mgd_port = cptestctx.mgd.get(&switch_location).unwrap().port; - overrides.override_switch_zone_ip(sled_id, ip); - overrides.override_dendrite_port(sled_id, dendrite_port); - overrides.override_mgs_port(sled_id, mgs_port); - overrides.override_mgd_port(sled_id, mgd_port); - } + let overrides = Overridables::for_test(cptestctx); crate::realize_blueprint( &opctx, datastore, diff --git a/nexus/reconfigurator/execution/src/overridables.rs b/nexus/reconfigurator/execution/src/overridables.rs index 80a14d1fed..bc3109adeb 100644 --- a/nexus/reconfigurator/execution/src/overridables.rs +++ b/nexus/reconfigurator/execution/src/overridables.rs @@ -79,4 +79,39 @@ impl Overridables { .copied() .unwrap_or_else(|| get_switch_zone_address(sled_subnet)) } + + /// Generates a set of overrides describing the simulated test environment. + #[cfg(test)] + pub fn for_test( + cptestctx: &nexus_test_utils::ControlPlaneTestContext< + omicron_nexus::Server, + >, + ) -> Overridables { + use omicron_common::api::external::SwitchLocation; + + let mut overrides = Overridables::default(); + let scrimlets = [ + (nexus_test_utils::SLED_AGENT_UUID, SwitchLocation::Switch0), + (nexus_test_utils::SLED_AGENT2_UUID, SwitchLocation::Switch1), + ]; + for (id_str, switch_location) in scrimlets { + let sled_id = id_str.parse().unwrap(); + let ip = Ipv6Addr::LOCALHOST; + let mgs_port = cptestctx + .gateway + .get(&switch_location) + .unwrap() + .client + .bind_address + .port(); + let dendrite_port = + cptestctx.dendrite.get(&switch_location).unwrap().port; + let mgd_port = cptestctx.mgd.get(&switch_location).unwrap().port; + overrides.override_switch_zone_ip(sled_id, ip); + overrides.override_dendrite_port(sled_id, dendrite_port); + overrides.override_mgs_port(sled_id, mgs_port); + overrides.override_mgd_port(sled_id, mgd_port); + } + overrides + } } From 5648f86c7079d5b489f92473c3ab404f3c0bfc01 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 11 Mar 2024 16:56:03 -0700 Subject: [PATCH 30/34] do not panic --- nexus/reconfigurator/execution/src/dns.rs | 60 +++++++++++++++-------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 47afc9ff57..40615590ba 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -6,6 +6,7 @@ use crate::overridables::Overridables; use crate::Sled; +use anyhow::Context; use dns_service_client::DnsDiff; use internal_dns::DnsConfigBuilder; use internal_dns::ServiceName; @@ -58,7 +59,13 @@ pub(crate) async fn deploy_dns( // Next, construct the DNS config represented by the blueprint. 
let internal_dns_config_blueprint = - blueprint_internal_dns_config(blueprint, sleds_by_id, overrides); + blueprint_internal_dns_config(blueprint, sleds_by_id, overrides) + .map_err(|e| { + Error::internal_error(&format!( + "error constructing internal DNS config: {:#}", + e + )) + })?; let silos = datastore .silo_list_all_batched(opctx, Discoverability::All) .await @@ -223,7 +230,7 @@ pub fn blueprint_internal_dns_config( blueprint: &Blueprint, sleds_by_id: &BTreeMap, overrides: &Overridables, -) -> DnsConfigParams { +) -> Result { // The DNS names configured here should match what RSS configures for the // same zones. It's tricky to have RSS share the same code because it uses // Sled Agent's _internal_ `OmicronZoneConfig` (and friends), whereas we're @@ -232,10 +239,13 @@ pub fn blueprint_internal_dns_config( // the details. let mut dns_builder = DnsConfigBuilder::new(); - // XXX-dap don't panic - // See oxidecomputer/omicron#4988. - fn parse_port(address: &str) -> u16 { - address.parse::().unwrap().port() + // It's annoying that we have to parse this because it really should be + // valid already. See oxidecomputer/omicron#4988. + fn parse_port(address: &str) -> Result { + address + .parse::() + .with_context(|| format!("parsing socket address {:?}", address)) + .map(|addr| addr.port()) } for (_, omicron_zone) in blueprint.all_omicron_zones() { @@ -243,49 +253,57 @@ pub fn blueprint_internal_dns_config( continue; } + let context = || { + format!( + "parsing {} zone with id {}", + omicron_zone.zone_type.label(), + omicron_zone.id + ) + }; let (service_name, port) = match &omicron_zone.zone_type { OmicronZoneType::BoundaryNtp { address, .. } => { - let port = parse_port(&address); + let port = parse_port(&address).with_context(context)?; (ServiceName::BoundaryNtp, port) } OmicronZoneType::InternalNtp { address, .. } => { - let port = parse_port(&address); + let port = parse_port(&address).with_context(context)?; (ServiceName::InternalNtp, port) } OmicronZoneType::Clickhouse { address, .. } => { - let port = parse_port(&address); + let port = parse_port(&address).with_context(context)?; (ServiceName::Clickhouse, port) } OmicronZoneType::ClickhouseKeeper { address, .. } => { - let port = parse_port(&address); + let port = parse_port(&address).with_context(context)?; (ServiceName::ClickhouseKeeper, port) } OmicronZoneType::CockroachDb { address, .. } => { - let port = parse_port(&address); + let port = parse_port(&address).with_context(context)?; (ServiceName::Cockroach, port) } OmicronZoneType::Nexus { internal_address, .. } => { - let port = parse_port(internal_address); + let port = + parse_port(internal_address).with_context(context)?; (ServiceName::Nexus, port) } OmicronZoneType::Crucible { address, .. } => { - let port = parse_port(address); + let port = parse_port(address).with_context(context)?; (ServiceName::Crucible(omicron_zone.id), port) } OmicronZoneType::CruciblePantry { address } => { - let port = parse_port(address); + let port = parse_port(address).with_context(context)?; (ServiceName::CruciblePantry, port) } OmicronZoneType::Oximeter { address } => { - let port = parse_port(address); + let port = parse_port(address).with_context(context)?; (ServiceName::Oximeter, port) } OmicronZoneType::ExternalDns { http_address, .. } => { - let port = parse_port(http_address); + let port = parse_port(http_address).with_context(context)?; (ServiceName::ExternalDns, port) } OmicronZoneType::InternalDns { http_address, .. 
} => { - let port = parse_port(http_address); + let port = parse_port(http_address).with_context(context)?; (ServiceName::InternalDns, port) } }; @@ -322,7 +340,7 @@ pub fn blueprint_internal_dns_config( // whatever it was when this blueprint was generated. This will only be // used if the generated DNS contents are different from what's current. dns_builder.generation(blueprint.internal_dns_version.next()); - dns_builder.build() + Ok(dns_builder.build()) } pub fn blueprint_external_dns_config( @@ -520,7 +538,8 @@ mod test { &blueprint, &BTreeMap::new(), &Default::default(), - ); + ) + .unwrap(); assert!(blueprint_dns.sole_zone().unwrap().records.is_empty()); } @@ -621,7 +640,8 @@ mod test { &blueprint, &sleds_by_id, &Default::default(), - ); + ) + .unwrap(); assert_eq!( dns_config_blueprint.generation, u64::from(initial_dns_generation.next()) From c2041d8ab894e0a9285c9202506878826615b415 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 11 Mar 2024 17:04:40 -0700 Subject: [PATCH 31/34] commonize silo_dns_name() --- nexus/reconfigurator/execution/src/dns.rs | 6 ++---- nexus/reconfigurator/execution/src/lib.rs | 2 ++ nexus/src/app/external_endpoints.rs | 2 +- nexus/src/app/rack.rs | 2 +- nexus/src/app/silo.rs | 14 +------------- 5 files changed, 7 insertions(+), 19 deletions(-) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 40615590ba..e766fb5a04 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1487,6 +1487,7 @@ mod test { extra.into_iter().next().unwrap() } + #[allow(clippy::too_many_arguments)] async fn create_silo_and_verify_dns( cptestctx: &ControlPlaneTestContext, opctx: &OpContext, @@ -1578,14 +1579,11 @@ mod test { } } -// XXX-dap duplicated -- figure out where to put this /// Returns the (relative) DNS name for this Silo's API and console endpoints /// _within_ the external DNS zone (i.e., without that zone's suffix) /// /// This specific naming scheme is determined under RFD 357. -pub(crate) fn silo_dns_name( - name: &omicron_common::api::external::Name, -) -> String { +pub fn silo_dns_name(name: &omicron_common::api::external::Name) -> String { // RFD 4 constrains resource names (including Silo names) to DNS-safe // strings, which is why it's safe to directly put the name of the // resource into the DNS name rather than doing any kind of escaping. diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index a1f7d133e6..d7ed512400 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -20,6 +20,8 @@ use std::collections::BTreeMap; use std::net::SocketAddrV6; use uuid::Uuid; +pub use dns::silo_dns_name; + mod datasets; mod dns; mod omicron_zones; diff --git a/nexus/src/app/external_endpoints.rs b/nexus/src/app/external_endpoints.rs index bcfec667ce..25a9dd4e6c 100644 --- a/nexus/src/app/external_endpoints.rs +++ b/nexus/src/app/external_endpoints.rs @@ -26,7 +26,6 @@ //! "certificate resolver" object that impls //! [`rustls::server::ResolvesServerCert`]. See [`NexusCertResolver`]. 
-use super::silo::silo_dns_name; use crate::ServerContext; use anyhow::anyhow; use anyhow::bail; @@ -39,6 +38,7 @@ use nexus_db_queries::db::datastore::Discoverability; use nexus_db_queries::db::fixed_data::silo::SILO_ID; use nexus_db_queries::db::model::ServiceKind; use nexus_db_queries::db::DataStore; +use nexus_reconfigurator_execution::silo_dns_name; use nexus_types::identity::Resource; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::DataPageParams; diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 4030fce31d..4ba31bb0fe 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -4,7 +4,6 @@ //! Rack management -use super::silo::silo_dns_name; use crate::external_api::params; use crate::external_api::params::CertificateCreate; use crate::external_api::shared::ServiceUsingCertificate; @@ -20,6 +19,7 @@ use nexus_db_queries::db; use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder; use nexus_db_queries::db::datastore::RackInit; use nexus_db_queries::db::lookup::LookupPath; +use nexus_reconfigurator_execution::silo_dns_name; use nexus_types::external_api::params::Address; use nexus_types::external_api::params::AddressConfig; use nexus_types::external_api::params::AddressLotBlockCreate; diff --git a/nexus/src/app/silo.rs b/nexus/src/app/silo.rs index 07c3cc5b6b..487af96aab 100644 --- a/nexus/src/app/silo.rs +++ b/nexus/src/app/silo.rs @@ -16,6 +16,7 @@ use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::{self, lookup}; use nexus_db_queries::{authn, authz}; +use nexus_reconfigurator_execution::silo_dns_name; use nexus_types::internal_api::params::DnsRecord; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::ListResultVec; @@ -891,16 +892,3 @@ impl super::Nexus { LookupPath::new(opctx, &self.db_datastore).silo_group_id(*group_id) } } - -/// Returns the (relative) DNS name for this Silo's API and console endpoints -/// _within_ the external DNS zone (i.e., without that zone's suffix) -/// -/// This specific naming scheme is determined under RFD 357. -pub(crate) fn silo_dns_name( - name: &omicron_common::api::external::Name, -) -> String { - // RFD 4 constrains resource names (including Silo names) to DNS-safe - // strings, which is why it's safe to directly put the name of the - // resource into the DNS name rather than doing any kind of escaping. 
- format!("{}.sys", name) -} From 7028e0b921d8b6ab296e6e36899c002a3da801b2 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 13 Mar 2024 15:15:15 -0700 Subject: [PATCH 32/34] review feedback --- nexus/db-queries/src/db/datastore/silo.rs | 2 +- nexus/reconfigurator/execution/src/dns.rs | 78 +++++++++---------- nexus/reconfigurator/execution/src/lib.rs | 19 +++++ .../execution/src/overridables.rs | 12 ++- .../execution/src/resource_allocation.rs | 13 ++-- .../planning/src/blueprint_builder.rs | 6 +- .../src/app/background/blueprint_execution.rs | 1 - nexus/src/app/deployment.rs | 2 +- 8 files changed, 77 insertions(+), 56 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/silo.rs b/nexus/db-queries/src/db/datastore/silo.rs index 59c5e80232..0fd858b900 100644 --- a/nexus/db-queries/src/db/datastore/silo.rs +++ b/nexus/db-queries/src/db/datastore/silo.rs @@ -47,7 +47,7 @@ use ref_cast::RefCast; use uuid::Uuid; /// Filter a "silo_list" query based on silos' discoverability -#[derive(Clone, Copy)] +#[derive(Debug, Clone, Copy)] pub enum Discoverability { /// Show all Silos All, diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index e766fb5a04..4a0dfa236c 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Propagates internal DNS changes in a given blueprint +//! Propagates DNS changes in a given blueprint use crate::overridables::Overridables; use crate::Sled; @@ -84,7 +84,7 @@ pub(crate) async fn deploy_dns( let external_dns_config_blueprint = blueprint_external_dns_config( blueprint, &nexus_external_ips, - &silos.iter().collect::>(), + &silos, &nexus_external_dns_zone_names, ); @@ -346,7 +346,7 @@ pub fn blueprint_internal_dns_config( pub fn blueprint_external_dns_config( blueprint: &Blueprint, nexus_external_ips: &[IpAddr], - silos: &[&Silo], + silos: &[Silo], external_dns_zone_names: &[String], ) -> DnsConfigParams { let dns_records: Vec = nexus_external_ips @@ -435,12 +435,20 @@ fn dns_compute_update( Ok(Some(update)) } +/// Returns the (relative) DNS name for this Silo's API and console endpoints +/// _within_ the external DNS zone (i.e., without that zone's suffix) +/// +/// This specific naming scheme is determined under RFD 357. +pub fn silo_dns_name(name: &omicron_common::api::external::Name) -> String { + // RFD 4 constrains resource names (including Silo names) to DNS-safe + // strings, which is why it's safe to directly put the name of the + // resource into the DNS name rather than doing any kind of escaping. 
+ format!("{}.sys", name) +} + #[cfg(test)] mod test { - use super::blueprint_internal_dns_config; - use super::dns_compute_update; - use crate::dns::blueprint_external_dns_config; - use crate::dns::silo_dns_name; + use super::*; use crate::overridables::Overridables; use crate::Sled; use dns_service_client::DnsDiff; @@ -876,7 +884,7 @@ mod test { let external_dns_config = blueprint_external_dns_config( &blueprint, &nexus_external_ips, - &[&my_silo], + std::slice::from_ref(&my_silo), &[], ); assert_eq!( @@ -889,7 +897,7 @@ mod test { let external_dns_config = blueprint_external_dns_config( &blueprint, &[], - &[&my_silo], + std::slice::from_ref(&my_silo), &[String::from("oxide.test")], ); assert_eq!( @@ -903,7 +911,7 @@ mod test { let external_dns_config = blueprint_external_dns_config( &blueprint, &nexus_external_ips, - &[&my_silo], + std::slice::from_ref(&my_silo), &[String::from("oxide1.test"), String::from("oxide2.test")], ); assert_eq!( @@ -1196,7 +1204,8 @@ mod test { .await .expect("fetching initial external DNS"); - // Now, use it to construct an initial blueprint. + // Now, use the collection to construct an initial blueprint. + // This stores it into the database, too. info!(log, "using collection"; "collection_id" => %collection.id); let blueprint = nexus .blueprint_generate_from_collection(&opctx, collection.id) @@ -1204,9 +1213,22 @@ mod test { .expect("failed to generate initial blueprint"); eprintln!("blueprint: {:?}", blueprint); + // Set it as the current target. We'll need this later. + datastore + .blueprint_target_set_current( + &opctx, + BlueprintTarget { + target_id: blueprint.id, + enabled: false, + time_made_target: chrono::Utc::now(), + }, + ) + .await + .expect("failed to set blueprint as target"); + // Now, execute the blueprint. let overrides = Overridables::for_test(cptestctx); - crate::realize_blueprint( + crate::realize_blueprint_with_overrides( &opctx, datastore, &blueprint, @@ -1306,19 +1328,6 @@ mod test { // Set this blueprint as the current target. We set it to disabled // because we're controlling the execution directly here. But we need // to do this so that silo creation sees the change. - // - // Doing this requires writing the whole history of this blueprint. - datastore - .blueprint_target_set_current( - &opctx, - BlueprintTarget { - target_id: blueprint.id, - enabled: false, - time_made_target: chrono::Utc::now(), - }, - ) - .await - .expect("failed to set blueprint as target"); datastore .blueprint_insert(&opctx, &blueprint2) .await @@ -1335,7 +1344,7 @@ mod test { .await .expect("failed to set blueprint as target"); - crate::realize_blueprint( + crate::realize_blueprint_with_overrides( &opctx, datastore, &blueprint2, @@ -1410,7 +1419,7 @@ mod test { } // If we execute it again, we should see no more changes. - crate::realize_blueprint( + crate::realize_blueprint_with_overrides( &opctx, datastore, &blueprint2, @@ -1444,7 +1453,7 @@ mod test { // One more time, make sure that executing the blueprint does not do // anything. - crate::realize_blueprint( + crate::realize_blueprint_with_overrides( &opctx, datastore, &blueprint2, @@ -1536,7 +1545,7 @@ mod test { ); // If we execute the blueprint, DNS should not be changed. 
-        crate::realize_blueprint(
+        crate::realize_blueprint_with_overrides(
             &opctx,
             datastore,
             &blueprint,
@@ -1578,14 +1587,3 @@
         assert_eq!(old_external.generation, dns_latest_external.generation);
     }
 }
-
-/// Returns the (relative) DNS name for this Silo's API and console endpoints
-/// _within_ the external DNS zone (i.e., without that zone's suffix)
-///
-/// This specific naming scheme is determined under RFD 357.
-pub fn silo_dns_name(name: &omicron_common::api::external::Name) -> String {
-    // RFD 4 constrains resource names (including Silo names) to DNS-safe
-    // strings, which is why it's safe to directly put the name of the
-    // resource into the DNS name rather than doing any kind of escaping.
-    format!("{}.sys", name)
-}
diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs
index d7ed512400..ec7a7a1ba7 100644
--- a/nexus/reconfigurator/execution/src/lib.rs
+++ b/nexus/reconfigurator/execution/src/lib.rs
@@ -60,6 +60,25 @@ pub async fn realize_blueprint<S>(
     datastore: &DataStore,
     blueprint: &Blueprint,
     nexus_label: S,
+) -> Result<(), Vec<anyhow::Error>>
+where
+    String: From<S>,
+{
+    realize_blueprint_with_overrides(
+        opctx,
+        datastore,
+        blueprint,
+        nexus_label,
+        &Default::default(),
+    )
+    .await
+}
+
+pub async fn realize_blueprint_with_overrides<S>(
+    opctx: &OpContext,
+    datastore: &DataStore,
+    blueprint: &Blueprint,
+    nexus_label: S,
     overrides: &Overridables,
 ) -> Result<(), Vec<anyhow::Error>>
 where
diff --git a/nexus/reconfigurator/execution/src/overridables.rs b/nexus/reconfigurator/execution/src/overridables.rs
index bc3109adeb..5c4ce7dc6f 100644
--- a/nexus/reconfigurator/execution/src/overridables.rs
+++ b/nexus/reconfigurator/execution/src/overridables.rs
@@ -34,7 +34,8 @@ pub struct Overridables {
 
 impl Overridables {
     /// Specify the TCP port on which this sled's Dendrite is listening
-    pub fn override_dendrite_port(&mut self, sled_id: Uuid, port: u16) {
+    #[cfg(test)]
+    fn override_dendrite_port(&mut self, sled_id: Uuid, port: u16) {
         self.dendrite_ports.insert(sled_id, port);
     }
 
@@ -44,7 +45,8 @@ impl Overridables {
     }
 
     /// Specify the TCP port on which this sled's MGS is listening
-    pub fn override_mgs_port(&mut self, sled_id: Uuid, port: u16) {
+    #[cfg(test)]
+    fn override_mgs_port(&mut self, sled_id: Uuid, port: u16) {
         self.mgs_ports.insert(sled_id, port);
     }
 
@@ -54,7 +56,8 @@ impl Overridables {
     }
 
     /// Specify the TCP port on which this sled's MGD is listening
-    pub fn override_mgd_port(&mut self, sled_id: Uuid, port: u16) {
+    #[cfg(test)]
+    fn override_mgd_port(&mut self, sled_id: Uuid, port: u16) {
         self.mgd_ports.insert(sled_id, port);
     }
 
@@ -64,7 +67,8 @@ impl Overridables {
     }
 
     /// Specify the IP address of this switch zone
-    pub fn override_switch_zone_ip(&mut self, sled_id: Uuid, addr: Ipv6Addr) {
+    #[cfg(test)]
+    fn override_switch_zone_ip(&mut self, sled_id: Uuid, addr: Ipv6Addr) {
         self.switch_zone_ips.insert(sled_id, addr);
     }
 
diff --git a/nexus/reconfigurator/execution/src/resource_allocation.rs b/nexus/reconfigurator/execution/src/resource_allocation.rs
index caf3c4a2e1..5872cee0f9 100644
--- a/nexus/reconfigurator/execution/src/resource_allocation.rs
+++ b/nexus/reconfigurator/execution/src/resource_allocation.rs
@@ -93,9 +93,11 @@ impl<'a> ResourceAllocator<'a> {
         external_ip: IpAddr,
         port_range: Option<(u16, u16)>,
     ) -> anyhow::Result<bool> {
-        // Treat localhost as always allocated. We only use this in the test
-        // suite.
-        if external_ip.is_loopback() {
+        // localhost is used by many components in the test suite. We can't use
+        // the normal path because normally a given external IP must only be
+        // used once. Just treat localhost in the test suite as though it's
+        // already allocated. We do the same in is_nic_already_allocated().
+        if cfg!(test) && external_ip.is_loopback() {
             return Ok(true);
         }
 
@@ -163,9 +165,8 @@ impl<'a> ResourceAllocator<'a> {
         zone_id: Uuid,
         nic: &NetworkInterface,
     ) -> anyhow::Result<bool> {
-        // Treat localhost as always allocated. We only use this in the test
-        // suite.
-        if nic.ip.is_loopback() {
+        // See the comment in is_external_ip_already_allocated().
+        if cfg!(test) && nic.ip.is_loopback() {
             return Ok(true);
         }
 
diff --git a/nexus/reconfigurator/planning/src/blueprint_builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder.rs
index 0433280b09..1d31dbf1f9 100644
--- a/nexus/reconfigurator/planning/src/blueprint_builder.rs
+++ b/nexus/reconfigurator/planning/src/blueprint_builder.rs
@@ -259,9 +259,9 @@ impl<'a> BlueprintBuilder<'a> {
             }
         }
         if let Some(external_ip) = z.zone_type.external_ip()? {
-            // Ignore localhost. This is used in the test suite, and it
-            // gets reused many times.
-            if !external_ip.is_loopback()
+            // In the test suite, ignore localhost. It gets reused many
+            // times and that's okay.
+            if (!cfg!(test) || !external_ip.is_loopback())
                 && !used_external_ips.insert(external_ip)
             {
                 bail!("duplicate external IP: {external_ip}");
diff --git a/nexus/src/app/background/blueprint_execution.rs b/nexus/src/app/background/blueprint_execution.rs
index 49e24204d7..9402c321b5 100644
--- a/nexus/src/app/background/blueprint_execution.rs
+++ b/nexus/src/app/background/blueprint_execution.rs
@@ -74,7 +74,6 @@ impl BackgroundTask for BlueprintExecutor {
                 &self.datastore,
                 blueprint,
                 &self.nexus_label,
-                &Default::default(),
             )
             .await;
 
diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs
index c0d59f49b3..82cb497078 100644
--- a/nexus/src/app/deployment.rs
+++ b/nexus/src/app/deployment.rs
@@ -123,7 +123,7 @@ impl super::Nexus {
             &sled_rows,
             &zpool_rows,
             &ip_pool_range_rows,
-            NEXUS_REDUNDANCY + 1, // XXX-dap
+            NEXUS_REDUNDANCY,
         )?;
 
         // The choice of which inventory collection to use here is not

From a7064799e1c0e5b6fc41f77103e2b64d1f76e817 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Wed, 13 Mar 2024 15:49:36 -0700
Subject: [PATCH 33/34] implement schema update

---
 nexus/db-model/src/schema.rs | 2 +-
 schema/crdb/42.0.0/up1.sql   | 8 ++++++++
 schema/crdb/42.0.0/up2.sql   | 2 ++
 schema/crdb/dbinit.sql       | 3 +--
 4 files changed, 12 insertions(+), 3 deletions(-)
 create mode 100644 schema/crdb/42.0.0/up1.sql
 create mode 100644 schema/crdb/42.0.0/up2.sql

diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs
index 69b46682dc..bcbf7fa88f 100644
--- a/nexus/db-model/src/schema.rs
+++ b/nexus/db-model/src/schema.rs
@@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion;
 ///
 /// This should be updated whenever the schema is changed. For more details,
 /// refer to: schema/crdb/README.adoc
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(41, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(42, 0, 0);
 
 table! {
     disk (id) {
diff --git a/schema/crdb/42.0.0/up1.sql b/schema/crdb/42.0.0/up1.sql
new file mode 100644
index 0000000000..e8ae49c7c8
--- /dev/null
+++ b/schema/crdb/42.0.0/up1.sql
@@ -0,0 +1,8 @@
+-- Add the "external_dns_version" column to the "blueprint" table.
+-- This query will end up setting the external DNS version for any existing
+-- blueprints to 1. This is always safe because it's the smallest possible
+-- value and if a value is too small, the end result is simply needing to
+-- regenerate the blueprint in order to be able to execute it. (On the other
+-- hand, using a value that's too large could cause corruption.)
+ALTER TABLE omicron.public.blueprint
+    ADD COLUMN IF NOT EXISTS external_dns_version INT8 NOT NULL DEFAULT 1;
diff --git a/schema/crdb/42.0.0/up2.sql b/schema/crdb/42.0.0/up2.sql
new file mode 100644
index 0000000000..cc89117e1d
--- /dev/null
+++ b/schema/crdb/42.0.0/up2.sql
@@ -0,0 +1,2 @@
+ALTER TABLE omicron.public.blueprint
+    ALTER COLUMN external_dns_version DROP DEFAULT;
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index 2a91406d0f..9895464fb5 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -3161,7 +3161,6 @@ CREATE TABLE IF NOT EXISTS omicron.public.blueprint (
     -- identifies the latest internal DNS version when blueprint planning began
     internal_dns_version INT8 NOT NULL,
     -- identifies the latest external DNS version when blueprint planning began
-    -- XXX-dap migration code must set the value for existing blueprints
     external_dns_version INT8 NOT NULL
 );
 
@@ -3587,7 +3586,7 @@ INSERT INTO omicron.public.db_metadata (
     version,
     target_version
 ) VALUES
-    ( TRUE, NOW(), NOW(), '41.0.0', NULL)
+    ( TRUE, NOW(), NOW(), '42.0.0', NULL)
 ON CONFLICT DO NOTHING;
 
 COMMIT;

From ef535d43e984ec3486e19aaeacae2de46f3a2823 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Wed, 13 Mar 2024 16:36:24 -0700
Subject: [PATCH 34/34] fix test

---
 nexus/reconfigurator/execution/src/dns.rs              | 10 +++++++++-
 nexus/reconfigurator/planning/src/blueprint_builder.rs |  7 ++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index efce18f2ad..9e0434c59f 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -456,6 +456,8 @@ mod test {
     use internal_dns::DNS_ZONE;
     use nexus_db_model::DnsGroup;
     use nexus_db_model::Silo;
+    use nexus_db_queries::authn;
+    use nexus_db_queries::authz;
     use nexus_db_queries::context::OpContext;
     use nexus_db_queries::db::DataStore;
     use nexus_inventory::CollectionBuilder;
@@ -503,6 +505,7 @@ mod test {
     use std::net::Ipv6Addr;
     use std::net::SocketAddrV6;
     use std::str::FromStr;
+    use std::sync::Arc;
     use std::time::Duration;
     use uuid::Uuid;
 
@@ -1161,7 +1164,12 @@ mod test {
         let nexus = &cptestctx.server.apictx().nexus;
         let datastore = nexus.datastore();
         let log = &cptestctx.logctx.log;
-        let opctx = OpContext::for_tests(log.clone(), datastore.clone());
+        let opctx = OpContext::for_background(
+            log.clone(),
+            Arc::new(authz::Authz::new(log)),
+            authn::Context::internal_api(),
+            datastore.clone(),
+        );
 
         // First, wait until Nexus has successfully completed an inventory
         // collection.
diff --git a/nexus/reconfigurator/planning/src/blueprint_builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder.rs
index 2257281a58..8b0d440d26 100644
--- a/nexus/reconfigurator/planning/src/blueprint_builder.rs
+++ b/nexus/reconfigurator/planning/src/blueprint_builder.rs
@@ -306,9 +306,10 @@ impl<'a> BlueprintBuilder<'a> {
             }
         }
         if let Some(external_ip) = z.zone_type.external_ip()? {
-            // In the test suite, ignore localhost. It gets reused many
-            // times and that's okay.
+            // For the test suite, ignore localhost. It gets reused many
+            // times and that's okay. We don't expect to see localhost
+            // outside the test suite.
-            if (!cfg!(test) || !external_ip.is_loopback())
+            if !external_ip.is_loopback()
                 && !used_external_ips.insert(external_ip)
             {
                 bail!("duplicate external IP: {external_ip}");