From 6bc5e6062df7bffbe0eca1465b9d116ff7849e9f Mon Sep 17 00:00:00 2001
From: artemis everfree
Date: Mon, 2 Oct 2023 19:29:36 -0700
Subject: [PATCH] RandomWithDistinctSleds region allocation strategy (#3858)

PR #3650 introduced the Random region allocation strategy to allocate
regions randomly across the rack. This expands on that with the
addition of the RandomWithDistinctSleds region allocation strategy.
This strategy is the same, but requires that the 3 Crucible regions be
allocated on 3 different sleds to improve resiliency against a
whole-sled failure. The Random strategy still exists and does not
require 3 distinct sleds, which is useful in one-sled environments
such as the integration tests and lab setups.

This also fixes a shortcoming of #3650 whereby multiple datasets on a
single zpool could be selected. That fix applies to both the old
Random strategy and the new RandomWithDistinctSleds strategy.

For now, unit tests verify that the allocation behavior works
correctly against CockroachDB, and we can try it out on dogfood.

Adds the `-r` / `--rack-topology` command line argument to
`omicron-package target create`. Use it to specify whether you are
packaging for a single-sled or multi-sled environment. In single-sled
environments, the requirement for 3 distinct sleds is removed.

Fixes #3702

---------

Co-authored-by: iliana etaoin
---
 .github/buildomat/jobs/deploy.sh              |   1 +
 .github/buildomat/jobs/package.sh             |   9 +-
 .github/buildomat/jobs/tuf-repo.sh            |   3 +-
 .github/workflows/rust.yml                    |   2 +-
 common/src/nexus_config.rs                    |  74 ++++-
 docs/how-to-run.adoc                          |  34 ++-
 installinator/Cargo.toml                      |   1 +
 .../db-model/src/queries/region_allocation.rs |  22 ++
 nexus/db-queries/src/db/datastore/mod.rs      | 277 +++++++++++++-----
 nexus/db-queries/src/db/datastore/region.rs   |   2 +-
 .../src/db/queries/region_allocation.rs       | 242 +++++++++------
 nexus/examples/config.toml                    |  11 +
 nexus/src/app/mod.rs                          |   8 +
 nexus/src/app/sagas/disk_create.rs            |   8 +-
 nexus/src/app/sagas/snapshot_create.rs        |   7 +-
 nexus/tests/config.test.toml                  |   5 +
 package-manifest.toml                         |   3 +-
 package/src/bin/omicron-package.rs            |   3 +-
 package/src/lib.rs                            |  23 ++
 package/src/target.rs                         |  31 +-
 sled-agent/Cargo.toml                         |   2 +
 sled-agent/src/services.rs                    |   3 +
 smf/nexus/multi-sled/config-partial.toml      |  45 +++
 .../{ => single-sled}/config-partial.toml     |   5 +
 24 files changed, 617 insertions(+), 204 deletions(-)
 create mode 100644 smf/nexus/multi-sled/config-partial.toml
 rename smf/nexus/{ => single-sled}/config-partial.toml (86%)

diff --git a/.github/buildomat/jobs/deploy.sh b/.github/buildomat/jobs/deploy.sh
index 5d3dd8ec39..c2579d98ea 100755
--- a/.github/buildomat/jobs/deploy.sh
+++ b/.github/buildomat/jobs/deploy.sh
@@ -143,6 +143,7 @@ cd /opt/oxide/work

 ptime -m tar xvzf /input/package/work/package.tar.gz
 cp /input/package/work/zones/* out/
+mv out/omicron-nexus-single-sled.tar.gz out/omicron-nexus.tar.gz
 mkdir tests
 for p in /input/ci-tools/work/end-to-end-tests/*.gz; do
 	ptime -m gunzip < "$p" > "tests/$(basename "${p%.gz}")"
diff --git a/.github/buildomat/jobs/package.sh b/.github/buildomat/jobs/package.sh
index fe5d6a9b7f..64c087524e 100755
--- a/.github/buildomat/jobs/package.sh
+++ b/.github/buildomat/jobs/package.sh
@@ -45,7 +45,7 @@ ptime -m ./tools/ci_download_softnpu_machinery

 # Build the test target
 ptime -m cargo run --locked --release --bin omicron-package -- \
-  -t test target create -i standard -m non-gimlet -s softnpu
+  -t test target create -i standard -m non-gimlet -s softnpu -r single-sled
 ptime -m cargo run --locked --release --bin
omicron-package -- \ -t test package @@ -81,9 +81,13 @@ stamp_packages() { done } +# Keep the single-sled Nexus zone around for the deploy job. (The global zone +# build below overwrites the file.) +mv out/omicron-nexus.tar.gz out/omicron-nexus-single-sled.tar.gz + # Build necessary for the global zone ptime -m cargo run --locked --release --bin omicron-package -- \ - -t host target create -i standard -m gimlet -s asic + -t host target create -i standard -m gimlet -s asic -r multi-sled ptime -m cargo run --locked --release --bin omicron-package -- \ -t host package stamp_packages omicron-sled-agent maghemite propolis-server overlay @@ -111,6 +115,7 @@ zones=( out/external-dns.tar.gz out/internal-dns.tar.gz out/omicron-nexus.tar.gz + out/omicron-nexus-single-sled.tar.gz out/oximeter-collector.tar.gz out/propolis-server.tar.gz out/switch-*.tar.gz diff --git a/.github/buildomat/jobs/tuf-repo.sh b/.github/buildomat/jobs/tuf-repo.sh index fab6770564..e169bebff6 100644 --- a/.github/buildomat/jobs/tuf-repo.sh +++ b/.github/buildomat/jobs/tuf-repo.sh @@ -77,10 +77,11 @@ done mkdir /work/package pushd /work/package tar xf /input/package/work/package.tar.gz out package-manifest.toml target/release/omicron-package -target/release/omicron-package -t default target create -i standard -m gimlet -s asic +target/release/omicron-package -t default target create -i standard -m gimlet -s asic -r multi-sled ln -s /input/package/work/zones/* out/ rm out/switch-softnpu.tar.gz # not used when target switch=asic rm out/omicron-gateway-softnpu.tar.gz # not used when target switch=asic +rm out/omicron-nexus-single-sled.tar.gz # only used for deploy tests for zone in out/*.tar.gz; do target/release/omicron-package stamp "$(basename "${zone%.tar.gz}")" "$VERSION" done diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 722aacbe0f..f5cf1dc885 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,7 +41,7 @@ jobs: - name: Install Pre-Requisites run: ./tools/install_builder_prerequisites.sh -y - name: Set default target - run: cargo run --bin omicron-package -- -t default target create + run: cargo run --bin omicron-package -- -t default target create -r single-sled - name: Check build of deployed Omicron packages run: cargo run --bin omicron-package -- -t default check diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs index 73ccec996c..ad62c34f92 100644 --- a/common/src/nexus_config.rs +++ b/common/src/nexus_config.rs @@ -372,6 +372,8 @@ pub struct PackageConfig { pub dendrite: HashMap, /// Background task configuration pub background_tasks: BackgroundTaskConfig, + /// Default Crucible region allocation strategy + pub default_region_allocation_strategy: RegionAllocationStrategy, } #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] @@ -594,6 +596,9 @@ mod test { dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 external_endpoints.period_secs = 9 + [default_region_allocation_strategy] + type = "random" + seed = 0 "##, ) .unwrap(); @@ -677,6 +682,10 @@ mod test { period_secs: Duration::from_secs(9), } }, + default_region_allocation_strategy: + crate::nexus_config::RegionAllocationStrategy::Random { + seed: Some(0) + } }, } ); @@ -724,6 +733,8 @@ mod test { dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 external_endpoints.period_secs = 9 + [default_region_allocation_strategy] + type = "random" "##, ) .unwrap(); @@ -864,25 +875,31 @@ mod test { struct DummyConfig { 
         deployment: DeploymentConfig,
     }
-        let config_path = "../smf/nexus/config-partial.toml";
-        println!(
-            "checking {:?} with example deployment section added",
-            config_path
-        );
-        let mut contents = std::fs::read_to_string(config_path)
-            .expect("failed to read Nexus SMF config file");
-        contents.push_str(
-            "\n\n\n \
-            # !! content below added by test_repo_configs_are_valid()\n\
-            \n\n\n",
-        );
         let example_deployment = toml::to_string_pretty(&DummyConfig {
             deployment: example_config.deployment,
         })
         .unwrap();
-        contents.push_str(&example_deployment);
-        let _: Config = toml::from_str(&contents)
-            .expect("Nexus SMF config file is not valid");
+
+        let nexus_config_paths = [
+            "../smf/nexus/single-sled/config-partial.toml",
+            "../smf/nexus/multi-sled/config-partial.toml",
+        ];
+        for config_path in nexus_config_paths {
+            println!(
+                "checking {:?} with example deployment section added",
+                config_path
+            );
+            let mut contents = std::fs::read_to_string(config_path)
+                .expect("failed to read Nexus SMF config file");
+            contents.push_str(
+                "\n\n\n \
+                # !! content below added by test_repo_configs_are_valid()\n\
+                \n\n\n",
+            );
+            contents.push_str(&example_deployment);
+            let _: Config = toml::from_str(&contents)
+                .expect("Nexus SMF config file is not valid");
+        }
     }

     #[test]
@@ -894,3 +911,30 @@ mod test {
         );
     }
 }
+
+/// Defines a strategy for choosing what physical disks to use when allocating
+/// new crucible regions.
+///
+/// NOTE: More strategies can - and should! - be added.
+///
+/// See for a more
+/// complete discussion.
+///
+/// Longer-term, we should consider:
+/// - Storage size + remaining free space
+/// - Sled placement of datasets
+/// - What sort of loads we'd like to create (even split across all disks
+///   may not be preferable, especially if maintenance is expected)
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum RegionAllocationStrategy {
+    /// Choose disks pseudo-randomly. An optional seed may be provided to make
+    /// the ordering deterministic, otherwise the current time in nanoseconds
+    /// will be used. Ordering is based on sorting the output of `md5(UUID of
+    /// candidate dataset + seed)`. The seed does not need to come from a
+    /// cryptographically secure source.
+    Random { seed: Option<u64> },
+
+    /// Like Random, but ensures that each region is allocated on its own sled.
+    RandomWithDistinctSleds { seed: Option<u64> },
+}
diff --git a/docs/how-to-run.adoc b/docs/how-to-run.adoc
index 1988a42669..7539c5183f 100644
--- a/docs/how-to-run.adoc
+++ b/docs/how-to-run.adoc
@@ -321,10 +321,32 @@ Error: Creates a new build target, and sets it as "active"
 Usage: omicron-package target create [OPTIONS]

 Options:
-  -i, --image <IMAGE>      [default: standard] [possible values: standard, trampoline]
-  -m, --machine <MACHINE>  [possible values: gimlet, gimlet-standalone, non-gimlet]
-  -s, --switch <SWITCH>    [possible values: asic, stub, softnpu]
-  -h, --help               Print help (see more with '--help')
+  -i, --image <IMAGE>
+      [default: standard]
+
+      Possible values:
+      - standard:   A typical host OS image
+      - trampoline: A recovery host OS image, intended to bootstrap a Standard image
+
+  -m, --machine <MACHINE>
+      Possible values:
+      - gimlet:            Use sled agent configuration for a Gimlet
+      - gimlet-standalone: Use sled agent configuration for a Gimlet running in isolation
+      - non-gimlet:        Use sled agent configuration for a device emulating a Gimlet
+
+  -s, --switch <SWITCH>
+      Possible values:
+      - asic:    Use the "real" Dendrite, that attempts to interact with the Tofino
+      - stub:    Use a "stub" Dendrite that does not require any real hardware
+      - softnpu: Use a "softnpu" Dendrite that uses the SoftNPU asic emulator
+
+  -r, --rack-topology <RACK_TOPOLOGY>
+      Possible values:
+      - multi-sled:  Use configurations suitable for a multi-sled deployment, such as dogfood and production racks
+      - single-sled: Use configurations suitable for a single-sled deployment, such as CI and dev machines
+
+  -h, --help
+      Print help (see a summary with '-h')
 ----

@@ -332,9 +354,9 @@ To set up a build target for a non-Gimlet machine with simulated (but fully func
 [source,console]
 ----
-$ cargo run --release --bin omicron-package -- -t default target create -i standard -m non-gimlet -s softnpu
+$ cargo run --release --bin omicron-package -- -t default target create -i standard -m non-gimlet -s softnpu -r single-sled
     Finished release [optimized] target(s) in 0.66s
-     Running `target/release/omicron-package -t default target create -i standard -m non-gimlet -s softnpu`
+     Running `target/release/omicron-package -t default target create -i standard -m non-gimlet -s softnpu -r single-sled`
 Created new build target 'default' and set it as active
 ----

diff --git a/installinator/Cargo.toml b/installinator/Cargo.toml
index 3b2f04c38f..428ea0d08e 100644
--- a/installinator/Cargo.toml
+++ b/installinator/Cargo.toml
@@ -57,3 +57,4 @@ tokio-stream.workspace = true
 [features]
 image-standard = []
 image-trampoline = []
+rack-topology-single-sled = []
\ No newline at end of file
diff --git a/nexus/db-model/src/queries/region_allocation.rs b/nexus/db-model/src/queries/region_allocation.rs
index 43fac3c9a6..2025e79fb8 100644
--- a/nexus/db-model/src/queries/region_allocation.rs
+++ b/nexus/db-model/src/queries/region_allocation.rs
@@ -47,6 +47,13 @@ table! {
     }
 }

+table! {
+    shuffled_candidate_datasets {
+        id -> Uuid,
+        pool_id -> Uuid,
+    }
+}
+
 table! {
     candidate_regions {
         id -> Uuid,
@@ -89,6 +96,19 @@ table! {
     }
 }

+table! {
+    one_zpool_per_sled (pool_id) {
+        pool_id -> Uuid
+    }
+}
+
+table! {
+    one_dataset_per_zpool {
+        id -> Uuid,
+        pool_id -> Uuid
+    }
+}
+
 table!
{
     inserted_regions {
         id -> Uuid,
@@ -141,6 +161,7 @@ diesel::allow_tables_to_appear_in_same_query!(
 );

 diesel::allow_tables_to_appear_in_same_query!(old_regions, dataset,);
+diesel::allow_tables_to_appear_in_same_query!(old_regions, zpool,);

 diesel::allow_tables_to_appear_in_same_query!(
     inserted_regions,
@@ -149,6 +170,7 @@ diesel::allow_tables_to_appear_in_same_query!(

 diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, dataset,);
 diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, zpool,);
+diesel::allow_tables_to_appear_in_same_query!(candidate_datasets, dataset);

 // == Needed for random region allocation ==
diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs
index ff1df710bb..b1f3203c60 100644
--- a/nexus/db-queries/src/db/datastore/mod.rs
+++ b/nexus/db-queries/src/db/datastore/mod.rs
@@ -307,43 +307,6 @@ pub enum UpdatePrecondition<T> {
     Value(T),
 }

-/// Defines a strategy for choosing what physical disks to use when allocating
-/// new crucible regions.
-///
-/// NOTE: More strategies can - and should! - be added.
-///
-/// See for a more
-/// complete discussion.
-///
-/// Longer-term, we should consider:
-/// - Storage size + remaining free space
-/// - Sled placement of datasets
-/// - What sort of loads we'd like to create (even split across all disks
-///   may not be preferable, especially if maintenance is expected)
-#[derive(Debug, Clone)]
-pub enum RegionAllocationStrategy {
-    /// Choose disks that have the least data usage in the rack. This strategy
-    /// can lead to bad failure states wherein the disks with the least usage
-    /// have the least usage because regions on them are actually failing in
-    /// some way. Further retried allocations will then continue to try to
-    /// allocate onto the disk, perpetuating the problem. Currently this
-    /// strategy only exists so we can test that using different allocation
-    /// strategies actually results in different allocation patterns, hence the
-    /// `#[cfg(test)]`.
-    ///
-    /// See https://github.com/oxidecomputer/omicron/issues/3416 for more on the
-    /// failure-states associated with this strategy
-    #[cfg(test)]
-    LeastUsedDisk,
-
-    /// Choose disks pseudo-randomly. An optional seed may be provided to make
-    /// the ordering deterministic, otherwise the current time in nanoseconds
-    /// will be used. Ordering is based on sorting the output of `md5(UUID of
-    /// candidate dataset + seed)`. The seed does not need to come from a
-    /// cryptographically secure source.
-    Random(Option<u128>),
-}
-
 /// Constructs a DataStore for use in test suites that has preloaded the
 /// built-in users, roles, and role assignments that are needed for basic
 /// operation
@@ -421,7 +384,9 @@ mod test {
     use omicron_common::api::external::{
         self, ByteCount, Error, IdentityMetadataCreateParams, LookupType, Name,
     };
+    use omicron_common::nexus_config::RegionAllocationStrategy;
     use omicron_test_utils::dev;
+    use std::collections::HashMap;
     use std::collections::HashSet;
     use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddrV6};
     use std::num::NonZeroU32;
@@ -704,12 +669,18 @@ mod test {
         }
     }

+    struct TestDataset {
+        sled_id: Uuid,
+        dataset_id: Uuid,
+    }
+
     async fn create_test_datasets_for_region_allocation(
         opctx: &OpContext,
         datastore: Arc<DataStore>,
-    ) -> Vec<Uuid> {
+        number_of_sleds: usize,
+    ) -> Vec<TestDataset> {
         // Create sleds...
-        let sled_ids: Vec<Uuid> = stream::iter(0..REGION_REDUNDANCY_THRESHOLD)
+        let sled_ids: Vec<Uuid> = stream::iter(0..number_of_sleds)
             .then(|_| create_test_sled(&datastore))
             .collect()
             .await;
@@ -740,48 +711,69 @@ mod test {
             .collect()
             .await;

+        #[derive(Copy, Clone)]
+        struct Zpool {
+            sled_id: Uuid,
+            pool_id: Uuid,
+        }
+
         // 1 pool per disk
-        let zpool_ids: Vec<Uuid> = stream::iter(physical_disks)
+        let zpools: Vec<Zpool> = stream::iter(physical_disks)
             .then(|disk| {
-                create_test_zpool(&datastore, disk.sled_id, disk.disk_id)
+                let pool_id_future =
+                    create_test_zpool(&datastore, disk.sled_id, disk.disk_id);
+                async move {
+                    let pool_id = pool_id_future.await;
+                    Zpool { sled_id: disk.sled_id, pool_id }
+                }
             })
             .collect()
             .await;

         let bogus_addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0);

-        // 1 dataset per zpool
-        let dataset_ids: Vec<Uuid> = stream::iter(zpool_ids)
-            .then(|zpool_id| {
-                let id = Uuid::new_v4();
-                let dataset = Dataset::new(
-                    id,
-                    zpool_id,
-                    bogus_addr,
-                    DatasetKind::Crucible,
-                );
-                let datastore = datastore.clone();
-                async move {
-                    datastore.dataset_upsert(dataset).await.unwrap();
-                    id
-                }
+        let datasets: Vec<TestDataset> = stream::iter(zpools)
+            .map(|zpool| {
+                // 3 datasets per zpool, to test that pools are distinct
+                let zpool_iter: Vec<Zpool> = (0..3).map(|_| zpool).collect();
+                stream::iter(zpool_iter).then(|zpool| {
+                    let id = Uuid::new_v4();
+                    let dataset = Dataset::new(
+                        id,
+                        zpool.pool_id,
+                        bogus_addr,
+                        DatasetKind::Crucible,
+                    );
+
+                    let datastore = datastore.clone();
+                    async move {
+                        datastore.dataset_upsert(dataset).await.unwrap();
+
+                        TestDataset { sled_id: zpool.sled_id, dataset_id: id }
+                    }
+                })
             })
+            .flatten()
             .collect()
             .await;

-        dataset_ids
+        datasets
     }

     #[tokio::test]
     /// Note that this test is currently non-deterministic. It can be made
     /// deterministic by generating deterministic *dataset* Uuids. The sled and
     /// pool IDs should not matter.
-    async fn test_region_allocation() {
-        let logctx = dev::test_setup_log("test_region_allocation");
+    async fn test_region_allocation_strat_random() {
+        let logctx = dev::test_setup_log("test_region_allocation_strat_random");
         let mut db = test_setup_database(&logctx.log).await;
         let (opctx, datastore) = datastore_test(&logctx, &db).await;
-        create_test_datasets_for_region_allocation(&opctx, datastore.clone())
-            .await;
+        create_test_datasets_for_region_allocation(
+            &opctx,
+            datastore.clone(),
+            REGION_REDUNDANCY_THRESHOLD,
+        )
+        .await;

         // Allocate regions from the datasets for this disk. Do it a few times
         // for good measure.
@@ -799,7 +791,9 @@ mod test {
                     volume_id,
                     &params.disk_source,
                     params.size,
-                    &RegionAllocationStrategy::Random(Some(alloc_seed as u128)),
+                    &RegionAllocationStrategy::Random {
+                        seed: Some(alloc_seed),
+                    },
                 )
                 .await
                 .unwrap();
@@ -809,8 +803,81 @@ mod test {
             let mut disk_datasets = HashSet::new();
             let mut disk_zpools = HashSet::new();

-            // TODO: When allocation chooses 3 distinct sleds, uncomment this.
-            // let mut disk1_sleds = HashSet::new();
+            for (dataset, region) in dataset_and_regions {
+                // Must be 3 unique datasets
+                assert!(disk_datasets.insert(dataset.id()));
+
+                // Must be 3 unique zpools
+                assert!(disk_zpools.insert(dataset.pool_id));
+
+                assert_eq!(volume_id, region.volume_id());
+                assert_eq!(ByteCount::from(4096), region.block_size());
+                let (_, extent_count) = DataStore::get_crucible_allocation(
+                    &BlockSize::AdvancedFormat,
+                    params.size,
+                );
+                assert_eq!(extent_count, region.extent_count());
+            }
+        }
+
+        let _ = db.cleanup().await;
+        logctx.cleanup_successful();
+    }
+
+    #[tokio::test]
+    /// Test the [`RegionAllocationStrategy::RandomWithDistinctSleds`] strategy.
+    /// It should always pick datasets where no two datasets are on the same
+    /// zpool and no two zpools are on the same sled.
+    async fn test_region_allocation_strat_random_with_distinct_sleds() {
+        let logctx = dev::test_setup_log(
+            "test_region_allocation_strat_random_with_distinct_sleds",
+        );
+        let mut db = test_setup_database(&logctx.log).await;
+        let (opctx, datastore) = datastore_test(&logctx, &db).await;
+
+        // Create a rack with enough sleds for a successful allocation when we
+        // require 3 distinct sleds.
+        let test_datasets = create_test_datasets_for_region_allocation(
+            &opctx,
+            datastore.clone(),
+            REGION_REDUNDANCY_THRESHOLD,
+        )
+        .await;
+
+        // We need to check that our datasets end up on 3 distinct sleds, but
+        // the query doesn't return the sled ID, so we reverse map from
+        // dataset ID to sled ID.
+        let sled_id_map: HashMap<Uuid, Uuid> = test_datasets
+            .into_iter()
+            .map(|test_dataset| (test_dataset.dataset_id, test_dataset.sled_id))
+            .collect();
+
+        // Allocate regions from the datasets for this disk. Do it a few times
+        // for good measure.
+        for alloc_seed in 0..10 {
+            let params = create_test_disk_create_params(
+                &format!("disk{}", alloc_seed),
+                ByteCount::from_mebibytes_u32(1),
+            );
+            let volume_id = Uuid::new_v4();
+
+            let expected_region_count = REGION_REDUNDANCY_THRESHOLD;
+            let dataset_and_regions = datastore
+                .region_allocate(
+                    &opctx,
+                    volume_id,
+                    &params.disk_source,
+                    params.size,
+                    &RegionAllocationStrategy::RandomWithDistinctSleds {
+                        seed: Some(alloc_seed),
+                    },
+                )
+                .await
+                .unwrap();
+
+            // Verify the allocation.
+            assert_eq!(expected_region_count, dataset_and_regions.len());
+            let mut disk_datasets = HashSet::new();
+            let mut disk_zpools = HashSet::new();
+            let mut disk_sleds = HashSet::new();
             for (dataset, region) in dataset_and_regions {
                 // Must be 3 unique datasets
                 assert!(disk_datasets.insert(dataset.id()));
@@ -819,8 +886,8 @@ mod test {
                 assert!(disk_zpools.insert(dataset.pool_id));

                 // Must be 3 unique sleds
-                // TODO: When allocation chooses 3 distinct sleds, uncomment this.
-                // assert!(disk1_sleds.insert(Err(dataset)));
+                let sled_id = sled_id_map.get(&dataset.id()).unwrap();
+                assert!(disk_sleds.insert(*sled_id));

                 assert_eq!(volume_id, region.volume_id());
                 assert_eq!(ByteCount::from(4096), region.block_size());
@@ -836,14 +903,72 @@ mod test {
         logctx.cleanup_successful();
     }

+    #[tokio::test]
+    /// Ensure the [`RegionAllocationStrategy::RandomWithDistinctSleds`]
+    /// strategy fails when there aren't enough distinct sleds.
+    async fn test_region_allocation_strat_random_with_distinct_sleds_fails() {
+        let logctx = dev::test_setup_log(
+            "test_region_allocation_strat_random_with_distinct_sleds_fails",
+        );
+        let mut db = test_setup_database(&logctx.log).await;
+        let (opctx, datastore) = datastore_test(&logctx, &db).await;
+
+        // Create a rack without enough sleds for a successful allocation when
+        // we require 3 distinct sleds.
+        create_test_datasets_for_region_allocation(
+            &opctx,
+            datastore.clone(),
+            REGION_REDUNDANCY_THRESHOLD - 1,
+        )
+        .await;
+
+        // Allocate regions from the datasets for this disk. Do it a few times
+        // for good measure.
+        for alloc_seed in 0..10 {
+            let params = create_test_disk_create_params(
+                &format!("disk{}", alloc_seed),
+                ByteCount::from_mebibytes_u32(1),
+            );
+            let volume_id = Uuid::new_v4();
+
+            let err = datastore
+                .region_allocate(
+                    &opctx,
+                    volume_id,
+                    &params.disk_source,
+                    params.size,
+                    &RegionAllocationStrategy::RandomWithDistinctSleds {
+                        seed: Some(alloc_seed),
+                    },
+                )
+                .await
+                .unwrap_err();
+
+            let expected = "Not enough zpool space to allocate disks";
+            assert!(
+                err.to_string().contains(expected),
+                "Saw error: \'{err}\', but expected \'{expected}\'"
+            );
+
+            assert!(matches!(err, Error::ServiceUnavailable { .. }));
+        }
+
+        let _ = db.cleanup().await;
+        logctx.cleanup_successful();
+    }
+
     #[tokio::test]
     async fn test_region_allocation_is_idempotent() {
         let logctx =
             dev::test_setup_log("test_region_allocation_is_idempotent");
         let mut db = test_setup_database(&logctx.log).await;
         let (opctx, datastore) = datastore_test(&logctx, &db).await;
-        create_test_datasets_for_region_allocation(&opctx, datastore.clone())
-            .await;
+        create_test_datasets_for_region_allocation(
+            &opctx,
+            datastore.clone(),
+            REGION_REDUNDANCY_THRESHOLD,
+        )
+        .await;

         // Allocate regions from the datasets for this volume.
         let params = create_test_disk_create_params(
@@ -857,7 +982,7 @@ mod test {
                 volume_id,
                 &params.disk_source,
                 params.size,
-                &RegionAllocationStrategy::Random(Some(0)),
+                &RegionAllocationStrategy::Random { seed: Some(0) },
             )
             .await
             .unwrap();
@@ -870,7 +995,7 @@ mod test {
                 volume_id,
                 &params.disk_source,
                 params.size,
-                &RegionAllocationStrategy::Random(Some(1)),
+                &RegionAllocationStrategy::Random { seed: Some(1) },
             )
             .await
             .unwrap();
@@ -959,7 +1084,7 @@ mod test {
                 volume1_id,
                 &params.disk_source,
                 params.size,
-                &RegionAllocationStrategy::Random(Some(0)),
+                &RegionAllocationStrategy::Random { seed: Some(0) },
             )
             .await
             .unwrap_err();
@@ -983,8 +1108,12 @@ mod test {
         let mut db = test_setup_database(&logctx.log).await;
         let (opctx, datastore) = datastore_test(&logctx, &db).await;
-        create_test_datasets_for_region_allocation(&opctx, datastore.clone())
-            .await;
+        create_test_datasets_for_region_allocation(
+            &opctx,
+            datastore.clone(),
+            REGION_REDUNDANCY_THRESHOLD,
+        )
+        .await;

         let disk_size = test_zpool_size();
         let alloc_size = ByteCount::try_from(disk_size.to_bytes() * 2).unwrap();
@@ -997,7 +1126,7 @@ mod test {
                 volume1_id,
                 &params.disk_source,
                 params.size,
-                &RegionAllocationStrategy::Random(Some(0)),
+                &RegionAllocationStrategy::Random { seed: Some(0) },
             )
             .await
             .is_err());
diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs
index 5bc79b9481..9465fe2792 100644
--- a/nexus/db-queries/src/db/datastore/region.rs
+++ b/nexus/db-queries/src/db/datastore/region.rs
@@ -5,7 +5,6 @@
 //! [`DataStore`] methods on [`Region`]s.

 use super::DataStore;
-use super::RegionAllocationStrategy;
 use super::RunnableQuery;
 use crate::context::OpContext;
 use crate::db;
@@ -23,6 +22,7 @@ use omicron_common::api::external;
 use omicron_common::api::external::DeleteResult;
 use omicron_common::api::external::Error;
 use omicron_common::backoff::{self, BackoffError};
+use omicron_common::nexus_config::RegionAllocationStrategy;
 use slog::Logger;
 use uuid::Uuid;

diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index b071ee3f44..7f7b2ea9bf 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -6,7 +6,6 @@

 use crate::db::alias::ExpressionAlias;
 use crate::db::cast_uuid_as_bytea::CastUuidToBytea;
-use crate::db::datastore::RegionAllocationStrategy;
 use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD;
 use crate::db::model::{Dataset, DatasetKind, Region};
 use crate::db::pool::DbConnection;
@@ -24,10 +23,11 @@ use diesel::{
 use nexus_db_model::queries::region_allocation::{
     candidate_datasets, candidate_regions, candidate_zpools, cockroach_md5,
     do_insert, inserted_regions, old_regions, old_zpool_usage,
-    proposed_dataset_changes, updated_datasets,
+    proposed_dataset_changes, shuffled_candidate_datasets, updated_datasets,
 };
 use nexus_db_model::schema;
 use omicron_common::api::external;
+use omicron_common::nexus_config::RegionAllocationStrategy;

 const NOT_ENOUGH_DATASETS_SENTINEL: &'static str = "Not enough datasets";
 const NOT_ENOUGH_ZPOOL_SPACE_SENTINEL: &'static str = "Not enough space";
@@ -53,7 +53,7 @@ pub fn from_diesel(e: async_bb8_diesel::ConnectionError) -> external::Error {
         }
         NOT_ENOUGH_ZPOOL_SPACE_SENTINEL => {
             return external::Error::unavail(
-                "Not enough zpool space to allocate disks",
+                "Not enough zpool space to allocate disks. There may not be enough disks with space for the requested region. You may also see this if your rack is in a degraded state, or you're running the default multi-sled topology configuration in a 1-sled development environment.",
             );
         }
         NOT_ENOUGH_UNIQUE_ZPOOLS_SENTINEL => {
@@ -91,6 +91,8 @@ impl OldRegions {
 /// This implicitly distinguishes between "M.2s" and "U.2s" -- Nexus needs to
 /// determine during dataset provisioning which devices should be considered for
 /// usage as Crucible storage.
+///
+/// We select only one dataset from each zpool.
#[derive(Subquery, QueryId)] #[subquery(name = candidate_datasets)] struct CandidateDatasets { @@ -98,71 +100,65 @@ struct CandidateDatasets { } impl CandidateDatasets { - fn new( - allocation_strategy: &RegionAllocationStrategy, - candidate_zpools: &CandidateZpools, - ) -> Self { + fn new(candidate_zpools: &CandidateZpools, seed: u128) -> Self { use crate::db::schema::dataset::dsl as dataset_dsl; use candidate_zpools::dsl as candidate_zpool_dsl; - let query = match allocation_strategy { - #[cfg(test)] - RegionAllocationStrategy::LeastUsedDisk => { - let query: Box< - dyn CteQuery, - > = Box::new( - dataset_dsl::dataset - .inner_join( - candidate_zpools - .query_source() - .on(dataset_dsl::pool_id - .eq(candidate_zpool_dsl::pool_id)), - ) - .filter(dataset_dsl::time_deleted.is_null()) - .filter(dataset_dsl::size_used.is_not_null()) - .filter(dataset_dsl::kind.eq(DatasetKind::Crucible)) - .order(dataset_dsl::size_used.asc()) - .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()) - .select((dataset_dsl::id, dataset_dsl::pool_id)), - ); - query - } - RegionAllocationStrategy::Random(seed) => { - let seed = seed.unwrap_or_else(|| { - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_nanos() - }); - - let seed_bytes = seed.to_le_bytes(); - - let query: Box< - dyn CteQuery, - > = Box::new( - dataset_dsl::dataset - .inner_join( - candidate_zpools - .query_source() - .on(dataset_dsl::pool_id - .eq(candidate_zpool_dsl::pool_id)), - ) - .filter(dataset_dsl::time_deleted.is_null()) - .filter(dataset_dsl::size_used.is_not_null()) - .filter(dataset_dsl::kind.eq(DatasetKind::Crucible)) - // We order by md5 to shuffle the ordering of the datasets. - // md5 has a uniform output distribution so it does the job. - .order(cockroach_md5::dsl::md5( + let seed_bytes = seed.to_le_bytes(); + + let query: Box> = + Box::new( + dataset_dsl::dataset + .inner_join(candidate_zpools.query_source().on( + dataset_dsl::pool_id.eq(candidate_zpool_dsl::pool_id), + )) + .filter(dataset_dsl::time_deleted.is_null()) + .filter(dataset_dsl::size_used.is_not_null()) + .filter(dataset_dsl::kind.eq(DatasetKind::Crucible)) + .distinct_on(dataset_dsl::pool_id) + .order_by(( + dataset_dsl::pool_id, + cockroach_md5::dsl::md5( CastUuidToBytea::new(dataset_dsl::id) .concat(seed_bytes.to_vec()), - )) - .select((dataset_dsl::id, dataset_dsl::pool_id)) - .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()), - ); - query - } - }; + ), + )) + .select((dataset_dsl::id, dataset_dsl::pool_id)), + ); + Self { query } + } +} + +/// Shuffle the candidate datasets, and select REGION_REDUNDANCY_THRESHOLD +/// regions from it. +#[derive(Subquery, QueryId)] +#[subquery(name = shuffled_candidate_datasets)] +struct ShuffledCandidateDatasets { + query: Box>, +} +impl ShuffledCandidateDatasets { + fn new(candidate_datasets: &CandidateDatasets, seed: u128) -> Self { + use candidate_datasets::dsl as candidate_datasets_dsl; + + let seed_bytes = seed.to_le_bytes(); + + let query: Box> = + Box::new( + candidate_datasets + .query_source() + // We order by md5 to shuffle the ordering of the datasets. + // md5 has a uniform output distribution so it does the job. 
+ .order(cockroach_md5::dsl::md5( + CastUuidToBytea::new(candidate_datasets_dsl::id) + .concat(seed_bytes.to_vec()), + )) + .select(( + candidate_datasets_dsl::id, + candidate_datasets_dsl::pool_id, + )) + .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()), + ); Self { query } } } @@ -179,14 +175,14 @@ diesel::sql_function!(fn now() -> Timestamptz); impl CandidateRegions { fn new( - candidate_datasets: &CandidateDatasets, + shuffled_candidate_datasets: &ShuffledCandidateDatasets, volume_id: uuid::Uuid, block_size: u64, blocks_per_extent: u64, extent_count: u64, ) -> Self { - use candidate_datasets::dsl as candidate_datasets_dsl; use schema::region; + use shuffled_candidate_datasets::dsl as shuffled_candidate_datasets_dsl; let volume_id = volume_id.into_sql::(); let block_size = (block_size as i64).into_sql::(); @@ -195,20 +191,22 @@ impl CandidateRegions { let extent_count = (extent_count as i64).into_sql::(); Self { - query: Box::new(candidate_datasets.query_source().select(( - ExpressionAlias::new::(gen_random_uuid()), - ExpressionAlias::new::(now()), - ExpressionAlias::new::(now()), - ExpressionAlias::new::( - candidate_datasets_dsl::id, + query: Box::new(shuffled_candidate_datasets.query_source().select( + ( + ExpressionAlias::new::(gen_random_uuid()), + ExpressionAlias::new::(now()), + ExpressionAlias::new::(now()), + ExpressionAlias::new::( + shuffled_candidate_datasets_dsl::id, + ), + ExpressionAlias::new::(volume_id), + ExpressionAlias::new::(block_size), + ExpressionAlias::new::( + blocks_per_extent, + ), + ExpressionAlias::new::(extent_count), ), - ExpressionAlias::new::(volume_id), - ExpressionAlias::new::(block_size), - ExpressionAlias::new::( - blocks_per_extent, - ), - ExpressionAlias::new::(extent_count), - ))), + )), } } } @@ -285,12 +283,14 @@ struct CandidateZpools { } impl CandidateZpools { - fn new(old_zpool_usage: &OldPoolUsage, zpool_size_delta: u64) -> Self { + fn new( + old_zpool_usage: &OldPoolUsage, + zpool_size_delta: u64, + seed: u128, + distinct_sleds: bool, + ) -> Self { use schema::zpool::dsl as zpool_dsl; - let with_zpool = zpool_dsl::zpool - .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id)); - // Why are we using raw `diesel::dsl::sql` here? 
// // When SQL performs the "SUM" operation on "bigint" type, the result @@ -309,15 +309,40 @@ impl CandidateZpools { + diesel::dsl::sql(&zpool_size_delta.to_string())) .le(diesel::dsl::sql(zpool_dsl::total_size::NAME)); - Self { - query: Box::new( - old_zpool_usage - .query_source() - .inner_join(with_zpool) - .filter(it_will_fit) - .select((old_zpool_usage::dsl::pool_id,)), - ), - } + let with_zpool = zpool_dsl::zpool + .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id)); + + let base_query = old_zpool_usage + .query_source() + .inner_join(with_zpool) + .filter(it_will_fit) + .select((old_zpool_usage::dsl::pool_id,)); + + let query = if distinct_sleds { + let seed_bytes = seed.to_le_bytes(); + + let query: Box> = + Box::new( + base_query + .order_by(( + zpool_dsl::sled_id, + cockroach_md5::dsl::md5( + CastUuidToBytea::new(zpool_dsl::id) + .concat(seed_bytes.to_vec()), + ), + )) + .distinct_on(zpool_dsl::sled_id), + ); + + query + } else { + let query: Box> = + Box::new(base_query); + + query + }; + + Self { query } } } @@ -508,19 +533,47 @@ impl RegionAllocate { extent_count: u64, allocation_strategy: &RegionAllocationStrategy, ) -> Self { + let (seed, distinct_sleds) = { + let (input_seed, distinct_sleds) = match allocation_strategy { + RegionAllocationStrategy::Random { seed } => (seed, false), + RegionAllocationStrategy::RandomWithDistinctSleds { seed } => { + (seed, true) + } + }; + ( + input_seed.map_or_else( + || { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + }, + |seed| seed as u128, + ), + distinct_sleds, + ) + }; + let size_delta = block_size * blocks_per_extent * extent_count; let old_regions = OldRegions::new(volume_id); let old_pool_usage = OldPoolUsage::new(); - let candidate_zpools = - CandidateZpools::new(&old_pool_usage, size_delta); + let candidate_zpools = CandidateZpools::new( + &old_pool_usage, + size_delta, + seed, + distinct_sleds, + ); let candidate_datasets = - CandidateDatasets::new(&allocation_strategy, &candidate_zpools); + CandidateDatasets::new(&candidate_zpools, seed); + + let shuffled_candidate_datasets = + ShuffledCandidateDatasets::new(&candidate_datasets, seed); let candidate_regions = CandidateRegions::new( - &candidate_datasets, + &shuffled_candidate_datasets, volume_id, block_size, blocks_per_extent, @@ -577,6 +630,7 @@ impl RegionAllocate { .add_subquery(old_pool_usage) .add_subquery(candidate_zpools) .add_subquery(candidate_datasets) + .add_subquery(shuffled_candidate_datasets) .add_subquery(candidate_regions) .add_subquery(proposed_changes) .add_subquery(do_insert) diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index f1b20c32a1..1a9afbc6bd 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -92,3 +92,14 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 + +[default_region_allocation_strategy] +# allocate region on 3 random distinct zpools, on 3 random distinct sleds. +type = "random_with_distinct_sleds" + +# the same as random_with_distinct_sleds, but without requiring distinct sleds +# type = "random" + +# setting `seed` to a fixed value will make dataset selection ordering use the +# same shuffling order for every region allocation. 
+# seed = 0 diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 5bab5e2820..354df0ead3 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -23,6 +23,7 @@ use omicron_common::address::DENDRITE_PORT; use omicron_common::address::MGS_PORT; use omicron_common::api::external::Error; use omicron_common::api::internal::shared::SwitchLocation; +use omicron_common::nexus_config::RegionAllocationStrategy; use slog::Logger; use std::collections::HashMap; use std::net::Ipv6Addr; @@ -153,6 +154,9 @@ pub struct Nexus { /// Background tasks background_tasks: background::BackgroundTasks, + + /// Default Crucible region allocation strategy + default_region_allocation_strategy: RegionAllocationStrategy, } impl Nexus { @@ -325,6 +329,10 @@ impl Nexus { external_resolver, dpd_clients, background_tasks, + default_region_allocation_strategy: config + .pkg + .default_region_allocation_strategy + .clone(), }; // TODO-cleanup all the extra Arcs here seems wrong diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index cca36cefa7..275c8738cc 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -12,11 +12,10 @@ use super::{ ACTION_GENERATE_ID, }; use crate::app::sagas::declare_saga_actions; +use crate::app::{authn, authz, db}; use crate::external_api::params; -use nexus_db_queries::db::datastore::RegionAllocationStrategy; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; -use nexus_db_queries::{authn, authz, db}; use omicron_common::api::external::DiskState; use omicron_common::api::external::Error; use rand::{rngs::StdRng, RngCore, SeedableRng}; @@ -255,6 +254,9 @@ async fn sdc_alloc_regions( &sagactx, ¶ms.serialized_authn, ); + + let strategy = &osagactx.nexus().default_region_allocation_strategy; + let datasets_and_regions = osagactx .datastore() .region_allocate( @@ -262,7 +264,7 @@ async fn sdc_alloc_regions( volume_id, ¶ms.create_params.disk_source, params.create_params.size, - &RegionAllocationStrategy::Random(None), + &strategy, ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index b27f4a3a9b..eeabf64894 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -100,14 +100,13 @@ use super::{ }; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::retry_until_known_result; +use crate::app::{authn, authz, db}; use crate::external_api::params; use anyhow::anyhow; use crucible_agent_client::{types::RegionId, Client as CrucibleAgentClient}; use nexus_db_model::Generation; -use nexus_db_queries::db::datastore::RegionAllocationStrategy; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; -use nexus_db_queries::{authn, authz, db}; use omicron_common::api::external; use omicron_common::api::external::Error; use rand::{rngs::StdRng, RngCore, SeedableRng}; @@ -332,6 +331,8 @@ async fn ssc_alloc_regions( .await .map_err(ActionError::action_failed)?; + let strategy = &osagactx.nexus().default_region_allocation_strategy; + let datasets_and_regions = osagactx .datastore() .region_allocate( @@ -344,7 +345,7 @@ async fn ssc_alloc_regions( .map_err(|e| ActionError::action_failed(e.to_string()))?, }, external::ByteCount::from(disk.size), - &RegionAllocationStrategy::Random(None), + &strategy, ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/tests/config.test.toml 
b/nexus/tests/config.test.toml
index 6eeacceaed..1b1ae2c912 100644
--- a/nexus/tests/config.test.toml
+++ b/nexus/tests/config.test.toml
@@ -89,3 +89,8 @@ dns_external.max_concurrent_server_updates = 5
 # certificates it will take _other_ Nexus instances to notice and stop serving
 # them (on a sunny day).
 external_endpoints.period_secs = 60
+
+[default_region_allocation_strategy]
+# we only have one sled in the test environment, so we need to use the
+# `Random` strategy instead of `RandomWithDistinctSleds`
+type = "random"
\ No newline at end of file
diff --git a/package-manifest.toml b/package-manifest.toml
index 4dc0f6b616..c776f6d96d 100644
--- a/package-manifest.toml
+++ b/package-manifest.toml
@@ -92,7 +92,8 @@ source.rust.binary_names = ["nexus", "schema-updater"]
 source.rust.release = true
 source.paths = [
   { from = "/opt/ooce/pgsql-13/lib/amd64", to = "/opt/ooce/pgsql-13/lib/amd64" },
-  { from = "smf/nexus", to = "/var/svc/manifest/site/nexus" },
+  { from = "smf/nexus/manifest.xml", to = "/var/svc/manifest/site/nexus/manifest.xml" },
+  { from = "smf/nexus/{{rack-topology}}", to = "/var/svc/manifest/site/nexus" },
   { from = "out/console-assets", to = "/var/nexus/static" },
   { from = "schema/crdb", to = "/var/nexus/schema/crdb" },
 ]
diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs
index ea490e54cf..bc07b61234 100644
--- a/package/src/bin/omicron-package.rs
+++ b/package/src/bin/omicron-package.rs
@@ -211,11 +211,12 @@ async fn do_target(
             format!("failed to create directory {}", target_dir.display())
         })?;
     match subcommand {
-        TargetCommand::Create { image, machine, switch } => {
+        TargetCommand::Create { image, machine, switch, rack_topology } => {
             let target = KnownTarget::new(
                 image.clone(),
                 machine.clone(),
                 switch.clone(),
+                rack_topology.clone(),
             )?;

             let path = get_single_target(&target_dir, name).await?;
diff --git a/package/src/lib.rs b/package/src/lib.rs
index b0cc04970a..395f3ed472 100644
--- a/package/src/lib.rs
+++ b/package/src/lib.rs
@@ -46,6 +46,29 @@ pub enum TargetCommand {

         #[clap(short, long, default_value_if("image", "standard", "stub"))]
         switch: Option<Switch>,
+
+        #[clap(
+            short,
+            long,
+            default_value_if("image", "trampoline", Some("single-sled")),
+
+            // This opt is required, and clap will enforce that even with
+            // `required = false`, since it's not an Option. But the
+            // default_value_if only works if we set `required` to false. It's
+            // jank, but it is what it is.
+            // https://github.com/clap-rs/clap/issues/4086
+            required = false
+        )]
+        /// Specify whether Nexus will run in a single-sled or multi-sled
+        /// environment.
+        ///
+        /// Set single-sled for dev purposes when you're running a single
+        /// sled-agent. Set multi-sled if you're running with multiple sleds.
+        /// Currently this only affects the Crucible disk allocation strategy:
+        /// VM disks will require 3 distinct sleds with `multi-sled`, which
+        /// will fail in a single-sled environment. `single-sled` relaxes this
+        /// requirement.
+        rack_topology: crate::target::RackTopology,
     },

     /// List all existing targets
     List,
diff --git a/package/src/target.rs b/package/src/target.rs
index a7b2dd4539..d5d5e92c46 100644
--- a/package/src/target.rs
+++ b/package/src/target.rs
@@ -48,12 +48,27 @@ pub enum Switch {
     SoftNpu,
 }

+/// Topology of the sleds within the rack.
+#[derive(Clone, Debug, strum::EnumString, strum::Display, ValueEnum)] +#[strum(serialize_all = "kebab-case")] +#[clap(rename_all = "kebab-case")] +pub enum RackTopology { + /// Use configurations suitable for a multi-sled deployment, such as dogfood + /// and production racks. + MultiSled, + + /// Use configurations suitable for a single-sled deployment, such as CI and + /// dev machines. + SingleSled, +} + /// A strongly-typed variant of [Target]. #[derive(Clone, Debug)] pub struct KnownTarget { image: Image, machine: Option, switch: Option, + rack_topology: RackTopology, } impl KnownTarget { @@ -61,6 +76,7 @@ impl KnownTarget { image: Image, machine: Option, switch: Option, + rack_topology: RackTopology, ) -> Result { if matches!(image, Image::Trampoline) { if machine.is_some() { @@ -77,7 +93,7 @@ impl KnownTarget { bail!("'switch=asic' is only valid with 'machine=gimlet'"); } - Ok(Self { image, machine, switch }) + Ok(Self { image, machine, switch, rack_topology }) } } @@ -87,6 +103,7 @@ impl Default for KnownTarget { image: Image::Standard, machine: Some(Machine::NonGimlet), switch: Some(Switch::Stub), + rack_topology: RackTopology::MultiSled, } } } @@ -101,6 +118,7 @@ impl From for Target { if let Some(switch) = kt.switch { map.insert("switch".to_string(), switch.to_string()); } + map.insert("rack-topology".to_string(), kt.rack_topology.to_string()); Target(map) } } @@ -121,6 +139,7 @@ impl std::str::FromStr for KnownTarget { let mut image = Self::default().image; let mut machine = None; let mut switch = None; + let mut rack_topology = None; for (k, v) in target.0.into_iter() { match k.as_str() { @@ -133,6 +152,9 @@ impl std::str::FromStr for KnownTarget { "switch" => { switch = Some(v.parse()?); } + "rack-topology" => { + rack_topology = Some(v.parse()?); + } _ => { bail!( "Unknown target key {k}\nValid keys include: [{}]", @@ -146,6 +168,11 @@ impl std::str::FromStr for KnownTarget { } } } - KnownTarget::new(image, machine, switch) + KnownTarget::new( + image, + machine, + switch, + rack_topology.unwrap_or(RackTopology::MultiSled), + ) } } diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 88e51a3bc3..d4ccfc97c8 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -120,3 +120,5 @@ machine-non-gimlet = [] switch-asic = [] switch-stub = [] switch-softnpu = [] +rack-topology-single-sled = [] +rack-topology-multi-sled = [] diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 96cdf8222b..60f0965612 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1513,6 +1513,9 @@ impl ServiceManager { .open(&config_path) .await .map_err(|err| Error::io_path(&config_path, err))?; + file.write_all(b"\n\n") + .await + .map_err(|err| Error::io_path(&config_path, err))?; file.write_all(config_str.as_bytes()) .await .map_err(|err| Error::io_path(&config_path, err))?; diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml new file mode 100644 index 0000000000..2dfee81d02 --- /dev/null +++ b/smf/nexus/multi-sled/config-partial.toml @@ -0,0 +1,45 @@ +# +# Oxide API: partial configuration file +# + +[console] +# Directory for static assets. Absolute path or relative to CWD. 
+static_dir = "/var/nexus/static" +session_idle_timeout_minutes = 60 +session_absolute_timeout_minutes = 480 + +[authn] +schemes_external = ["session_cookie", "access_token"] + +[log] +# Show log messages of this level and more severe +level = "debug" +mode = "file" +path = "/dev/stdout" +if_exists = "append" + +# TODO: Uncomment the following lines to enable automatic schema +# migration on boot. +# +# [schema] +# schema_dir = "/var/nexus/schema/crdb" + +[background_tasks] +dns_internal.period_secs_config = 60 +dns_internal.period_secs_servers = 60 +dns_internal.period_secs_propagation = 60 +dns_internal.max_concurrent_server_updates = 5 +dns_external.period_secs_config = 60 +dns_external.period_secs_servers = 60 +dns_external.period_secs_propagation = 60 +dns_external.max_concurrent_server_updates = 5 +# How frequently we check the list of stored TLS certificates. This is +# approximately an upper bound on how soon after updating the list of +# certificates it will take _other_ Nexus instances to notice and stop serving +# them (on a sunny day). +external_endpoints.period_secs = 60 + +[default_region_allocation_strategy] +# by default, allocate across 3 distinct sleds +# seed is omitted so a new seed will be chosen with every allocation. +type = "random_with_distinct_sleds" \ No newline at end of file diff --git a/smf/nexus/config-partial.toml b/smf/nexus/single-sled/config-partial.toml similarity index 86% rename from smf/nexus/config-partial.toml rename to smf/nexus/single-sled/config-partial.toml index b29727c4aa..aff0a8a25f 100644 --- a/smf/nexus/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -38,3 +38,8 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 + +[default_region_allocation_strategy] +# by default, allocate without requirement for distinct sleds. +# seed is omitted so a new seed will be chosen with every allocation. +type = "random" \ No newline at end of file
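
Note on the generated query: the CTE assembled in
nexus/db-queries/src/db/queries/region_allocation.rs is built from Diesel
subqueries, which can be hard to read back as SQL. The following hand-written
sketch shows the approximate shape of the candidate-selection steps for the
distinct-sleds case. It is illustrative only, not the SQL Diesel actually
emits: $seed and $size_delta are readability placeholders for values the
query builder inlines, and the full query contains further subqueries
(candidate_regions, proposed_dataset_changes, do_insert, inserted_regions)
omitted here.

    -- Pick at most one zpool with enough free space per sled, shuffling
    -- within each sled by md5(pool id || seed). DISTINCT ON keeps the
    -- first row for each sled_id.
    candidate_zpools AS (
        SELECT DISTINCT ON (zpool.sled_id) old_zpool_usage.pool_id
        FROM old_zpool_usage
        INNER JOIN zpool ON zpool.id = old_zpool_usage.pool_id
        WHERE old_zpool_usage.size_used + $size_delta <= zpool.total_size
        ORDER BY zpool.sled_id,
                 md5(CAST(zpool.id AS BYTES) || $seed)
    ),
    -- Pick one Crucible dataset per candidate zpool, again via md5 shuffle,
    -- so that no two candidates share a zpool.
    candidate_datasets AS (
        SELECT DISTINCT ON (dataset.pool_id) dataset.id, dataset.pool_id
        FROM dataset
        INNER JOIN candidate_zpools
            ON dataset.pool_id = candidate_zpools.pool_id
        WHERE dataset.time_deleted IS NULL
          AND dataset.size_used IS NOT NULL
          AND dataset.kind = 'crucible'
        ORDER BY dataset.pool_id,
                 md5(CAST(dataset.id AS BYTES) || $seed)
    ),
    -- Shuffle the surviving datasets and keep REGION_REDUNDANCY_THRESHOLD
    -- (3) of them.
    shuffled_candidate_datasets AS (
        SELECT id, pool_id
        FROM candidate_datasets
        ORDER BY md5(CAST(id AS BYTES) || $seed)
        LIMIT 3
    )

For the plain Random strategy, the DISTINCT ON (sled_id) step in
candidate_zpools is simply omitted; the one-dataset-per-zpool and shuffle
steps apply to both strategies, which is how this change also fixes the
multiple-datasets-on-one-zpool shortcoming of #3650.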