From 5009fa85b83c9fa15a0866cec7b297d1ca1db68d Mon Sep 17 00:00:00 2001
From: Artemis Everfree
Date: Wed, 9 Aug 2023 04:03:50 +0000
Subject: [PATCH 01/13] RandomWithDistinctSleds region allocation strategy

PR #3650 introduced the Random region allocation strategy to allocate
regions randomly across the rack. This expands on that with the
addition of the RandomWithDistinctSleds region allocation strategy.
This strategy is the same, but requires that the 3 crucible regions be
allocated on 3 different sleds to improve resiliency against a
whole-sled failure.

The Random strategy still exists and does not require 3 distinct
sleds. This is useful in one-sled environments such as the integration
tests and lab setups. This PR adds the ability to configure the
allocation strategy in the Nexus PackageConfig toml; anyone running a
one-sled setup will need to set it to one-sled mode (as is done for
the integration test environment).

This also fixes a shortcoming of #3650 whereby multiple datasets on a
single zpool could be selected. That fix applies to both the old
Random strategy and the new RandomWithDistinctSleds strategy.

`smf/nexus/config-partial.toml` is configured for
RandomWithDistinctSleds, as that is what we want to use on prod.

As mentioned above, the integration tests do not use the
distinct-sleds allocation strategy. I attempted to add 2 extra sleds
to the simulated environment but found that this broke more things
than I had the understanding to fix in this PR. It would be nice in
the future for the sim environment to have 3 sleds, not just for this
but for anything else that might behave differently in a multi-sled
setup. For now, unit tests verify that the allocation behavior works
correctly against CockroachDB, and we can try it out on dogfood.
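To illustrate the new knob: the package config gains a section roughly
like the sketch below. The keys mirror the examples this PR adds to
nexus/examples/config.toml; the commented-out values are only
illustrations.

    [default_region_allocation_strategy]
    # Allocate the 3 regions on 3 random distinct zpools, on 3 random
    # distinct sleds (what we want for multi-sled racks).
    type = "random_with_distinct_sleds"

    # For one-sled environments (integration tests, dev), use this
    # instead; it does not require distinct sleds:
    # type = "random"

    # Setting `seed` makes dataset shuffling deterministic across
    # allocations; omit it to derive a new seed from the current time.
    # seed = 0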
--- common/src/nexus_config.rs | 38 +++ .../db-model/src/queries/region_allocation.rs | 22 ++ nexus/db-queries/src/db/datastore/mod.rs | 271 +++++++++++++----- nexus/db-queries/src/db/datastore/region.rs | 2 +- .../src/db/queries/region_allocation.rs | 240 ++++++++++------ nexus/examples/config.toml | 11 + nexus/src/app/mod.rs | 8 + nexus/src/app/sagas/disk_create.rs | 7 +- nexus/src/app/sagas/snapshot_create.rs | 6 +- nexus/tests/config.test.toml | 5 + smf/nexus/config-partial.toml | 5 + 11 files changed, 444 insertions(+), 171 deletions(-) diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs index 73ccec996c..47c567dbe0 100644 --- a/common/src/nexus_config.rs +++ b/common/src/nexus_config.rs @@ -372,6 +372,8 @@ pub struct PackageConfig { pub dendrite: HashMap, /// Background task configuration pub background_tasks: BackgroundTaskConfig, + /// Default Crucible region allocation strategy + pub default_region_allocation_strategy: RegionAllocationStrategy, } #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] @@ -594,6 +596,9 @@ mod test { dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 external_endpoints.period_secs = 9 + [default_region_allocation_strategy] + type = "random" + seed = 0 "##, ) .unwrap(); @@ -677,6 +682,10 @@ mod test { period_secs: Duration::from_secs(9), } }, + default_region_allocation_strategy: + crate::nexus_config::RegionAllocationStrategy::Random { + seed: Some(0) + } }, } ); @@ -724,6 +733,8 @@ mod test { dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 external_endpoints.period_secs = 9 + [default_region_allocation_strategy] + type = "random" "##, ) .unwrap(); @@ -894,3 +905,30 @@ mod test { ); } } + +/// Defines a strategy for choosing what physical disks to use when allocating +/// new crucible regions. +/// +/// NOTE: More strategies can - and should! - be added. +/// +/// See for a more +/// complete discussion. +/// +/// Longer-term, we should consider: +/// - Storage size + remaining free space +/// - Sled placement of datasets +/// - What sort of loads we'd like to create (even split across all disks +/// may not be preferable, especially if maintenance is expected) +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum RegionAllocationStrategy { + /// Choose disks pseudo-randomly. An optional seed may be provided to make + /// the ordering deterministic, otherwise the current time in nanoseconds + /// will be used. Ordering is based on sorting the output of `md5(UUID of + /// candidate dataset + seed)`. The seed does not need to come from a + /// cryptographically secure source. + Random { seed: Option }, + + /// Like Random, but ensures that each region is allocated on its own sled. + RandomWithDistinctSleds { seed: Option }, +} diff --git a/nexus/db-model/src/queries/region_allocation.rs b/nexus/db-model/src/queries/region_allocation.rs index 43fac3c9a6..2025e79fb8 100644 --- a/nexus/db-model/src/queries/region_allocation.rs +++ b/nexus/db-model/src/queries/region_allocation.rs @@ -47,6 +47,13 @@ table! { } } +table! { + shuffled_candidate_datasets { + id -> Uuid, + pool_id -> Uuid, + } +} + table! { candidate_regions { id -> Uuid, @@ -89,6 +96,19 @@ table! { } } +table! { + one_zpool_per_sled (pool_id) { + pool_id -> Uuid + } +} + +table! { + one_dataset_per_zpool { + id -> Uuid, + pool_id -> Uuid + } +} + table! 
{ inserted_regions { id -> Uuid, @@ -141,6 +161,7 @@ diesel::allow_tables_to_appear_in_same_query!( ); diesel::allow_tables_to_appear_in_same_query!(old_regions, dataset,); +diesel::allow_tables_to_appear_in_same_query!(old_regions, zpool,); diesel::allow_tables_to_appear_in_same_query!( inserted_regions, @@ -149,6 +170,7 @@ diesel::allow_tables_to_appear_in_same_query!( diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, dataset,); diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, zpool,); +diesel::allow_tables_to_appear_in_same_query!(candidate_datasets, dataset); // == Needed for random region allocation == diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index ff1df710bb..dca5239849 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -307,43 +307,6 @@ pub enum UpdatePrecondition { Value(T), } -/// Defines a strategy for choosing what physical disks to use when allocating -/// new crucible regions. -/// -/// NOTE: More strategies can - and should! - be added. -/// -/// See for a more -/// complete discussion. -/// -/// Longer-term, we should consider: -/// - Storage size + remaining free space -/// - Sled placement of datasets -/// - What sort of loads we'd like to create (even split across all disks -/// may not be preferable, especially if maintenance is expected) -#[derive(Debug, Clone)] -pub enum RegionAllocationStrategy { - /// Choose disks that have the least data usage in the rack. This strategy - /// can lead to bad failure states wherein the disks with the least usage - /// have the least usage because regions on them are actually failing in - /// some way. Further retried allocations will then continue to try to - /// allocate onto the disk, perpetuating the problem. Currently this - /// strategy only exists so we can test that using different allocation - /// strategies actually results in different allocation patterns, hence the - /// `#[cfg(test)]`. - /// - /// See https://github.com/oxidecomputer/omicron/issues/3416 for more on the - /// failure-states associated with this strategy - #[cfg(test)] - LeastUsedDisk, - - /// Choose disks pseudo-randomly. An optional seed may be provided to make - /// the ordering deterministic, otherwise the current time in nanoseconds - /// will be used. Ordering is based on sorting the output of `md5(UUID of - /// candidate dataset + seed)`. The seed does not need to come from a - /// cryptographically secure source. - Random(Option), -} - /// Constructs a DataStore for use in test suites that has preloaded the /// built-in users, roles, and role assignments that are needed for basic /// operation @@ -421,7 +384,9 @@ mod test { use omicron_common::api::external::{ self, ByteCount, Error, IdentityMetadataCreateParams, LookupType, Name, }; + use omicron_common::nexus_config::RegionAllocationStrategy; use omicron_test_utils::dev; + use std::collections::HashMap; use std::collections::HashSet; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddrV6}; use std::num::NonZeroU32; @@ -704,12 +669,18 @@ mod test { } } + struct TestDataset { + sled_id: Uuid, + dataset_id: Uuid, + } + async fn create_test_datasets_for_region_allocation( opctx: &OpContext, datastore: Arc, - ) -> Vec { + number_of_sleds: usize, + ) -> Vec { // Create sleds... 
- let sled_ids: Vec = stream::iter(0..REGION_REDUNDANCY_THRESHOLD) + let sled_ids: Vec = stream::iter(0..number_of_sleds) .then(|_| create_test_sled(&datastore)) .collect() .await; @@ -740,48 +711,69 @@ mod test { .collect() .await; + #[derive(Copy, Clone)] + struct Zpool { + sled_id: Uuid, + pool_id: Uuid, + } + // 1 pool per disk - let zpool_ids: Vec = stream::iter(physical_disks) + let zpools: Vec = stream::iter(physical_disks) .then(|disk| { - create_test_zpool(&datastore, disk.sled_id, disk.disk_id) + let pool_id_future = + create_test_zpool(&datastore, disk.sled_id, disk.disk_id); + async move { + let pool_id = pool_id_future.await; + Zpool { sled_id: disk.sled_id, pool_id } + } }) .collect() .await; let bogus_addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0); - // 1 dataset per zpool - let dataset_ids: Vec = stream::iter(zpool_ids) - .then(|zpool_id| { - let id = Uuid::new_v4(); - let dataset = Dataset::new( - id, - zpool_id, - bogus_addr, - DatasetKind::Crucible, - ); - let datastore = datastore.clone(); - async move { - datastore.dataset_upsert(dataset).await.unwrap(); - id - } + let datasets: Vec = stream::iter(zpools) + .map(|zpool| { + // 3 datasets per zpool, to test that pools are distinct + let zpool_iter: Vec = (0..3).map(|_| zpool).collect(); + stream::iter(zpool_iter).then(|zpool| { + let id = Uuid::new_v4(); + let dataset = Dataset::new( + id, + zpool.pool_id, + bogus_addr, + DatasetKind::Crucible, + ); + + let datastore = datastore.clone(); + async move { + datastore.dataset_upsert(dataset).await.unwrap(); + + TestDataset { sled_id: zpool.sled_id, dataset_id: id } + } + }) }) + .flatten() .collect() .await; - dataset_ids + datasets } #[tokio::test] /// Note that this test is currently non-deterministic. It can be made /// deterministic by generating deterministic *dataset* Uuids. The sled and /// pool IDs should not matter. - async fn test_region_allocation() { + async fn test_region_allocation_strat_random() { let logctx = dev::test_setup_log("test_region_allocation"); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; - create_test_datasets_for_region_allocation(&opctx, datastore.clone()) - .await; + create_test_datasets_for_region_allocation( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; // Allocate regions from the datasets for this disk. Do it a few times // for good measure. @@ -799,7 +791,9 @@ mod test { volume_id, ¶ms.disk_source, params.size, - &RegionAllocationStrategy::Random(Some(alloc_seed as u128)), + &RegionAllocationStrategy::Random { + seed: Some(alloc_seed), + }, ) .await .unwrap(); @@ -809,8 +803,79 @@ mod test { let mut disk_datasets = HashSet::new(); let mut disk_zpools = HashSet::new(); - // TODO: When allocation chooses 3 distinct sleds, uncomment this. - // let mut disk1_sleds = HashSet::new(); + for (dataset, region) in dataset_and_regions { + // Must be 3 unique datasets + assert!(disk_datasets.insert(dataset.id())); + + // Must be 3 unique zpools + assert!(disk_zpools.insert(dataset.pool_id)); + + assert_eq!(volume_id, region.volume_id()); + assert_eq!(ByteCount::from(4096), region.block_size()); + let (_, extent_count) = DataStore::get_crucible_allocation( + &BlockSize::AdvancedFormat, + params.size, + ); + assert_eq!(extent_count, region.extent_count()); + } + } + + let _ = db.cleanup().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + /// Test the [`RegionAllocationStrategy::RandomWithDistinctSleds`] strategy. 
+ /// It should always pick datasets where no two datasets are on the same + /// zpool and no two zpools are on the same sled. + async fn test_region_allocation_strat_random_with_distinct_sleds() { + let logctx = dev::test_setup_log("test_region_allocation"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Create a rack without enough sleds for a successful allocation when + // we require 3 distinct sleds. + let test_datasets = create_test_datasets_for_region_allocation( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; + + // We need to check that our datasets end up on 3 distinct sleds, but the query doesn't return the sled ID, so we need to reverse map from dataset ID to sled ID + let sled_id_map: HashMap = test_datasets + .into_iter() + .map(|test_dataset| (test_dataset.dataset_id, test_dataset.sled_id)) + .collect(); + + // Allocate regions from the datasets for this disk. Do it a few times + // for good measure. + for alloc_seed in 0..10 { + let params = create_test_disk_create_params( + &format!("disk{}", alloc_seed), + ByteCount::from_mebibytes_u32(1), + ); + let volume_id = Uuid::new_v4(); + + let expected_region_count = REGION_REDUNDANCY_THRESHOLD; + let dataset_and_regions = datastore + .region_allocate( + &opctx, + volume_id, + ¶ms.disk_source, + params.size, + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: Some(alloc_seed), + }, + ) + .await + .unwrap(); + + // Verify the allocation. + assert_eq!(expected_region_count, dataset_and_regions.len()); + let mut disk_datasets = HashSet::new(); + let mut disk_zpools = HashSet::new(); + let mut disk_sleds = HashSet::new(); for (dataset, region) in dataset_and_regions { // Must be 3 unique datasets assert!(disk_datasets.insert(dataset.id())); @@ -819,8 +884,8 @@ mod test { assert!(disk_zpools.insert(dataset.pool_id)); // Must be 3 unique sleds - // TODO: When allocation chooses 3 distinct sleds, uncomment this. - // assert!(disk1_sleds.insert(Err(dataset))); + let sled_id = sled_id_map.get(&dataset.id()).unwrap(); + assert!(disk_sleds.insert(*sled_id)); assert_eq!(volume_id, region.volume_id()); assert_eq!(ByteCount::from(4096), region.block_size()); @@ -836,14 +901,70 @@ mod test { logctx.cleanup_successful(); } + #[tokio::test] + /// Ensure the [`RegionAllocationStrategy::RandomWithDistinctSleds`] + /// strategy fails when there aren't enough distinct sleds. + async fn test_region_allocation_strat_random_with_distinct_sleds_fails() { + let logctx = dev::test_setup_log("test_region_allocation"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Create a rack without enough sleds for a successful allocation when + // we require 3 distinct sleds. + create_test_datasets_for_region_allocation( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD - 1, + ) + .await; + + // Allocate regions from the datasets for this disk. Do it a few times + // for good measure. 
+ for alloc_seed in 0..10 { + let params = create_test_disk_create_params( + &format!("disk{}", alloc_seed), + ByteCount::from_mebibytes_u32(1), + ); + let volume_id = Uuid::new_v4(); + + let err = datastore + .region_allocate( + &opctx, + volume_id, + ¶ms.disk_source, + params.size, + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: Some(alloc_seed), + }, + ) + .await + .unwrap_err(); + + let expected = "Not enough zpool space to allocate disks"; + assert!( + err.to_string().contains(expected), + "Saw error: \'{err}\', but expected \'{expected}\'" + ); + + assert!(matches!(err, Error::ServiceUnavailable { .. })); + } + + let _ = db.cleanup().await; + logctx.cleanup_successful(); + } + #[tokio::test] async fn test_region_allocation_is_idempotent() { let logctx = dev::test_setup_log("test_region_allocation_is_idempotent"); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; - create_test_datasets_for_region_allocation(&opctx, datastore.clone()) - .await; + create_test_datasets_for_region_allocation( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; // Allocate regions from the datasets for this volume. let params = create_test_disk_create_params( @@ -857,7 +978,7 @@ mod test { volume_id, ¶ms.disk_source, params.size, - &RegionAllocationStrategy::Random(Some(0)), + &RegionAllocationStrategy::Random { seed: Some(0) }, ) .await .unwrap(); @@ -870,7 +991,7 @@ mod test { volume_id, ¶ms.disk_source, params.size, - &RegionAllocationStrategy::Random(Some(1)), + &RegionAllocationStrategy::Random { seed: Some(1) }, ) .await .unwrap(); @@ -959,7 +1080,7 @@ mod test { volume1_id, ¶ms.disk_source, params.size, - &RegionAllocationStrategy::Random(Some(0)), + &RegionAllocationStrategy::Random { seed: Some(0) }, ) .await .unwrap_err(); @@ -983,8 +1104,12 @@ mod test { let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; - create_test_datasets_for_region_allocation(&opctx, datastore.clone()) - .await; + create_test_datasets_for_region_allocation( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; let disk_size = test_zpool_size(); let alloc_size = ByteCount::try_from(disk_size.to_bytes() * 2).unwrap(); @@ -997,7 +1122,7 @@ mod test { volume1_id, ¶ms.disk_source, params.size, - &RegionAllocationStrategy::Random(Some(0)), + &RegionAllocationStrategy::Random { seed: Some(0) }, ) .await .is_err()); diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index 5bc79b9481..9465fe2792 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -5,7 +5,6 @@ //! [`DataStore`] methods on [`Region`]s. 
use super::DataStore; -use super::RegionAllocationStrategy; use super::RunnableQuery; use crate::context::OpContext; use crate::db; @@ -23,6 +22,7 @@ use omicron_common::api::external; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; use omicron_common::backoff::{self, BackoffError}; +use omicron_common::nexus_config::RegionAllocationStrategy; use slog::Logger; use uuid::Uuid; diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index b071ee3f44..b73697ec34 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -6,7 +6,6 @@ use crate::db::alias::ExpressionAlias; use crate::db::cast_uuid_as_bytea::CastUuidToBytea; -use crate::db::datastore::RegionAllocationStrategy; use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD; use crate::db::model::{Dataset, DatasetKind, Region}; use crate::db::pool::DbConnection; @@ -24,10 +23,11 @@ use diesel::{ use nexus_db_model::queries::region_allocation::{ candidate_datasets, candidate_regions, candidate_zpools, cockroach_md5, do_insert, inserted_regions, old_regions, old_zpool_usage, - proposed_dataset_changes, updated_datasets, + proposed_dataset_changes, shuffled_candidate_datasets, updated_datasets, }; use nexus_db_model::schema; use omicron_common::api::external; +use omicron_common::nexus_config::RegionAllocationStrategy; const NOT_ENOUGH_DATASETS_SENTINEL: &'static str = "Not enough datasets"; const NOT_ENOUGH_ZPOOL_SPACE_SENTINEL: &'static str = "Not enough space"; @@ -91,6 +91,8 @@ impl OldRegions { /// This implicitly distinguishes between "M.2s" and "U.2s" -- Nexus needs to /// determine during dataset provisioning which devices should be considered for /// usage as Crucible storage. +/// +/// We select only one dataset from each zpool. 
#[derive(Subquery, QueryId)] #[subquery(name = candidate_datasets)] struct CandidateDatasets { @@ -98,71 +100,65 @@ struct CandidateDatasets { } impl CandidateDatasets { - fn new( - allocation_strategy: &RegionAllocationStrategy, - candidate_zpools: &CandidateZpools, - ) -> Self { + fn new(candidate_zpools: &CandidateZpools, seed: u128) -> Self { use crate::db::schema::dataset::dsl as dataset_dsl; use candidate_zpools::dsl as candidate_zpool_dsl; - let query = match allocation_strategy { - #[cfg(test)] - RegionAllocationStrategy::LeastUsedDisk => { - let query: Box< - dyn CteQuery, - > = Box::new( - dataset_dsl::dataset - .inner_join( - candidate_zpools - .query_source() - .on(dataset_dsl::pool_id - .eq(candidate_zpool_dsl::pool_id)), - ) - .filter(dataset_dsl::time_deleted.is_null()) - .filter(dataset_dsl::size_used.is_not_null()) - .filter(dataset_dsl::kind.eq(DatasetKind::Crucible)) - .order(dataset_dsl::size_used.asc()) - .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()) - .select((dataset_dsl::id, dataset_dsl::pool_id)), - ); - query - } - RegionAllocationStrategy::Random(seed) => { - let seed = seed.unwrap_or_else(|| { - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_nanos() - }); - - let seed_bytes = seed.to_le_bytes(); - - let query: Box< - dyn CteQuery, - > = Box::new( - dataset_dsl::dataset - .inner_join( - candidate_zpools - .query_source() - .on(dataset_dsl::pool_id - .eq(candidate_zpool_dsl::pool_id)), - ) - .filter(dataset_dsl::time_deleted.is_null()) - .filter(dataset_dsl::size_used.is_not_null()) - .filter(dataset_dsl::kind.eq(DatasetKind::Crucible)) - // We order by md5 to shuffle the ordering of the datasets. - // md5 has a uniform output distribution so it does the job. - .order(cockroach_md5::dsl::md5( + let seed_bytes = seed.to_le_bytes(); + + let query: Box> = + Box::new( + dataset_dsl::dataset + .inner_join(candidate_zpools.query_source().on( + dataset_dsl::pool_id.eq(candidate_zpool_dsl::pool_id), + )) + .filter(dataset_dsl::time_deleted.is_null()) + .filter(dataset_dsl::size_used.is_not_null()) + .filter(dataset_dsl::kind.eq(DatasetKind::Crucible)) + .distinct_on(dataset_dsl::pool_id) + .order_by(( + dataset_dsl::pool_id, + cockroach_md5::dsl::md5( CastUuidToBytea::new(dataset_dsl::id) .concat(seed_bytes.to_vec()), - )) - .select((dataset_dsl::id, dataset_dsl::pool_id)) - .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()), - ); - query - } - }; + ), + )) + .select((dataset_dsl::id, dataset_dsl::pool_id)), + ); + Self { query } + } +} + +/// Shuffle the candidate datasets, and select REGION_REDUNDANCY_THRESHOLD +/// regions from it. +#[derive(Subquery, QueryId)] +#[subquery(name = shuffled_candidate_datasets)] +struct ShuffledCandidateDatasets { + query: Box>, +} +impl ShuffledCandidateDatasets { + fn new(candidate_datasets: &CandidateDatasets, seed: u128) -> Self { + use candidate_datasets::dsl as candidate_datasets_dsl; + + let seed_bytes = seed.to_le_bytes(); + + let query: Box> = + Box::new( + candidate_datasets + .query_source() + // We order by md5 to shuffle the ordering of the datasets. + // md5 has a uniform output distribution so it does the job. 
+ .order(cockroach_md5::dsl::md5( + CastUuidToBytea::new(candidate_datasets_dsl::id) + .concat(seed_bytes.to_vec()), + )) + .select(( + candidate_datasets_dsl::id, + candidate_datasets_dsl::pool_id, + )) + .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()), + ); Self { query } } } @@ -179,14 +175,14 @@ diesel::sql_function!(fn now() -> Timestamptz); impl CandidateRegions { fn new( - candidate_datasets: &CandidateDatasets, + shuffled_candidate_datasets: &ShuffledCandidateDatasets, volume_id: uuid::Uuid, block_size: u64, blocks_per_extent: u64, extent_count: u64, ) -> Self { - use candidate_datasets::dsl as candidate_datasets_dsl; use schema::region; + use shuffled_candidate_datasets::dsl as shuffled_candidate_datasets_dsl; let volume_id = volume_id.into_sql::(); let block_size = (block_size as i64).into_sql::(); @@ -195,20 +191,22 @@ impl CandidateRegions { let extent_count = (extent_count as i64).into_sql::(); Self { - query: Box::new(candidate_datasets.query_source().select(( - ExpressionAlias::new::(gen_random_uuid()), - ExpressionAlias::new::(now()), - ExpressionAlias::new::(now()), - ExpressionAlias::new::( - candidate_datasets_dsl::id, + query: Box::new(shuffled_candidate_datasets.query_source().select( + ( + ExpressionAlias::new::(gen_random_uuid()), + ExpressionAlias::new::(now()), + ExpressionAlias::new::(now()), + ExpressionAlias::new::( + shuffled_candidate_datasets_dsl::id, + ), + ExpressionAlias::new::(volume_id), + ExpressionAlias::new::(block_size), + ExpressionAlias::new::( + blocks_per_extent, + ), + ExpressionAlias::new::(extent_count), ), - ExpressionAlias::new::(volume_id), - ExpressionAlias::new::(block_size), - ExpressionAlias::new::( - blocks_per_extent, - ), - ExpressionAlias::new::(extent_count), - ))), + )), } } } @@ -285,12 +283,14 @@ struct CandidateZpools { } impl CandidateZpools { - fn new(old_zpool_usage: &OldPoolUsage, zpool_size_delta: u64) -> Self { + fn new( + old_zpool_usage: &OldPoolUsage, + zpool_size_delta: u64, + seed: u128, + distinct_sleds: bool, + ) -> Self { use schema::zpool::dsl as zpool_dsl; - let with_zpool = zpool_dsl::zpool - .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id)); - // Why are we using raw `diesel::dsl::sql` here? 
// // When SQL performs the "SUM" operation on "bigint" type, the result @@ -309,15 +309,40 @@ impl CandidateZpools { + diesel::dsl::sql(&zpool_size_delta.to_string())) .le(diesel::dsl::sql(zpool_dsl::total_size::NAME)); - Self { - query: Box::new( - old_zpool_usage - .query_source() - .inner_join(with_zpool) - .filter(it_will_fit) - .select((old_zpool_usage::dsl::pool_id,)), - ), - } + let with_zpool = zpool_dsl::zpool + .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id)); + + let base_query = old_zpool_usage + .query_source() + .inner_join(with_zpool) + .filter(it_will_fit) + .select((old_zpool_usage::dsl::pool_id,)); + + let query = if distinct_sleds { + let seed_bytes = seed.to_le_bytes(); + + let query: Box> = + Box::new( + base_query + .order_by(( + zpool_dsl::sled_id, + cockroach_md5::dsl::md5( + CastUuidToBytea::new(zpool_dsl::id) + .concat(seed_bytes.to_vec()), + ), + )) + .distinct_on(zpool_dsl::sled_id), + ); + + query + } else { + let query: Box> = + Box::new(base_query); + + query + }; + + Self { query } } } @@ -508,19 +533,47 @@ impl RegionAllocate { extent_count: u64, allocation_strategy: &RegionAllocationStrategy, ) -> Self { + let (seed, distinct_sleds) = { + let (input_seed, distinct_sleds) = match allocation_strategy { + RegionAllocationStrategy::Random { seed } => (seed, false), + RegionAllocationStrategy::RandomWithDistinctSleds { seed } => { + (seed, true) + } + }; + ( + input_seed.map_or_else( + || { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + }, + |seed| seed as u128, + ), + distinct_sleds, + ) + }; + let size_delta = block_size * blocks_per_extent * extent_count; let old_regions = OldRegions::new(volume_id); let old_pool_usage = OldPoolUsage::new(); - let candidate_zpools = - CandidateZpools::new(&old_pool_usage, size_delta); + let candidate_zpools = CandidateZpools::new( + &old_pool_usage, + size_delta, + seed, + distinct_sleds, + ); let candidate_datasets = - CandidateDatasets::new(&allocation_strategy, &candidate_zpools); + CandidateDatasets::new(&candidate_zpools, seed); + + let shuffled_candidate_datasets = + ShuffledCandidateDatasets::new(&candidate_datasets, seed); let candidate_regions = CandidateRegions::new( - &candidate_datasets, + &shuffled_candidate_datasets, volume_id, block_size, blocks_per_extent, @@ -577,6 +630,7 @@ impl RegionAllocate { .add_subquery(old_pool_usage) .add_subquery(candidate_zpools) .add_subquery(candidate_datasets) + .add_subquery(shuffled_candidate_datasets) .add_subquery(candidate_regions) .add_subquery(proposed_changes) .add_subquery(do_insert) diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index f1b20c32a1..1a9afbc6bd 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -92,3 +92,14 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 + +[default_region_allocation_strategy] +# allocate region on 3 random distinct zpools, on 3 random distinct sleds. +type = "random_with_distinct_sleds" + +# the same as random_with_distinct_sleds, but without requiring distinct sleds +# type = "random" + +# setting `seed` to a fixed value will make dataset selection ordering use the +# same shuffling order for every region allocation. 
+# seed = 0 diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 5bab5e2820..354df0ead3 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -23,6 +23,7 @@ use omicron_common::address::DENDRITE_PORT; use omicron_common::address::MGS_PORT; use omicron_common::api::external::Error; use omicron_common::api::internal::shared::SwitchLocation; +use omicron_common::nexus_config::RegionAllocationStrategy; use slog::Logger; use std::collections::HashMap; use std::net::Ipv6Addr; @@ -153,6 +154,9 @@ pub struct Nexus { /// Background tasks background_tasks: background::BackgroundTasks, + + /// Default Crucible region allocation strategy + default_region_allocation_strategy: RegionAllocationStrategy, } impl Nexus { @@ -325,6 +329,10 @@ impl Nexus { external_resolver, dpd_clients, background_tasks, + default_region_allocation_strategy: config + .pkg + .default_region_allocation_strategy + .clone(), }; // TODO-cleanup all the extra Arcs here seems wrong diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index cca36cefa7..30e0e6c8dd 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -16,7 +16,7 @@ use crate::external_api::params; use nexus_db_queries::db::datastore::RegionAllocationStrategy; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; -use nexus_db_queries::{authn, authz, db}; +use crate::{authn, authz, db}; use omicron_common::api::external::DiskState; use omicron_common::api::external::Error; use rand::{rngs::StdRng, RngCore, SeedableRng}; @@ -255,6 +255,9 @@ async fn sdc_alloc_regions( &sagactx, ¶ms.serialized_authn, ); + + let strategy = &osagactx.nexus().default_region_allocation_strategy; + let datasets_and_regions = osagactx .datastore() .region_allocate( @@ -262,7 +265,7 @@ async fn sdc_alloc_regions( volume_id, ¶ms.create_params.disk_source, params.create_params.size, - &RegionAllocationStrategy::Random(None), + &strategy, ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index b27f4a3a9b..85ec91ffd0 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -101,13 +101,13 @@ use super::{ use crate::app::sagas::declare_saga_actions; use crate::app::sagas::retry_until_known_result; use crate::external_api::params; +use crate::{authn, authz, db}; use anyhow::anyhow; use crucible_agent_client::{types::RegionId, Client as CrucibleAgentClient}; use nexus_db_model::Generation; use nexus_db_queries::db::datastore::RegionAllocationStrategy; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; -use nexus_db_queries::{authn, authz, db}; use omicron_common::api::external; use omicron_common::api::external::Error; use rand::{rngs::StdRng, RngCore, SeedableRng}; @@ -332,6 +332,8 @@ async fn ssc_alloc_regions( .await .map_err(ActionError::action_failed)?; + let strategy = &osagactx.nexus().default_region_allocation_strategy; + let datasets_and_regions = osagactx .datastore() .region_allocate( @@ -344,7 +346,7 @@ async fn ssc_alloc_regions( .map_err(|e| ActionError::action_failed(e.to_string()))?, }, external::ByteCount::from(disk.size), - &RegionAllocationStrategy::Random(None), + &strategy, ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 6eeacceaed..1b1ae2c912 100644 --- a/nexus/tests/config.test.toml 
+++ b/nexus/tests/config.test.toml @@ -89,3 +89,8 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 + +[default_region_allocation_strategy] +# we only have one sled in the test environment, so we need to use the +# `Random` strategy, instead of `RandomWithDistinctSleds` +type = "random" \ No newline at end of file diff --git a/smf/nexus/config-partial.toml b/smf/nexus/config-partial.toml index b29727c4aa..2dfee81d02 100644 --- a/smf/nexus/config-partial.toml +++ b/smf/nexus/config-partial.toml @@ -38,3 +38,8 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 + +[default_region_allocation_strategy] +# by default, allocate across 3 distinct sleds +# seed is omitted so a new seed will be chosen with every allocation. +type = "random_with_distinct_sleds" \ No newline at end of file From a7aa8962df6e6993a32d1c753dd2779a68fb813e Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Fri, 18 Aug 2023 05:31:14 +0000 Subject: [PATCH 02/13] fix log messages --- nexus/db-queries/src/db/datastore/mod.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index dca5239849..b1f3203c60 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -765,7 +765,7 @@ mod test { /// deterministic by generating deterministic *dataset* Uuids. The sled and /// pool IDs should not matter. async fn test_region_allocation_strat_random() { - let logctx = dev::test_setup_log("test_region_allocation"); + let logctx = dev::test_setup_log("test_region_allocation_strat_random"); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; create_test_datasets_for_region_allocation( @@ -829,7 +829,9 @@ mod test { /// It should always pick datasets where no two datasets are on the same /// zpool and no two zpools are on the same sled. async fn test_region_allocation_strat_random_with_distinct_sleds() { - let logctx = dev::test_setup_log("test_region_allocation"); + let logctx = dev::test_setup_log( + "test_region_allocation_strat_random_with_distinct_sleds", + ); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; @@ -905,7 +907,9 @@ mod test { /// Ensure the [`RegionAllocationStrategy::RandomWithDistinctSleds`] /// strategy fails when there aren't enough distinct sleds. async fn test_region_allocation_strat_random_with_distinct_sleds_fails() { - let logctx = dev::test_setup_log("test_region_allocation"); + let logctx = dev::test_setup_log( + "test_region_allocation_strat_random_with_distinct_sleds_fails", + ); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; From a897c83885fe0ff3f1b5f593e7a6be8c36f465ca Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Fri, 18 Aug 2023 05:31:52 +0000 Subject: [PATCH 03/13] add rack-topology package feature This adds the rack-topology package feature with possible values of single-sled or multi-sled. single-sled is intended for dev/CI deployments, while multi-sled is intended for dogfood/prod. The value of this determines which nexus config-partial.toml is packaged. 
Right now the only difference single/multi is the crucible region allocation strategy. --- .github/buildomat/jobs/package.sh | 4 +- .github/buildomat/jobs/tuf-repo.sh | 2 +- docs/how-to-run.adoc | 34 +++++++++++--- package-manifest.toml | 2 +- package/src/bin/omicron-package.rs | 3 +- package/src/lib.rs | 7 +++ package/src/target.rs | 28 +++++++++++- sled-agent/Cargo.toml | 2 + .../{ => multi-sled}/config-partial.toml | 0 smf/nexus/single-sled/config-partial.toml | 45 +++++++++++++++++++ 10 files changed, 114 insertions(+), 13 deletions(-) rename smf/nexus/{ => multi-sled}/config-partial.toml (100%) create mode 100644 smf/nexus/single-sled/config-partial.toml diff --git a/.github/buildomat/jobs/package.sh b/.github/buildomat/jobs/package.sh index fe5d6a9b7f..4c9bd01a73 100755 --- a/.github/buildomat/jobs/package.sh +++ b/.github/buildomat/jobs/package.sh @@ -45,7 +45,7 @@ ptime -m ./tools/ci_download_softnpu_machinery # Build the test target ptime -m cargo run --locked --release --bin omicron-package -- \ - -t test target create -i standard -m non-gimlet -s softnpu + -t test target create -i standard -m non-gimlet -s softnpu -r single-sled ptime -m cargo run --locked --release --bin omicron-package -- \ -t test package @@ -83,7 +83,7 @@ stamp_packages() { # Build necessary for the global zone ptime -m cargo run --locked --release --bin omicron-package -- \ - -t host target create -i standard -m gimlet -s asic + -t host target create -i standard -m gimlet -s asic -r multi-sled ptime -m cargo run --locked --release --bin omicron-package -- \ -t host package stamp_packages omicron-sled-agent maghemite propolis-server overlay diff --git a/.github/buildomat/jobs/tuf-repo.sh b/.github/buildomat/jobs/tuf-repo.sh index a06468c6b2..0212c98769 100644 --- a/.github/buildomat/jobs/tuf-repo.sh +++ b/.github/buildomat/jobs/tuf-repo.sh @@ -77,7 +77,7 @@ done mkdir /work/package pushd /work/package tar xf /input/package/work/package.tar.gz out package-manifest.toml target/release/omicron-package -target/release/omicron-package -t default target create -i standard -m gimlet -s asic +target/release/omicron-package -t default target create -i standard -m gimlet -s asic -r multi-sled ln -s /input/package/work/zones/* out/ rm out/switch-softnpu.tar.gz # not used when target switch=asic rm out/omicron-gateway-softnpu.tar.gz # not used when target switch=asic diff --git a/docs/how-to-run.adoc b/docs/how-to-run.adoc index 1988a42669..7539c5183f 100644 --- a/docs/how-to-run.adoc +++ b/docs/how-to-run.adoc @@ -321,10 +321,32 @@ Error: Creates a new build target, and sets it as "active" Usage: omicron-package target create [OPTIONS] Options: - -i, --image [default: standard] [possible values: standard, trampoline] - -m, --machine [possible values: gimlet, gimlet-standalone, non-gimlet] - -s, --switch [possible values: asic, stub, softnpu] - -h, --help Print help (see more with '--help') + -i, --image + [default: standard] + + Possible values: + - standard: A typical host OS image + - trampoline: A recovery host OS image, intended to bootstrap a Standard image + + -m, --machine + Possible values: + - gimlet: Use sled agent configuration for a Gimlet + - gimlet-standalone: Use sled agent configuration for a Gimlet running in isolation + - non-gimlet: Use sled agent configuration for a device emulating a Gimlet + + -s, --switch + Possible values: + - asic: Use the "real" Dendrite, that attempts to interact with the Tofino + - stub: Use a "stub" Dendrite that does not require any real hardware + - softnpu: Use a 
"softnpu" Dendrite that uses the SoftNPU asic emulator + + -r, --rack-topology + Possible values: + - multi-sled: Use configurations suitable for a multi-sled deployment, such as dogfood and production racks + - single-sled: Use configurations suitable for a single-sled deployment, such as CI and dev machines + + -h, --help + Print help (see a summary with '-h') ---- @@ -332,9 +354,9 @@ To set up a build target for a non-Gimlet machine with simulated (but fully func [source,console] ---- -$ cargo run --release --bin omicron-package -- -t default target create -i standard -m non-gimlet -s softnpu +$ cargo run --release --bin omicron-package -- -t default target create -i standard -m non-gimlet -s softnpu -r single-sled Finished release [optimized] target(s) in 0.66s - Running `target/release/omicron-package -t default target create -i standard -m non-gimlet -s softnpu` + Running `target/release/omicron-package -t default target create -i standard -m non-gimlet -s softnpu -r single-sled` Created new build target 'default' and set it as active ---- diff --git a/package-manifest.toml b/package-manifest.toml index 4dc0f6b616..d892b9f8b0 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -92,7 +92,7 @@ source.rust.binary_names = ["nexus", "schema-updater"] source.rust.release = true source.paths = [ { from = "/opt/ooce/pgsql-13/lib/amd64", to = "/opt/ooce/pgsql-13/lib/amd64" }, - { from = "smf/nexus", to = "/var/svc/manifest/site/nexus" }, + { from = "smf/nexus/{{rack-topology}}", to = "/var/svc/manifest/site/nexus" }, { from = "out/console-assets", to = "/var/nexus/static" }, { from = "schema/crdb", to = "/var/nexus/schema/crdb" }, ] diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index a0146eee50..bbb6c6ef2d 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -189,11 +189,12 @@ async fn do_target( format!("failed to create directory {}", target_dir.display()) })?; match subcommand { - TargetCommand::Create { image, machine, switch } => { + TargetCommand::Create { image, machine, switch, rack_topology } => { let target = KnownTarget::new( image.clone(), machine.clone(), switch.clone(), + rack_topology.clone(), )?; let path = get_single_target(&target_dir, name).await?; diff --git a/package/src/lib.rs b/package/src/lib.rs index b0cc04970a..0d6627e714 100644 --- a/package/src/lib.rs +++ b/package/src/lib.rs @@ -46,6 +46,13 @@ pub enum TargetCommand { #[clap(short, long, default_value_if("image", "standard", "stub"))] switch: Option, + + #[clap( + short, + long, + default_value_if("image", "standard", "single-sled") + )] + rack_topology: Option, }, /// List all existing targets List, diff --git a/package/src/target.rs b/package/src/target.rs index a7b2dd4539..7829928e82 100644 --- a/package/src/target.rs +++ b/package/src/target.rs @@ -48,12 +48,27 @@ pub enum Switch { SoftNpu, } +/// Topology of the sleds within the rack. +#[derive(Clone, Debug, strum::EnumString, strum::Display, ValueEnum)] +#[strum(serialize_all = "kebab-case")] +#[clap(rename_all = "kebab-case")] +pub enum RackTopology { + /// Use configurations suitable for a multi-sled deployment, such as dogfood + /// and production racks. + MultiSled, + + /// Use configurations suitable for a single-sled deployment, such as CI and + /// dev machines. + SingleSled, +} + /// A strongly-typed variant of [Target]. 
#[derive(Clone, Debug)] pub struct KnownTarget { image: Image, machine: Option, switch: Option, + rack_topology: Option, } impl KnownTarget { @@ -61,6 +76,7 @@ impl KnownTarget { image: Image, machine: Option, switch: Option, + rack_topology: Option, ) -> Result { if matches!(image, Image::Trampoline) { if machine.is_some() { @@ -77,7 +93,7 @@ impl KnownTarget { bail!("'switch=asic' is only valid with 'machine=gimlet'"); } - Ok(Self { image, machine, switch }) + Ok(Self { image, machine, switch, rack_topology }) } } @@ -87,6 +103,7 @@ impl Default for KnownTarget { image: Image::Standard, machine: Some(Machine::NonGimlet), switch: Some(Switch::Stub), + rack_topology: Some(RackTopology::MultiSled), } } } @@ -101,6 +118,9 @@ impl From for Target { if let Some(switch) = kt.switch { map.insert("switch".to_string(), switch.to_string()); } + if let Some(rack_topology) = kt.rack_topology { + map.insert("rack-topology".to_string(), rack_topology.to_string()); + } Target(map) } } @@ -121,6 +141,7 @@ impl std::str::FromStr for KnownTarget { let mut image = Self::default().image; let mut machine = None; let mut switch = None; + let mut rack_topology = None; for (k, v) in target.0.into_iter() { match k.as_str() { @@ -133,6 +154,9 @@ impl std::str::FromStr for KnownTarget { "switch" => { switch = Some(v.parse()?); } + "rack-topology" => { + rack_topology = Some(v.parse()?); + } _ => { bail!( "Unknown target key {k}\nValid keys include: [{}]", @@ -146,6 +170,6 @@ impl std::str::FromStr for KnownTarget { } } } - KnownTarget::new(image, machine, switch) + KnownTarget::new(image, machine, switch, rack_topology) } } diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index b131698395..8722a8d057 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -119,3 +119,5 @@ machine-non-gimlet = [] switch-asic = [] switch-stub = [] switch-softnpu = [] +rack-topology-single-sled = [] +rack-topology-multi-sled = [] diff --git a/smf/nexus/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml similarity index 100% rename from smf/nexus/config-partial.toml rename to smf/nexus/multi-sled/config-partial.toml diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml new file mode 100644 index 0000000000..aff0a8a25f --- /dev/null +++ b/smf/nexus/single-sled/config-partial.toml @@ -0,0 +1,45 @@ +# +# Oxide API: partial configuration file +# + +[console] +# Directory for static assets. Absolute path or relative to CWD. +static_dir = "/var/nexus/static" +session_idle_timeout_minutes = 60 +session_absolute_timeout_minutes = 480 + +[authn] +schemes_external = ["session_cookie", "access_token"] + +[log] +# Show log messages of this level and more severe +level = "debug" +mode = "file" +path = "/dev/stdout" +if_exists = "append" + +# TODO: Uncomment the following lines to enable automatic schema +# migration on boot. +# +# [schema] +# schema_dir = "/var/nexus/schema/crdb" + +[background_tasks] +dns_internal.period_secs_config = 60 +dns_internal.period_secs_servers = 60 +dns_internal.period_secs_propagation = 60 +dns_internal.max_concurrent_server_updates = 5 +dns_external.period_secs_config = 60 +dns_external.period_secs_servers = 60 +dns_external.period_secs_propagation = 60 +dns_external.max_concurrent_server_updates = 5 +# How frequently we check the list of stored TLS certificates. 
This is +# approximately an upper bound on how soon after updating the list of +# certificates it will take _other_ Nexus instances to notice and stop serving +# them (on a sunny day). +external_endpoints.period_secs = 60 + +[default_region_allocation_strategy] +# by default, allocate without requirement for distinct sleds. +# seed is omitted so a new seed will be chosen with every allocation. +type = "random" \ No newline at end of file From 82a5867a8419fdc2a6c09e2292b5530c0c3eb4be Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Fri, 1 Sep 2023 22:02:38 +0000 Subject: [PATCH 04/13] fix test_repo_configs_are_valid test --- common/src/nexus_config.rs | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs index 47c567dbe0..ad62c34f92 100644 --- a/common/src/nexus_config.rs +++ b/common/src/nexus_config.rs @@ -875,25 +875,31 @@ mod test { struct DummyConfig { deployment: DeploymentConfig, } - let config_path = "../smf/nexus/config-partial.toml"; - println!( - "checking {:?} with example deployment section added", - config_path - ); - let mut contents = std::fs::read_to_string(config_path) - .expect("failed to read Nexus SMF config file"); - contents.push_str( - "\n\n\n \ - # !! content below added by test_repo_configs_are_valid()\n\ - \n\n\n", - ); let example_deployment = toml::to_string_pretty(&DummyConfig { deployment: example_config.deployment, }) .unwrap(); - contents.push_str(&example_deployment); - let _: Config = toml::from_str(&contents) - .expect("Nexus SMF config file is not valid"); + + let nexus_config_paths = [ + "../smf/nexus/single-sled/config-partial.toml", + "../smf/nexus/multi-sled/config-partial.toml", + ]; + for config_path in nexus_config_paths { + println!( + "checking {:?} with example deployment section added", + config_path + ); + let mut contents = std::fs::read_to_string(config_path) + .expect("failed to read Nexus SMF config file"); + contents.push_str( + "\n\n\n \ + # !! content below added by test_repo_configs_are_valid()\n\ + \n\n\n", + ); + contents.push_str(&example_deployment); + let _: Config = toml::from_str(&contents) + .expect("Nexus SMF config file is not valid"); + } } #[test] From 61ae4aca1d107d639519d497866b6a0caac841e8 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Fri, 1 Sep 2023 22:26:14 +0000 Subject: [PATCH 05/13] add a nice error message for you --- nexus/db-queries/src/db/queries/region_allocation.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index b73697ec34..7f7b2ea9bf 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -53,7 +53,7 @@ pub fn from_diesel(e: async_bb8_diesel::ConnectionError) -> external::Error { } NOT_ENOUGH_ZPOOL_SPACE_SENTINEL => { return external::Error::unavail( - "Not enough zpool space to allocate disks", + "Not enough zpool space to allocate disks. There may not be enough disks with space for the requested region. 
You may also see this if your rack is in a degraded state, or you're running the default multi-rack topology configuration in a 1-sled development environment.", ); } NOT_ENOUGH_UNIQUE_ZPOOLS_SENTINEL => { From e859e1263b8593929b13205e24bc206ac37f8e24 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Sat, 2 Sep 2023 00:12:08 +0000 Subject: [PATCH 06/13] actually put the manifest.xml --- package-manifest.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/package-manifest.toml b/package-manifest.toml index d892b9f8b0..c776f6d96d 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -92,6 +92,7 @@ source.rust.binary_names = ["nexus", "schema-updater"] source.rust.release = true source.paths = [ { from = "/opt/ooce/pgsql-13/lib/amd64", to = "/opt/ooce/pgsql-13/lib/amd64" }, + { from = "smf/nexus/manifest.xml", to = "/var/svc/manifest/site/nexus/manifest.xml" }, { from = "smf/nexus/{{rack-topology}}", to = "/var/svc/manifest/site/nexus" }, { from = "out/console-assets", to = "/var/nexus/static" }, { from = "schema/crdb", to = "/var/nexus/schema/crdb" }, From dc1893fa825d1b9049fc306eebef0ce3a86235ee Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Thu, 7 Sep 2023 04:52:46 +0000 Subject: [PATCH 07/13] ensure newlines after nexus config-partial.toml --- sled-agent/src/services.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 96cdf8222b..60f0965612 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1513,6 +1513,9 @@ impl ServiceManager { .open(&config_path) .await .map_err(|err| Error::io_path(&config_path, err))?; + file.write_all(b"\n\n") + .await + .map_err(|err| Error::io_path(&config_path, err))?; file.write_all(config_str.as_bytes()) .await .map_err(|err| Error::io_path(&config_path, err))?; From 75fa7261b3e27ed6b3997029882d1a55e8b2345c Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Fri, 8 Sep 2023 00:07:37 +0000 Subject: [PATCH 08/13] does... does this work? --- .github/buildomat/jobs/deploy.sh | 1 + .github/buildomat/jobs/package.sh | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/.github/buildomat/jobs/deploy.sh b/.github/buildomat/jobs/deploy.sh index 5d3dd8ec39..c2579d98ea 100755 --- a/.github/buildomat/jobs/deploy.sh +++ b/.github/buildomat/jobs/deploy.sh @@ -143,6 +143,7 @@ cd /opt/oxide/work ptime -m tar xvzf /input/package/work/package.tar.gz cp /input/package/work/zones/* out/ +mv out/omicron-nexus-single-sled.tar.gz out/omicron-nexus.tar.gz mkdir tests for p in /input/ci-tools/work/end-to-end-tests/*.gz; do ptime -m gunzip < "$p" > "tests/$(basename "${p%.gz}")" diff --git a/.github/buildomat/jobs/package.sh b/.github/buildomat/jobs/package.sh index 4c9bd01a73..64c087524e 100755 --- a/.github/buildomat/jobs/package.sh +++ b/.github/buildomat/jobs/package.sh @@ -81,6 +81,10 @@ stamp_packages() { done } +# Keep the single-sled Nexus zone around for the deploy job. (The global zone +# build below overwrites the file.) 
+mv out/omicron-nexus.tar.gz out/omicron-nexus-single-sled.tar.gz + # Build necessary for the global zone ptime -m cargo run --locked --release --bin omicron-package -- \ -t host target create -i standard -m gimlet -s asic -r multi-sled @@ -111,6 +115,7 @@ zones=( out/external-dns.tar.gz out/internal-dns.tar.gz out/omicron-nexus.tar.gz + out/omicron-nexus-single-sled.tar.gz out/oximeter-collector.tar.gz out/propolis-server.tar.gz out/switch-*.tar.gz From 3fa73a4ca2fdf4a9cdfe0b2d8040fe92e8892b40 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Sun, 17 Sep 2023 03:41:50 +0000 Subject: [PATCH 09/13] is this anything --- .github/buildomat/jobs/tuf-repo.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/buildomat/jobs/tuf-repo.sh b/.github/buildomat/jobs/tuf-repo.sh index 0212c98769..d6c97fe902 100644 --- a/.github/buildomat/jobs/tuf-repo.sh +++ b/.github/buildomat/jobs/tuf-repo.sh @@ -81,6 +81,7 @@ target/release/omicron-package -t default target create -i standard -m gimlet -s ln -s /input/package/work/zones/* out/ rm out/switch-softnpu.tar.gz # not used when target switch=asic rm out/omicron-gateway-softnpu.tar.gz # not used when target switch=asic +rm out/omicron-nexus-single-sled.tar.gz # only used for deploy tests for zone in out/*.tar.gz; do target/release/omicron-package stamp "$(basename "${zone%.tar.gz}")" "$VERSION" done From d9845ec7dff25e87fd5a9dead185a8e2881e7d53 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Sun, 17 Sep 2023 05:17:08 +0000 Subject: [PATCH 10/13] rebase --- nexus/src/app/sagas/disk_create.rs | 3 +-- nexus/src/app/sagas/snapshot_create.rs | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index 30e0e6c8dd..275c8738cc 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -12,11 +12,10 @@ use super::{ ACTION_GENERATE_ID, }; use crate::app::sagas::declare_saga_actions; +use crate::app::{authn, authz, db}; use crate::external_api::params; -use nexus_db_queries::db::datastore::RegionAllocationStrategy; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; -use crate::{authn, authz, db}; use omicron_common::api::external::DiskState; use omicron_common::api::external::Error; use rand::{rngs::StdRng, RngCore, SeedableRng}; diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index 85ec91ffd0..eeabf64894 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -100,12 +100,11 @@ use super::{ }; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::retry_until_known_result; +use crate::app::{authn, authz, db}; use crate::external_api::params; -use crate::{authn, authz, db}; use anyhow::anyhow; use crucible_agent_client::{types::RegionId, Client as CrucibleAgentClient}; use nexus_db_model::Generation; -use nexus_db_queries::db::datastore::RegionAllocationStrategy; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external; From e68910806d6ad4cc9c7edc8fafd457dac1479613 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Fri, 29 Sep 2023 23:20:11 +0000 Subject: [PATCH 11/13] make rack_topology required to save me from myself --- package/src/lib.rs | 20 ++++++++++++++++++-- package/src/target.rs | 17 ++++++++++------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/package/src/lib.rs 
b/package/src/lib.rs index 0d6627e714..395f3ed472 100644 --- a/package/src/lib.rs +++ b/package/src/lib.rs @@ -50,9 +50,25 @@ pub enum TargetCommand { #[clap( short, long, - default_value_if("image", "standard", "single-sled") + default_value_if("image", "trampoline", Some("single-sled")), + + // This opt is required, and clap will enforce that even with + // `required = false`, since it's not an Option. But the + // default_value_if only works if we set `required` to false. It's + // jank, but it is what it is. + // https://github.com/clap-rs/clap/issues/4086 + required = false )] - rack_topology: Option, + /// Specify whether nexus will run in a single-sled or multi-sled + /// environment. + /// + /// Set single-sled for dev purposes when you're running a single + /// sled-agent. Set multi-sled if you're running with mulitple sleds. + /// Currently this only affects the crucible disk allocation strategy- + /// VM disks will require 3 distinct sleds with `multi-sled`, which will + /// fail in a single-sled environment. `single-sled` relaxes this + /// requirement. + rack_topology: crate::target::RackTopology, }, /// List all existing targets List, diff --git a/package/src/target.rs b/package/src/target.rs index 7829928e82..d5d5e92c46 100644 --- a/package/src/target.rs +++ b/package/src/target.rs @@ -68,7 +68,7 @@ pub struct KnownTarget { image: Image, machine: Option, switch: Option, - rack_topology: Option, + rack_topology: RackTopology, } impl KnownTarget { @@ -76,7 +76,7 @@ impl KnownTarget { image: Image, machine: Option, switch: Option, - rack_topology: Option, + rack_topology: RackTopology, ) -> Result { if matches!(image, Image::Trampoline) { if machine.is_some() { @@ -103,7 +103,7 @@ impl Default for KnownTarget { image: Image::Standard, machine: Some(Machine::NonGimlet), switch: Some(Switch::Stub), - rack_topology: Some(RackTopology::MultiSled), + rack_topology: RackTopology::MultiSled, } } } @@ -118,9 +118,7 @@ impl From for Target { if let Some(switch) = kt.switch { map.insert("switch".to_string(), switch.to_string()); } - if let Some(rack_topology) = kt.rack_topology { - map.insert("rack-topology".to_string(), rack_topology.to_string()); - } + map.insert("rack-topology".to_string(), kt.rack_topology.to_string()); Target(map) } } @@ -170,6 +168,11 @@ impl std::str::FromStr for KnownTarget { } } } - KnownTarget::new(image, machine, switch, rack_topology) + KnownTarget::new( + image, + machine, + switch, + rack_topology.unwrap_or(RackTopology::MultiSled), + ) } } From ba1c53f710deccad639925ab53b95333001690da Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Fri, 29 Sep 2023 23:34:00 +0000 Subject: [PATCH 12/13] fix the ci --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 722aacbe0f..f5cf1dc885 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,7 +41,7 @@ jobs: - name: Install Pre-Requisites run: ./tools/install_builder_prerequisites.sh -y - name: Set default target - run: cargo run --bin omicron-package -- -t default target create + run: cargo run --bin omicron-package -- -t default target create -r single-sled - name: Check build of deployed Omicron packages run: cargo run --bin omicron-package -- -t default check From 140f973504dc0c6251fd04fa00949ac278650d42 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Sat, 30 Sep 2023 00:34:50 +0000 Subject: [PATCH 13/13] sorry lol --- installinator/Cargo.toml | 1 + 1 file changed, 1 
insertion(+)

diff --git a/installinator/Cargo.toml b/installinator/Cargo.toml
index 3b2f04c38f..428ea0d08e 100644
--- a/installinator/Cargo.toml
+++ b/installinator/Cargo.toml
@@ -57,3 +57,4 @@ tokio-stream.workspace = true
 [features]
 image-standard = []
 image-trampoline = []
+rack-topology-single-sled = []
\ No newline at end of file