diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs index 73ccec996cb..47c567dbe0e 100644 --- a/common/src/nexus_config.rs +++ b/common/src/nexus_config.rs @@ -372,6 +372,8 @@ pub struct PackageConfig { pub dendrite: HashMap, /// Background task configuration pub background_tasks: BackgroundTaskConfig, + /// Default Crucible region allocation strategy + pub default_region_allocation_strategy: RegionAllocationStrategy, } #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] @@ -594,6 +596,9 @@ mod test { dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 external_endpoints.period_secs = 9 + [default_region_allocation_strategy] + type = "random" + seed = 0 "##, ) .unwrap(); @@ -677,6 +682,10 @@ mod test { period_secs: Duration::from_secs(9), } }, + default_region_allocation_strategy: + crate::nexus_config::RegionAllocationStrategy::Random { + seed: Some(0) + } }, } ); @@ -724,6 +733,8 @@ mod test { dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 external_endpoints.period_secs = 9 + [default_region_allocation_strategy] + type = "random" "##, ) .unwrap(); @@ -894,3 +905,30 @@ mod test { ); } } + +/// Defines a strategy for choosing what physical disks to use when allocating +/// new crucible regions. +/// +/// NOTE: More strategies can - and should! - be added. +/// +/// See for a more +/// complete discussion. +/// +/// Longer-term, we should consider: +/// - Storage size + remaining free space +/// - Sled placement of datasets +/// - What sort of loads we'd like to create (even split across all disks +/// may not be preferable, especially if maintenance is expected) +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum RegionAllocationStrategy { + /// Choose disks pseudo-randomly. An optional seed may be provided to make + /// the ordering deterministic, otherwise the current time in nanoseconds + /// will be used. Ordering is based on sorting the output of `md5(UUID of + /// candidate dataset + seed)`. The seed does not need to come from a + /// cryptographically secure source. + Random { seed: Option }, + + /// Like Random, but ensures that each region is allocated on its own sled. + RandomWithDistinctSleds { seed: Option }, +} diff --git a/nexus/db-model/src/queries/region_allocation.rs b/nexus/db-model/src/queries/region_allocation.rs index 43fac3c9a6c..2025e79fb88 100644 --- a/nexus/db-model/src/queries/region_allocation.rs +++ b/nexus/db-model/src/queries/region_allocation.rs @@ -47,6 +47,13 @@ table! { } } +table! { + shuffled_candidate_datasets { + id -> Uuid, + pool_id -> Uuid, + } +} + table! { candidate_regions { id -> Uuid, @@ -89,6 +96,19 @@ table! { } } +table! { + one_zpool_per_sled (pool_id) { + pool_id -> Uuid + } +} + +table! { + one_dataset_per_zpool { + id -> Uuid, + pool_id -> Uuid + } +} + table! { inserted_regions { id -> Uuid, @@ -141,6 +161,7 @@ diesel::allow_tables_to_appear_in_same_query!( ); diesel::allow_tables_to_appear_in_same_query!(old_regions, dataset,); +diesel::allow_tables_to_appear_in_same_query!(old_regions, zpool,); diesel::allow_tables_to_appear_in_same_query!( inserted_regions, @@ -149,6 +170,7 @@ diesel::allow_tables_to_appear_in_same_query!( diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, dataset,); diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, zpool,); +diesel::allow_tables_to_appear_in_same_query!(candidate_datasets, dataset); // == Needed for random region allocation == diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 13fb132abb6..c3788aec8ef 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -286,43 +286,6 @@ pub enum UpdatePrecondition { Value(T), } -/// Defines a strategy for choosing what physical disks to use when allocating -/// new crucible regions. -/// -/// NOTE: More strategies can - and should! - be added. -/// -/// See for a more -/// complete discussion. -/// -/// Longer-term, we should consider: -/// - Storage size + remaining free space -/// - Sled placement of datasets -/// - What sort of loads we'd like to create (even split across all disks -/// may not be preferable, especially if maintenance is expected) -#[derive(Debug, Clone)] -pub enum RegionAllocationStrategy { - /// Choose disks that have the least data usage in the rack. This strategy - /// can lead to bad failure states wherein the disks with the least usage - /// have the least usage because regions on them are actually failing in - /// some way. Further retried allocations will then continue to try to - /// allocate onto the disk, perpetuating the problem. Currently this - /// strategy only exists so we can test that using different allocation - /// strategies actually results in different allocation patterns, hence the - /// `#[cfg(test)]`. - /// - /// See https://github.com/oxidecomputer/omicron/issues/3416 for more on the - /// failure-states associated with this strategy - #[cfg(test)] - LeastUsedDisk, - - /// Choose disks pseudo-randomly. An optional seed may be provided to make - /// the ordering deterministic, otherwise the current time in nanoseconds - /// will be used. Ordering is based on sorting the output of `md5(UUID of - /// candidate dataset + seed)`. The seed does not need to come from a - /// cryptographically secure source. - Random(Option), -} - /// Constructs a DataStore for use in test suites that has preloaded the /// built-in users, roles, and role assignments that are needed for basic /// operation @@ -400,7 +363,9 @@ mod test { use omicron_common::api::external::{ self, ByteCount, Error, IdentityMetadataCreateParams, LookupType, Name, }; + use omicron_common::nexus_config::RegionAllocationStrategy; use omicron_test_utils::dev; + use std::collections::HashMap; use std::collections::HashSet; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddrV6}; use std::num::NonZeroU32; @@ -683,12 +648,18 @@ mod test { } } + struct TestDataset { + sled_id: Uuid, + dataset_id: Uuid, + } + async fn create_test_datasets_for_region_allocation( opctx: &OpContext, datastore: Arc, - ) -> Vec { + number_of_sleds: usize, + ) -> Vec { // Create sleds... - let sled_ids: Vec = stream::iter(0..REGION_REDUNDANCY_THRESHOLD) + let sled_ids: Vec = stream::iter(0..number_of_sleds) .then(|_| create_test_sled(&datastore)) .collect() .await; @@ -719,48 +690,69 @@ mod test { .collect() .await; + #[derive(Copy, Clone)] + struct Zpool { + sled_id: Uuid, + pool_id: Uuid, + } + // 1 pool per disk - let zpool_ids: Vec = stream::iter(physical_disks) + let zpools: Vec = stream::iter(physical_disks) .then(|disk| { - create_test_zpool(&datastore, disk.sled_id, disk.disk_id) + let pool_id_future = + create_test_zpool(&datastore, disk.sled_id, disk.disk_id); + async move { + let pool_id = pool_id_future.await; + Zpool { sled_id: disk.sled_id, pool_id } + } }) .collect() .await; let bogus_addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0); - // 1 dataset per zpool - let dataset_ids: Vec = stream::iter(zpool_ids) - .then(|zpool_id| { - let id = Uuid::new_v4(); - let dataset = Dataset::new( - id, - zpool_id, - bogus_addr, - DatasetKind::Crucible, - ); - let datastore = datastore.clone(); - async move { - datastore.dataset_upsert(dataset).await.unwrap(); - id - } + let datasets: Vec = stream::iter(zpools) + .map(|zpool| { + // 3 datasets per zpool, to test that pools are distinct + let zpool_iter: Vec = (0..3).map(|_| zpool).collect(); + stream::iter(zpool_iter).then(|zpool| { + let id = Uuid::new_v4(); + let dataset = Dataset::new( + id, + zpool.pool_id, + bogus_addr, + DatasetKind::Crucible, + ); + + let datastore = datastore.clone(); + async move { + datastore.dataset_upsert(dataset).await.unwrap(); + + TestDataset { sled_id: zpool.sled_id, dataset_id: id } + } + }) }) + .flatten() .collect() .await; - dataset_ids + datasets } #[tokio::test] /// Note that this test is currently non-deterministic. It can be made /// deterministic by generating deterministic *dataset* Uuids. The sled and /// pool IDs should not matter. - async fn test_region_allocation() { + async fn test_region_allocation_strat_random() { let logctx = dev::test_setup_log("test_region_allocation"); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; - create_test_datasets_for_region_allocation(&opctx, datastore.clone()) - .await; + create_test_datasets_for_region_allocation( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; // Allocate regions from the datasets for this disk. Do it a few times // for good measure. @@ -778,7 +770,9 @@ mod test { volume_id, ¶ms.disk_source, params.size, - &RegionAllocationStrategy::Random(Some(alloc_seed as u128)), + &RegionAllocationStrategy::Random { + seed: Some(alloc_seed), + }, ) .await .unwrap(); @@ -788,8 +782,79 @@ mod test { let mut disk_datasets = HashSet::new(); let mut disk_zpools = HashSet::new(); - // TODO: When allocation chooses 3 distinct sleds, uncomment this. - // let mut disk1_sleds = HashSet::new(); + for (dataset, region) in dataset_and_regions { + // Must be 3 unique datasets + assert!(disk_datasets.insert(dataset.id())); + + // Must be 3 unique zpools + assert!(disk_zpools.insert(dataset.pool_id)); + + assert_eq!(volume_id, region.volume_id()); + assert_eq!(ByteCount::from(4096), region.block_size()); + let (_, extent_count) = DataStore::get_crucible_allocation( + &BlockSize::AdvancedFormat, + params.size, + ); + assert_eq!(extent_count, region.extent_count()); + } + } + + let _ = db.cleanup().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + /// Test the [`RegionAllocationStrategy::RandomWithDistinctSleds`] strategy. + /// It should always pick datasets where no two datasets are on the same + /// zpool and no two zpools are on the same sled. + async fn test_region_allocation_strat_random_with_distinct_sleds() { + let logctx = dev::test_setup_log("test_region_allocation"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Create a rack without enough sleds for a successful allocation when + // we require 3 distinct sleds. + let test_datasets = create_test_datasets_for_region_allocation( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; + + // We need to check that our datasets end up on 3 distinct sleds, but the query doesn't return the sled ID, so we need to reverse map from dataset ID to sled ID + let sled_id_map: HashMap = test_datasets + .into_iter() + .map(|test_dataset| (test_dataset.dataset_id, test_dataset.sled_id)) + .collect(); + + // Allocate regions from the datasets for this disk. Do it a few times + // for good measure. + for alloc_seed in 0..10 { + let params = create_test_disk_create_params( + &format!("disk{}", alloc_seed), + ByteCount::from_mebibytes_u32(1), + ); + let volume_id = Uuid::new_v4(); + + let expected_region_count = REGION_REDUNDANCY_THRESHOLD; + let dataset_and_regions = datastore + .region_allocate( + &opctx, + volume_id, + ¶ms.disk_source, + params.size, + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: Some(alloc_seed), + }, + ) + .await + .unwrap(); + + // Verify the allocation. + assert_eq!(expected_region_count, dataset_and_regions.len()); + let mut disk_datasets = HashSet::new(); + let mut disk_zpools = HashSet::new(); + let mut disk_sleds = HashSet::new(); for (dataset, region) in dataset_and_regions { // Must be 3 unique datasets assert!(disk_datasets.insert(dataset.id())); @@ -798,8 +863,8 @@ mod test { assert!(disk_zpools.insert(dataset.pool_id)); // Must be 3 unique sleds - // TODO: When allocation chooses 3 distinct sleds, uncomment this. - // assert!(disk1_sleds.insert(Err(dataset))); + let sled_id = sled_id_map.get(&dataset.id()).unwrap(); + assert!(disk_sleds.insert(*sled_id)); assert_eq!(volume_id, region.volume_id()); assert_eq!(ByteCount::from(4096), region.block_size()); @@ -815,14 +880,70 @@ mod test { logctx.cleanup_successful(); } + #[tokio::test] + /// Ensure the [`RegionAllocationStrategy::RandomWithDistinctSleds`] + /// strategy fails when there aren't enough distinct sleds. + async fn test_region_allocation_strat_random_with_distinct_sleds_fails() { + let logctx = dev::test_setup_log("test_region_allocation"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Create a rack without enough sleds for a successful allocation when + // we require 3 distinct sleds. + create_test_datasets_for_region_allocation( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD - 1, + ) + .await; + + // Allocate regions from the datasets for this disk. Do it a few times + // for good measure. + for alloc_seed in 0..10 { + let params = create_test_disk_create_params( + &format!("disk{}", alloc_seed), + ByteCount::from_mebibytes_u32(1), + ); + let volume_id = Uuid::new_v4(); + + let err = datastore + .region_allocate( + &opctx, + volume_id, + ¶ms.disk_source, + params.size, + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: Some(alloc_seed), + }, + ) + .await + .unwrap_err(); + + let expected = "Not enough zpool space to allocate disks"; + assert!( + err.to_string().contains(expected), + "Saw error: \'{err}\', but expected \'{expected}\'" + ); + + assert!(matches!(err, Error::ServiceUnavailable { .. })); + } + + let _ = db.cleanup().await; + logctx.cleanup_successful(); + } + #[tokio::test] async fn test_region_allocation_is_idempotent() { let logctx = dev::test_setup_log("test_region_allocation_is_idempotent"); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; - create_test_datasets_for_region_allocation(&opctx, datastore.clone()) - .await; + create_test_datasets_for_region_allocation( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; // Allocate regions from the datasets for this volume. let params = create_test_disk_create_params( @@ -836,7 +957,7 @@ mod test { volume_id, ¶ms.disk_source, params.size, - &RegionAllocationStrategy::Random(Some(0)), + &RegionAllocationStrategy::Random { seed: Some(0) }, ) .await .unwrap(); @@ -849,7 +970,7 @@ mod test { volume_id, ¶ms.disk_source, params.size, - &RegionAllocationStrategy::Random(Some(1)), + &RegionAllocationStrategy::Random { seed: Some(1) }, ) .await .unwrap(); @@ -938,7 +1059,7 @@ mod test { volume1_id, ¶ms.disk_source, params.size, - &RegionAllocationStrategy::Random(Some(0)), + &RegionAllocationStrategy::Random { seed: Some(0) }, ) .await .unwrap_err(); @@ -962,8 +1083,12 @@ mod test { let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; - create_test_datasets_for_region_allocation(&opctx, datastore.clone()) - .await; + create_test_datasets_for_region_allocation( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; let disk_size = test_zpool_size(); let alloc_size = ByteCount::try_from(disk_size.to_bytes() * 2).unwrap(); @@ -976,7 +1101,7 @@ mod test { volume1_id, ¶ms.disk_source, params.size, - &RegionAllocationStrategy::Random(Some(0)), + &RegionAllocationStrategy::Random { seed: Some(0) }, ) .await .is_err()); diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index 6bfea9085d0..a26442280d5 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -5,7 +5,6 @@ //! [`DataStore`] methods on [`Region`]s. use super::DataStore; -use super::RegionAllocationStrategy; use super::RunnableQuery; use crate::context::OpContext; use crate::db; @@ -23,6 +22,7 @@ use omicron_common::api::external; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; use omicron_common::backoff::{self, BackoffError}; +use omicron_common::nexus_config::RegionAllocationStrategy; use slog::Logger; use uuid::Uuid; diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index 674a525c5c8..0922c533dd4 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -6,7 +6,6 @@ use crate::db::alias::ExpressionAlias; use crate::db::cast_uuid_as_bytea::CastUuidToBytea; -use crate::db::datastore::RegionAllocationStrategy; use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD; use crate::db::model::{Dataset, DatasetKind, Region}; use crate::db::pool::DbConnection; @@ -24,10 +23,11 @@ use diesel::{ use nexus_db_model::queries::region_allocation::{ candidate_datasets, candidate_regions, candidate_zpools, cockroach_md5, do_insert, inserted_regions, old_regions, old_zpool_usage, - proposed_dataset_changes, updated_datasets, + proposed_dataset_changes, shuffled_candidate_datasets, updated_datasets, }; use nexus_db_model::schema; use omicron_common::api::external; +use omicron_common::nexus_config::RegionAllocationStrategy; const NOT_ENOUGH_DATASETS_SENTINEL: &'static str = "Not enough datasets"; const NOT_ENOUGH_ZPOOL_SPACE_SENTINEL: &'static str = "Not enough space"; @@ -91,6 +91,8 @@ impl OldRegions { /// This implicitly distinguishes between "M.2s" and "U.2s" -- Nexus needs to /// determine during dataset provisioning which devices should be considered for /// usage as Crucible storage. +/// +/// We select only one dataset from each zpool. #[derive(Subquery, QueryId)] #[subquery(name = candidate_datasets)] struct CandidateDatasets { @@ -98,71 +100,65 @@ struct CandidateDatasets { } impl CandidateDatasets { - fn new( - allocation_strategy: &RegionAllocationStrategy, - candidate_zpools: &CandidateZpools, - ) -> Self { + fn new(candidate_zpools: &CandidateZpools, seed: u128) -> Self { use crate::db::schema::dataset::dsl as dataset_dsl; use candidate_zpools::dsl as candidate_zpool_dsl; - let query = match allocation_strategy { - #[cfg(test)] - RegionAllocationStrategy::LeastUsedDisk => { - let query: Box< - dyn CteQuery, - > = Box::new( - dataset_dsl::dataset - .inner_join( - candidate_zpools - .query_source() - .on(dataset_dsl::pool_id - .eq(candidate_zpool_dsl::pool_id)), - ) - .filter(dataset_dsl::time_deleted.is_null()) - .filter(dataset_dsl::size_used.is_not_null()) - .filter(dataset_dsl::kind.eq(DatasetKind::Crucible)) - .order(dataset_dsl::size_used.asc()) - .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()) - .select((dataset_dsl::id, dataset_dsl::pool_id)), - ); - query - } - RegionAllocationStrategy::Random(seed) => { - let seed = seed.unwrap_or_else(|| { - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_nanos() - }); - - let seed_bytes = seed.to_le_bytes(); - - let query: Box< - dyn CteQuery, - > = Box::new( - dataset_dsl::dataset - .inner_join( - candidate_zpools - .query_source() - .on(dataset_dsl::pool_id - .eq(candidate_zpool_dsl::pool_id)), - ) - .filter(dataset_dsl::time_deleted.is_null()) - .filter(dataset_dsl::size_used.is_not_null()) - .filter(dataset_dsl::kind.eq(DatasetKind::Crucible)) - // We order by md5 to shuffle the ordering of the datasets. - // md5 has a uniform output distribution so it does the job. - .order(cockroach_md5::dsl::md5( + let seed_bytes = seed.to_le_bytes(); + + let query: Box> = + Box::new( + dataset_dsl::dataset + .inner_join(candidate_zpools.query_source().on( + dataset_dsl::pool_id.eq(candidate_zpool_dsl::pool_id), + )) + .filter(dataset_dsl::time_deleted.is_null()) + .filter(dataset_dsl::size_used.is_not_null()) + .filter(dataset_dsl::kind.eq(DatasetKind::Crucible)) + .distinct_on(dataset_dsl::pool_id) + .order_by(( + dataset_dsl::pool_id, + cockroach_md5::dsl::md5( CastUuidToBytea::new(dataset_dsl::id) .concat(seed_bytes.to_vec()), - )) - .select((dataset_dsl::id, dataset_dsl::pool_id)) - .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()), - ); - query - } - }; + ), + )) + .select((dataset_dsl::id, dataset_dsl::pool_id)), + ); + Self { query } + } +} + +/// Shuffle the candidate datasets, and select REGION_REDUNDANCY_THRESHOLD +/// regions from it. +#[derive(Subquery, QueryId)] +#[subquery(name = shuffled_candidate_datasets)] +struct ShuffledCandidateDatasets { + query: Box>, +} +impl ShuffledCandidateDatasets { + fn new(candidate_datasets: &CandidateDatasets, seed: u128) -> Self { + use candidate_datasets::dsl as candidate_datasets_dsl; + + let seed_bytes = seed.to_le_bytes(); + + let query: Box> = + Box::new( + candidate_datasets + .query_source() + // We order by md5 to shuffle the ordering of the datasets. + // md5 has a uniform output distribution so it does the job. + .order(cockroach_md5::dsl::md5( + CastUuidToBytea::new(candidate_datasets_dsl::id) + .concat(seed_bytes.to_vec()), + )) + .select(( + candidate_datasets_dsl::id, + candidate_datasets_dsl::pool_id, + )) + .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()), + ); Self { query } } } @@ -179,14 +175,14 @@ diesel::sql_function!(fn now() -> Timestamptz); impl CandidateRegions { fn new( - candidate_datasets: &CandidateDatasets, + shuffled_candidate_datasets: &ShuffledCandidateDatasets, volume_id: uuid::Uuid, block_size: u64, blocks_per_extent: u64, extent_count: u64, ) -> Self { - use candidate_datasets::dsl as candidate_datasets_dsl; use schema::region; + use shuffled_candidate_datasets::dsl as shuffled_candidate_datasets_dsl; let volume_id = volume_id.into_sql::(); let block_size = (block_size as i64).into_sql::(); @@ -195,20 +191,22 @@ impl CandidateRegions { let extent_count = (extent_count as i64).into_sql::(); Self { - query: Box::new(candidate_datasets.query_source().select(( - ExpressionAlias::new::(gen_random_uuid()), - ExpressionAlias::new::(now()), - ExpressionAlias::new::(now()), - ExpressionAlias::new::( - candidate_datasets_dsl::id, + query: Box::new(shuffled_candidate_datasets.query_source().select( + ( + ExpressionAlias::new::(gen_random_uuid()), + ExpressionAlias::new::(now()), + ExpressionAlias::new::(now()), + ExpressionAlias::new::( + shuffled_candidate_datasets_dsl::id, + ), + ExpressionAlias::new::(volume_id), + ExpressionAlias::new::(block_size), + ExpressionAlias::new::( + blocks_per_extent, + ), + ExpressionAlias::new::(extent_count), ), - ExpressionAlias::new::(volume_id), - ExpressionAlias::new::(block_size), - ExpressionAlias::new::( - blocks_per_extent, - ), - ExpressionAlias::new::(extent_count), - ))), + )), } } } @@ -285,12 +283,14 @@ struct CandidateZpools { } impl CandidateZpools { - fn new(old_zpool_usage: &OldPoolUsage, zpool_size_delta: u64) -> Self { + fn new( + old_zpool_usage: &OldPoolUsage, + zpool_size_delta: u64, + seed: u128, + distinct_sleds: bool, + ) -> Self { use schema::zpool::dsl as zpool_dsl; - let with_zpool = zpool_dsl::zpool - .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id)); - // Why are we using raw `diesel::dsl::sql` here? // // When SQL performs the "SUM" operation on "bigint" type, the result @@ -309,15 +309,40 @@ impl CandidateZpools { + diesel::dsl::sql(&zpool_size_delta.to_string())) .le(diesel::dsl::sql(zpool_dsl::total_size::NAME)); - Self { - query: Box::new( - old_zpool_usage - .query_source() - .inner_join(with_zpool) - .filter(it_will_fit) - .select((old_zpool_usage::dsl::pool_id,)), - ), - } + let with_zpool = zpool_dsl::zpool + .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id)); + + let base_query = old_zpool_usage + .query_source() + .inner_join(with_zpool) + .filter(it_will_fit) + .select((old_zpool_usage::dsl::pool_id,)); + + let query = if distinct_sleds { + let seed_bytes = seed.to_le_bytes(); + + let query: Box> = + Box::new( + base_query + .order_by(( + zpool_dsl::sled_id, + cockroach_md5::dsl::md5( + CastUuidToBytea::new(zpool_dsl::id) + .concat(seed_bytes.to_vec()), + ), + )) + .distinct_on(zpool_dsl::sled_id), + ); + + query + } else { + let query: Box> = + Box::new(base_query); + + query + }; + + Self { query } } } @@ -508,19 +533,47 @@ impl RegionAllocate { extent_count: u64, allocation_strategy: &RegionAllocationStrategy, ) -> Self { + let (seed, distinct_sleds) = { + let (input_seed, distinct_sleds) = match allocation_strategy { + RegionAllocationStrategy::Random { seed } => (seed, false), + RegionAllocationStrategy::RandomWithDistinctSleds { seed } => { + (seed, true) + } + }; + ( + input_seed.map_or_else( + || { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + }, + |seed| seed as u128, + ), + distinct_sleds, + ) + }; + let size_delta = block_size * blocks_per_extent * extent_count; let old_regions = OldRegions::new(volume_id); let old_pool_usage = OldPoolUsage::new(); - let candidate_zpools = - CandidateZpools::new(&old_pool_usage, size_delta); + let candidate_zpools = CandidateZpools::new( + &old_pool_usage, + size_delta, + seed, + distinct_sleds, + ); let candidate_datasets = - CandidateDatasets::new(&allocation_strategy, &candidate_zpools); + CandidateDatasets::new(&candidate_zpools, seed); + + let shuffled_candidate_datasets = + ShuffledCandidateDatasets::new(&candidate_datasets, seed); let candidate_regions = CandidateRegions::new( - &candidate_datasets, + &shuffled_candidate_datasets, volume_id, block_size, blocks_per_extent, @@ -577,6 +630,7 @@ impl RegionAllocate { .add_subquery(old_pool_usage) .add_subquery(candidate_zpools) .add_subquery(candidate_datasets) + .add_subquery(shuffled_candidate_datasets) .add_subquery(candidate_regions) .add_subquery(proposed_changes) .add_subquery(do_insert) diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index f1b20c32a10..1a9afbc6bdc 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -92,3 +92,14 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 + +[default_region_allocation_strategy] +# allocate region on 3 random distinct zpools, on 3 random distinct sleds. +type = "random_with_distinct_sleds" + +# the same as random_with_distinct_sleds, but without requiring distinct sleds +# type = "random" + +# setting `seed` to a fixed value will make dataset selection ordering use the +# same shuffling order for every region allocation. +# seed = 0 diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 99ed75f14ba..7a03caca404 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -23,6 +23,7 @@ use omicron_common::address::DENDRITE_PORT; use omicron_common::address::MGS_PORT; use omicron_common::api::external::Error; use omicron_common::api::internal::shared::SwitchLocation; +use omicron_common::nexus_config::RegionAllocationStrategy; use slog::Logger; use std::collections::HashMap; use std::net::Ipv6Addr; @@ -152,6 +153,9 @@ pub struct Nexus { /// Background tasks background_tasks: background::BackgroundTasks, + + /// Default Crucible region allocation strategy + default_region_allocation_strategy: RegionAllocationStrategy, } impl Nexus { @@ -324,6 +328,10 @@ impl Nexus { external_resolver, dpd_clients, background_tasks, + default_region_allocation_strategy: config + .pkg + .default_region_allocation_strategy + .clone(), }; // TODO-cleanup all the extra Arcs here seems wrong diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index ff1d0b7174d..ef0eb79b75c 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -16,7 +16,6 @@ use crate::db::identity::{Asset, Resource}; use crate::db::lookup::LookupPath; use crate::external_api::params; use crate::{authn, authz, db}; -use nexus_db_queries::db::datastore::RegionAllocationStrategy; use omicron_common::api::external::DiskState; use omicron_common::api::external::Error; use rand::{rngs::StdRng, RngCore, SeedableRng}; @@ -251,6 +250,9 @@ async fn sdc_alloc_regions( &sagactx, ¶ms.serialized_authn, ); + + let strategy = &osagactx.nexus().default_region_allocation_strategy; + let datasets_and_regions = osagactx .datastore() .region_allocate( @@ -258,7 +260,7 @@ async fn sdc_alloc_regions( volume_id, ¶ms.create_params.disk_source, params.create_params.size, - &RegionAllocationStrategy::Random(None), + &strategy, ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index 81212571e25..8d34bd7a58a 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -107,7 +107,6 @@ use crate::{authn, authz, db}; use anyhow::anyhow; use crucible_agent_client::{types::RegionId, Client as CrucibleAgentClient}; use nexus_db_model::Generation; -use nexus_db_queries::db::datastore::RegionAllocationStrategy; use omicron_common::api::external; use omicron_common::api::external::Error; use rand::{rngs::StdRng, RngCore, SeedableRng}; @@ -328,6 +327,8 @@ async fn ssc_alloc_regions( .await .map_err(ActionError::action_failed)?; + let strategy = &osagactx.nexus().default_region_allocation_strategy; + let datasets_and_regions = osagactx .datastore() .region_allocate( @@ -340,7 +341,7 @@ async fn ssc_alloc_regions( .map_err(|e| ActionError::action_failed(e.to_string()))?, }, external::ByteCount::from(disk.size), - &RegionAllocationStrategy::Random(None), + &strategy, ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 6eeacceaedd..1b1ae2c9129 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -89,3 +89,8 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 + +[default_region_allocation_strategy] +# we only have one sled in the test environment, so we need to use the +# `Random` strategy, instead of `RandomWithDistinctSleds` +type = "random" \ No newline at end of file diff --git a/smf/nexus/config-partial.toml b/smf/nexus/config-partial.toml index b29727c4aa5..2dfee81d026 100644 --- a/smf/nexus/config-partial.toml +++ b/smf/nexus/config-partial.toml @@ -38,3 +38,8 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 + +[default_region_allocation_strategy] +# by default, allocate across 3 distinct sleds +# seed is omitted so a new seed will be chosen with every allocation. +type = "random_with_distinct_sleds" \ No newline at end of file