From abc7293397e60fcfb77e18075b9b5c0374fbf936 Mon Sep 17 00:00:00 2001
From: Artemis Everfree <artemis@oxidecomputer.com>
Date: Wed, 9 Aug 2023 04:03:50 +0000
Subject: [PATCH] RandomWithDistinctSleds region allocation strategy

PR #3650 introduced the Random region allocation strategy to allocate
regions randomly across the rack. This expands on that with the addition
of the RandomWithDistinctSleds region allocation strategy. This strategy
is the same, but requires the 3 crucible regions be allocated on 3
different sleds to improve resiliency against a whole-sled failure.

The Random strategy still exists, and does not require 3 distinct sleds.
This is useful in one-sled environments such as the integration tests,
and lab setups. This PR adds the ability to configure the allocation
strategy in the Nexus PackageConfig toml. Anyone running in a one-sled
setup will need to configure that to one-sled mode (as is done for the
integration test environment).

This also fixes a shortcoming of #3650 whereby multiple datasets on a
single zpool could be selected. That fix applies to both the old Random
strategy and the new RandomWithDistinctSleds strategy.

`smf/nexus/config-partial.toml` is configured for
RandomWithDistinctSleds, as that is what we want to use on prod.

As I mentioned, the integration tests are not using the distinct sleds
allocation strategy. I attempted to add 2 extra sleds to the simulated
environment but found that this broke more things than I had the
understanding to fix in this PR. It would be nice in the future for the
sim environment to have 3 sleds in it though, not just for this but for
anything else that might have different behaviors in a multi-sled setup.

In the present, I have unit tests that verify the allocation behavior
works correctly with cockroachdb, and we can try it out on dogfood.
---
 common/src/nexus_config.rs                    |  38 +++
 .../db-model/src/queries/region_allocation.rs |  22 ++
 nexus/db-queries/src/db/datastore/mod.rs      | 271 +++++++++++++-----
 nexus/db-queries/src/db/datastore/region.rs   |   2 +-
 .../src/db/queries/region_allocation.rs       | 240 ++++++++++------
 nexus/examples/config.toml                    |  11 +
 nexus/src/app/mod.rs                          |   8 +
 nexus/src/app/sagas/disk_create.rs            |   6 +-
 nexus/src/app/sagas/snapshot_create.rs        |   5 +-
 nexus/tests/config.test.toml                  |   5 +
 smf/nexus/config-partial.toml                 |   5 +
 11 files changed, 442 insertions(+), 171 deletions(-)

diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs
index 73ccec996cb..47c567dbe0e 100644
--- a/common/src/nexus_config.rs
+++ b/common/src/nexus_config.rs
@@ -372,6 +372,8 @@ pub struct PackageConfig {
     pub dendrite: HashMap<SwitchLocation, DpdConfig>,
     /// Background task configuration
     pub background_tasks: BackgroundTaskConfig,
+    /// Default Crucible region allocation strategy
+    pub default_region_allocation_strategy: RegionAllocationStrategy,
 }
 
 #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
@@ -594,6 +596,9 @@ mod test {
             dns_external.period_secs_propagation = 7
             dns_external.max_concurrent_server_updates = 8
             external_endpoints.period_secs = 9
+            [default_region_allocation_strategy]
+            type = "random"
+            seed = 0
             "##,
         )
         .unwrap();
@@ -677,6 +682,10 @@ mod test {
                             period_secs: Duration::from_secs(9),
                         }
                     },
+                    default_region_allocation_strategy:
+                        crate::nexus_config::RegionAllocationStrategy::Random {
+                            seed: Some(0)
+                        }
                 },
             }
         );
@@ -724,6 +733,8 @@ mod test {
             dns_external.period_secs_propagation = 7
             dns_external.max_concurrent_server_updates = 8
             external_endpoints.period_secs = 9
+            [default_region_allocation_strategy]
+            type = "random"
             "##,
         )
         .unwrap();
@@ -894,3 +905,30 @@ mod test {
         );
     }
 }
+
+/// Defines a strategy for choosing what physical disks to use when allocating
+/// new crucible regions.
+///
+/// NOTE: More strategies can - and should! - be added.
+///
+/// See <https://rfd.shared.oxide.computer/rfd/0205> for a more
+/// complete discussion.
+///
+/// Longer-term, we should consider:
+/// - Storage size + remaining free space
+/// - Sled placement of datasets
+/// - What sort of loads we'd like to create (even split across all disks
+///   may not be preferable, especially if maintenance is expected)
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum RegionAllocationStrategy {
+    /// Choose disks pseudo-randomly. An optional seed may be provided to make
+    /// the ordering deterministic, otherwise the current time in nanoseconds
+    /// will be used. Ordering is based on sorting the output of `md5(UUID of
+    /// candidate dataset + seed)`. The seed does not need to come from a
+    /// cryptographically secure source.
+    Random { seed: Option<u64> },
+
+    /// Like Random, but ensures that each region is allocated on its own sled.
+    RandomWithDistinctSleds { seed: Option<u64> },
+}
diff --git a/nexus/db-model/src/queries/region_allocation.rs b/nexus/db-model/src/queries/region_allocation.rs
index 43fac3c9a6c..2025e79fb88 100644
--- a/nexus/db-model/src/queries/region_allocation.rs
+++ b/nexus/db-model/src/queries/region_allocation.rs
@@ -47,6 +47,13 @@ table! {
     }
 }
 
+table! {
+    shuffled_candidate_datasets {
+        id -> Uuid,
+        pool_id -> Uuid,
+    }
+}
+
 table! {
     candidate_regions {
         id -> Uuid,
@@ -89,6 +96,19 @@ table! {
     }
 }
 
+table! {
+    one_zpool_per_sled (pool_id) {
+        pool_id -> Uuid
+    }
+}
+
+table! {
+    one_dataset_per_zpool {
+        id -> Uuid,
+        pool_id -> Uuid
+    }
+}
+
 table! {
     inserted_regions {
         id -> Uuid,
@@ -141,6 +161,7 @@ diesel::allow_tables_to_appear_in_same_query!(
 );
 
 diesel::allow_tables_to_appear_in_same_query!(old_regions, dataset,);
+diesel::allow_tables_to_appear_in_same_query!(old_regions, zpool,);
 
 diesel::allow_tables_to_appear_in_same_query!(
     inserted_regions,
@@ -149,6 +170,7 @@ diesel::allow_tables_to_appear_in_same_query!(
 
 diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, dataset,);
 diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, zpool,);
+diesel::allow_tables_to_appear_in_same_query!(candidate_datasets, dataset);
 
 // == Needed for random region allocation ==
 
diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs
index 13fb132abb6..c3788aec8ef 100644
--- a/nexus/db-queries/src/db/datastore/mod.rs
+++ b/nexus/db-queries/src/db/datastore/mod.rs
@@ -286,43 +286,6 @@ pub enum UpdatePrecondition<T> {
     Value(T),
 }
 
-/// Defines a strategy for choosing what physical disks to use when allocating
-/// new crucible regions.
-///
-/// NOTE: More strategies can - and should! - be added.
-///
-/// See <https://rfd.shared.oxide.computer/rfd/0205> for a more
-/// complete discussion.
-///
-/// Longer-term, we should consider:
-/// - Storage size + remaining free space
-/// - Sled placement of datasets
-/// - What sort of loads we'd like to create (even split across all disks
-///   may not be preferable, especially if maintenance is expected)
-#[derive(Debug, Clone)]
-pub enum RegionAllocationStrategy {
-    /// Choose disks that have the least data usage in the rack. This strategy
-    /// can lead to bad failure states wherein the disks with the least usage
-    /// have the least usage because regions on them are actually failing in
-    /// some way. Further retried allocations will then continue to try to
-    /// allocate onto the disk, perpetuating the problem. Currently this
-    /// strategy only exists so we can test that using different allocation
-    /// strategies actually results in different allocation patterns, hence the
-    /// `#[cfg(test)]`.
-    ///
-    /// See https://github.com/oxidecomputer/omicron/issues/3416 for more on the
-    /// failure-states associated with this strategy
-    #[cfg(test)]
-    LeastUsedDisk,
-
-    /// Choose disks pseudo-randomly. An optional seed may be provided to make
-    /// the ordering deterministic, otherwise the current time in nanoseconds
-    /// will be used. Ordering is based on sorting the output of `md5(UUID of
-    /// candidate dataset + seed)`. The seed does not need to come from a
-    /// cryptographically secure source.
-    Random(Option<u128>),
-}
-
 /// Constructs a DataStore for use in test suites that has preloaded the
 /// built-in users, roles, and role assignments that are needed for basic
 /// operation
@@ -400,7 +363,9 @@ mod test {
     use omicron_common::api::external::{
         self, ByteCount, Error, IdentityMetadataCreateParams, LookupType, Name,
     };
+    use omicron_common::nexus_config::RegionAllocationStrategy;
     use omicron_test_utils::dev;
+    use std::collections::HashMap;
     use std::collections::HashSet;
     use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddrV6};
     use std::num::NonZeroU32;
@@ -683,12 +648,18 @@ mod test {
         }
     }
 
+    struct TestDataset {
+        sled_id: Uuid,
+        dataset_id: Uuid,
+    }
+
     async fn create_test_datasets_for_region_allocation(
         opctx: &OpContext,
         datastore: Arc<DataStore>,
-    ) -> Vec<Uuid> {
+        number_of_sleds: usize,
+    ) -> Vec<TestDataset> {
         // Create sleds...
-        let sled_ids: Vec<Uuid> = stream::iter(0..REGION_REDUNDANCY_THRESHOLD)
+        let sled_ids: Vec<Uuid> = stream::iter(0..number_of_sleds)
             .then(|_| create_test_sled(&datastore))
             .collect()
             .await;
@@ -719,48 +690,69 @@ mod test {
             .collect()
             .await;
 
+        #[derive(Copy, Clone)]
+        struct Zpool {
+            sled_id: Uuid,
+            pool_id: Uuid,
+        }
+
         // 1 pool per disk
-        let zpool_ids: Vec<Uuid> = stream::iter(physical_disks)
+        let zpools: Vec<Zpool> = stream::iter(physical_disks)
             .then(|disk| {
-                create_test_zpool(&datastore, disk.sled_id, disk.disk_id)
+                let pool_id_future =
+                    create_test_zpool(&datastore, disk.sled_id, disk.disk_id);
+                async move {
+                    let pool_id = pool_id_future.await;
+                    Zpool { sled_id: disk.sled_id, pool_id }
+                }
             })
             .collect()
             .await;
 
         let bogus_addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0);
 
-        // 1 dataset per zpool
-        let dataset_ids: Vec<Uuid> = stream::iter(zpool_ids)
-            .then(|zpool_id| {
-                let id = Uuid::new_v4();
-                let dataset = Dataset::new(
-                    id,
-                    zpool_id,
-                    bogus_addr,
-                    DatasetKind::Crucible,
-                );
-                let datastore = datastore.clone();
-                async move {
-                    datastore.dataset_upsert(dataset).await.unwrap();
-                    id
-                }
+        let datasets: Vec<TestDataset> = stream::iter(zpools)
+            .map(|zpool| {
+                // 3 datasets per zpool, to test that pools are distinct
+                let zpool_iter: Vec<Zpool> = (0..3).map(|_| zpool).collect();
+                stream::iter(zpool_iter).then(|zpool| {
+                    let id = Uuid::new_v4();
+                    let dataset = Dataset::new(
+                        id,
+                        zpool.pool_id,
+                        bogus_addr,
+                        DatasetKind::Crucible,
+                    );
+
+                    let datastore = datastore.clone();
+                    async move {
+                        datastore.dataset_upsert(dataset).await.unwrap();
+
+                        TestDataset { sled_id: zpool.sled_id, dataset_id: id }
+                    }
+                })
             })
+            .flatten()
             .collect()
             .await;
 
-        dataset_ids
+        datasets
     }
 
     #[tokio::test]
     /// Note that this test is currently non-deterministic. It can be made
     /// deterministic by generating deterministic *dataset* Uuids. The sled and
     /// pool IDs should not matter.
-    async fn test_region_allocation() {
+    async fn test_region_allocation_strat_random() {
         let logctx = dev::test_setup_log("test_region_allocation");
         let mut db = test_setup_database(&logctx.log).await;
         let (opctx, datastore) = datastore_test(&logctx, &db).await;
-        create_test_datasets_for_region_allocation(&opctx, datastore.clone())
-            .await;
+        create_test_datasets_for_region_allocation(
+            &opctx,
+            datastore.clone(),
+            REGION_REDUNDANCY_THRESHOLD,
+        )
+        .await;
 
         // Allocate regions from the datasets for this disk. Do it a few times
         // for good measure.
@@ -778,7 +770,9 @@ mod test {
                     volume_id,
                     &params.disk_source,
                     params.size,
-                    &RegionAllocationStrategy::Random(Some(alloc_seed as u128)),
+                    &RegionAllocationStrategy::Random {
+                        seed: Some(alloc_seed),
+                    },
                 )
                 .await
                 .unwrap();
@@ -788,8 +782,79 @@ mod test {
             let mut disk_datasets = HashSet::new();
             let mut disk_zpools = HashSet::new();
 
-            // TODO: When allocation chooses 3 distinct sleds, uncomment this.
-            // let mut disk1_sleds = HashSet::new();
+            for (dataset, region) in dataset_and_regions {
+                // Must be 3 unique datasets
+                assert!(disk_datasets.insert(dataset.id()));
+
+                // Must be 3 unique zpools
+                assert!(disk_zpools.insert(dataset.pool_id));
+
+                assert_eq!(volume_id, region.volume_id());
+                assert_eq!(ByteCount::from(4096), region.block_size());
+                let (_, extent_count) = DataStore::get_crucible_allocation(
+                    &BlockSize::AdvancedFormat,
+                    params.size,
+                );
+                assert_eq!(extent_count, region.extent_count());
+            }
+        }
+
+        let _ = db.cleanup().await;
+        logctx.cleanup_successful();
+    }
+
+    #[tokio::test]
+    /// Test the [`RegionAllocationStrategy::RandomWithDistinctSleds`] strategy.
+    /// It should always pick datasets where no two datasets are on the same
+    /// zpool and no two zpools are on the same sled.
+    async fn test_region_allocation_strat_random_with_distinct_sleds() {
+        let logctx = dev::test_setup_log("test_region_allocation");
+        let mut db = test_setup_database(&logctx.log).await;
+        let (opctx, datastore) = datastore_test(&logctx, &db).await;
+
+        // Create a rack with exactly enough sleds for a successful allocation
+        // when we require 3 distinct sleds.
+        let test_datasets = create_test_datasets_for_region_allocation(
+            &opctx,
+            datastore.clone(),
+            REGION_REDUNDANCY_THRESHOLD,
+        )
+        .await;
+
+        // We need to check that our datasets end up on 3 distinct sleds, but the query doesn't return the sled ID, so we need to reverse map from dataset ID to sled ID
+        let sled_id_map: HashMap<Uuid, Uuid> = test_datasets
+            .into_iter()
+            .map(|test_dataset| (test_dataset.dataset_id, test_dataset.sled_id))
+            .collect();
+
+        // Allocate regions from the datasets for this disk. Do it a few times
+        // for good measure.
+        for alloc_seed in 0..10 {
+            let params = create_test_disk_create_params(
+                &format!("disk{}", alloc_seed),
+                ByteCount::from_mebibytes_u32(1),
+            );
+            let volume_id = Uuid::new_v4();
+
+            let expected_region_count = REGION_REDUNDANCY_THRESHOLD;
+            let dataset_and_regions = datastore
+                .region_allocate(
+                    &opctx,
+                    volume_id,
+                    &params.disk_source,
+                    params.size,
+                    &&RegionAllocationStrategy::RandomWithDistinctSleds {
+                        seed: Some(alloc_seed),
+                    },
+                )
+                .await
+                .unwrap();
+
+            // Verify the allocation.
+            assert_eq!(expected_region_count, dataset_and_regions.len());
+            let mut disk_datasets = HashSet::new();
+            let mut disk_zpools = HashSet::new();
+            let mut disk_sleds = HashSet::new();
             for (dataset, region) in dataset_and_regions {
                 // Must be 3 unique datasets
                 assert!(disk_datasets.insert(dataset.id()));
@@ -798,8 +863,8 @@ mod test {
                 assert!(disk_zpools.insert(dataset.pool_id));
 
                 // Must be 3 unique sleds
-                // TODO: When allocation chooses 3 distinct sleds, uncomment this.
-                // assert!(disk1_sleds.insert(Err(dataset)));
+                let sled_id = sled_id_map.get(&dataset.id()).unwrap();
+                assert!(disk_sleds.insert(*sled_id));
 
                 assert_eq!(volume_id, region.volume_id());
                 assert_eq!(ByteCount::from(4096), region.block_size());
@@ -815,14 +880,70 @@ mod test {
         logctx.cleanup_successful();
     }
 
+    #[tokio::test]
+    /// Ensure the [`RegionAllocationStrategy::RandomWithDistinctSleds`]
+    /// strategy fails when there aren't enough distinct sleds.
+    async fn test_region_allocation_strat_random_with_distinct_sleds_fails() {
+        let logctx = dev::test_setup_log("test_region_allocation");
+        let mut db = test_setup_database(&logctx.log).await;
+        let (opctx, datastore) = datastore_test(&logctx, &db).await;
+
+        // Create a rack without enough sleds for a successful allocation when
+        // we require 3 distinct sleds.
+        create_test_datasets_for_region_allocation(
+            &opctx,
+            datastore.clone(),
+            REGION_REDUNDANCY_THRESHOLD - 1,
+        )
+        .await;
+
+        // Attempt to allocate regions from the datasets for this disk. Do it
+        // a few times for good measure; every attempt should fail.
+        for alloc_seed in 0..10 {
+            let params = create_test_disk_create_params(
+                &format!("disk{}", alloc_seed),
+                ByteCount::from_mebibytes_u32(1),
+            );
+            let volume_id = Uuid::new_v4();
+
+            let err = datastore
+                .region_allocate(
+                    &opctx,
+                    volume_id,
+                    &params.disk_source,
+                    params.size,
+                    &&RegionAllocationStrategy::RandomWithDistinctSleds {
+                        seed: Some(alloc_seed),
+                    },
+                )
+                .await
+                .unwrap_err();
+
+            let expected = "Not enough zpool space to allocate disks";
+            assert!(
+                err.to_string().contains(expected),
+                "Saw error: \'{err}\', but expected \'{expected}\'"
+            );
+
+            assert!(matches!(err, Error::ServiceUnavailable { .. }));
+        }
+
+        let _ = db.cleanup().await;
+        logctx.cleanup_successful();
+    }
+
     #[tokio::test]
     async fn test_region_allocation_is_idempotent() {
         let logctx =
             dev::test_setup_log("test_region_allocation_is_idempotent");
         let mut db = test_setup_database(&logctx.log).await;
         let (opctx, datastore) = datastore_test(&logctx, &db).await;
-        create_test_datasets_for_region_allocation(&opctx, datastore.clone())
-            .await;
+        create_test_datasets_for_region_allocation(
+            &opctx,
+            datastore.clone(),
+            REGION_REDUNDANCY_THRESHOLD,
+        )
+        .await;
 
         // Allocate regions from the datasets for this volume.
         let params = create_test_disk_create_params(
@@ -836,7 +957,7 @@ mod test {
                 volume_id,
                 &params.disk_source,
                 params.size,
-                &RegionAllocationStrategy::Random(Some(0)),
+                &RegionAllocationStrategy::Random { seed: Some(0) },
             )
             .await
             .unwrap();
@@ -849,7 +970,7 @@ mod test {
                 volume_id,
                 &params.disk_source,
                 params.size,
-                &RegionAllocationStrategy::Random(Some(1)),
+                &RegionAllocationStrategy::Random { seed: Some(1) },
             )
             .await
             .unwrap();
@@ -938,7 +1059,7 @@ mod test {
                 volume1_id,
                 &params.disk_source,
                 params.size,
-                &RegionAllocationStrategy::Random(Some(0)),
+                &RegionAllocationStrategy::Random { seed: Some(0) },
             )
             .await
             .unwrap_err();
@@ -962,8 +1083,12 @@ mod test {
         let mut db = test_setup_database(&logctx.log).await;
         let (opctx, datastore) = datastore_test(&logctx, &db).await;
 
-        create_test_datasets_for_region_allocation(&opctx, datastore.clone())
-            .await;
+        create_test_datasets_for_region_allocation(
+            &opctx,
+            datastore.clone(),
+            REGION_REDUNDANCY_THRESHOLD,
+        )
+        .await;
 
         let disk_size = test_zpool_size();
         let alloc_size = ByteCount::try_from(disk_size.to_bytes() * 2).unwrap();
@@ -976,7 +1101,7 @@ mod test {
                 volume1_id,
                 &params.disk_source,
                 params.size,
-                &RegionAllocationStrategy::Random(Some(0)),
+                &RegionAllocationStrategy::Random { seed: Some(0) },
             )
             .await
             .is_err());
diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs
index 6bfea9085d0..a26442280d5 100644
--- a/nexus/db-queries/src/db/datastore/region.rs
+++ b/nexus/db-queries/src/db/datastore/region.rs
@@ -5,7 +5,6 @@
 //! [`DataStore`] methods on [`Region`]s.
 
 use super::DataStore;
-use super::RegionAllocationStrategy;
 use super::RunnableQuery;
 use crate::context::OpContext;
 use crate::db;
@@ -23,6 +22,7 @@ use omicron_common::api::external;
 use omicron_common::api::external::DeleteResult;
 use omicron_common::api::external::Error;
 use omicron_common::backoff::{self, BackoffError};
+use omicron_common::nexus_config::RegionAllocationStrategy;
 use slog::Logger;
 use uuid::Uuid;
 
diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index 674a525c5c8..0922c533dd4 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -6,7 +6,6 @@
 
 use crate::db::alias::ExpressionAlias;
 use crate::db::cast_uuid_as_bytea::CastUuidToBytea;
-use crate::db::datastore::RegionAllocationStrategy;
 use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD;
 use crate::db::model::{Dataset, DatasetKind, Region};
 use crate::db::pool::DbConnection;
@@ -24,10 +23,11 @@ use diesel::{
 use nexus_db_model::queries::region_allocation::{
     candidate_datasets, candidate_regions, candidate_zpools, cockroach_md5,
     do_insert, inserted_regions, old_regions, old_zpool_usage,
-    proposed_dataset_changes, updated_datasets,
+    proposed_dataset_changes, shuffled_candidate_datasets, updated_datasets,
 };
 use nexus_db_model::schema;
 use omicron_common::api::external;
+use omicron_common::nexus_config::RegionAllocationStrategy;
 
 const NOT_ENOUGH_DATASETS_SENTINEL: &'static str = "Not enough datasets";
 const NOT_ENOUGH_ZPOOL_SPACE_SENTINEL: &'static str = "Not enough space";
@@ -91,6 +91,8 @@ impl OldRegions {
 /// This implicitly distinguishes between "M.2s" and "U.2s" -- Nexus needs to
 /// determine during dataset provisioning which devices should be considered for
 /// usage as Crucible storage.
+///
+/// We select only one dataset from each zpool.
 #[derive(Subquery, QueryId)]
 #[subquery(name = candidate_datasets)]
 struct CandidateDatasets {
@@ -98,71 +100,65 @@ struct CandidateDatasets {
 }
 
 impl CandidateDatasets {
-    fn new(
-        allocation_strategy: &RegionAllocationStrategy,
-        candidate_zpools: &CandidateZpools,
-    ) -> Self {
+    fn new(candidate_zpools: &CandidateZpools, seed: u128) -> Self {
         use crate::db::schema::dataset::dsl as dataset_dsl;
         use candidate_zpools::dsl as candidate_zpool_dsl;
 
-        let query = match allocation_strategy {
-            #[cfg(test)]
-            RegionAllocationStrategy::LeastUsedDisk => {
-                let query: Box<
-                    dyn CteQuery<SqlType = candidate_datasets::SqlType>,
-                > = Box::new(
-                    dataset_dsl::dataset
-                        .inner_join(
-                            candidate_zpools
-                                .query_source()
-                                .on(dataset_dsl::pool_id
-                                    .eq(candidate_zpool_dsl::pool_id)),
-                        )
-                        .filter(dataset_dsl::time_deleted.is_null())
-                        .filter(dataset_dsl::size_used.is_not_null())
-                        .filter(dataset_dsl::kind.eq(DatasetKind::Crucible))
-                        .order(dataset_dsl::size_used.asc())
-                        .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap())
-                        .select((dataset_dsl::id, dataset_dsl::pool_id)),
-                );
-                query
-            }
-            RegionAllocationStrategy::Random(seed) => {
-                let seed = seed.unwrap_or_else(|| {
-                    std::time::SystemTime::now()
-                        .duration_since(std::time::UNIX_EPOCH)
-                        .unwrap()
-                        .as_nanos()
-                });
-
-                let seed_bytes = seed.to_le_bytes();
-
-                let query: Box<
-                    dyn CteQuery<SqlType = candidate_datasets::SqlType>,
-                > = Box::new(
-                    dataset_dsl::dataset
-                        .inner_join(
-                            candidate_zpools
-                                .query_source()
-                                .on(dataset_dsl::pool_id
-                                    .eq(candidate_zpool_dsl::pool_id)),
-                        )
-                        .filter(dataset_dsl::time_deleted.is_null())
-                        .filter(dataset_dsl::size_used.is_not_null())
-                        .filter(dataset_dsl::kind.eq(DatasetKind::Crucible))
-                        // We order by md5 to shuffle the ordering of the datasets.
-                        // md5 has a uniform output distribution so it does the job.
-                        .order(cockroach_md5::dsl::md5(
+        let seed_bytes = seed.to_le_bytes();
+
+        let query: Box<dyn CteQuery<SqlType = candidate_datasets::SqlType>> =
+            Box::new(
+                dataset_dsl::dataset
+                    .inner_join(candidate_zpools.query_source().on(
+                        dataset_dsl::pool_id.eq(candidate_zpool_dsl::pool_id),
+                    ))
+                    .filter(dataset_dsl::time_deleted.is_null())
+                    .filter(dataset_dsl::size_used.is_not_null())
+                    .filter(dataset_dsl::kind.eq(DatasetKind::Crucible))
+                    .distinct_on(dataset_dsl::pool_id)
+                    .order_by((
+                        dataset_dsl::pool_id,
+                        cockroach_md5::dsl::md5(
                             CastUuidToBytea::new(dataset_dsl::id)
                                 .concat(seed_bytes.to_vec()),
-                        ))
-                        .select((dataset_dsl::id, dataset_dsl::pool_id))
-                        .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()),
-                );
-                query
-            }
-        };
+                        ),
+                    ))
+                    .select((dataset_dsl::id, dataset_dsl::pool_id)),
+            );
+        Self { query }
+    }
+}
+
+/// Shuffle the candidate datasets, and select REGION_REDUNDANCY_THRESHOLD
+/// datasets from them.
+#[derive(Subquery, QueryId)]
+#[subquery(name = shuffled_candidate_datasets)]
+struct ShuffledCandidateDatasets {
+    query: Box<dyn CteQuery<SqlType = shuffled_candidate_datasets::SqlType>>,
+}
 
+impl ShuffledCandidateDatasets {
+    fn new(candidate_datasets: &CandidateDatasets, seed: u128) -> Self {
+        use candidate_datasets::dsl as candidate_datasets_dsl;
+
+        let seed_bytes = seed.to_le_bytes();
+
+        let query: Box<dyn CteQuery<SqlType = candidate_datasets::SqlType>> =
+            Box::new(
+                candidate_datasets
+                    .query_source()
+                    // We order by md5 to shuffle the ordering of the datasets.
+                    // md5 has a uniform output distribution so it does the job.
+                    .order(cockroach_md5::dsl::md5(
+                        CastUuidToBytea::new(candidate_datasets_dsl::id)
+                            .concat(seed_bytes.to_vec()),
+                    ))
+                    .select((
+                        candidate_datasets_dsl::id,
+                        candidate_datasets_dsl::pool_id,
+                    ))
+                    .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()),
+            );
         Self { query }
     }
 }
@@ -179,14 +175,14 @@ diesel::sql_function!(fn now() -> Timestamptz);
 
 impl CandidateRegions {
     fn new(
-        candidate_datasets: &CandidateDatasets,
+        shuffled_candidate_datasets: &ShuffledCandidateDatasets,
         volume_id: uuid::Uuid,
         block_size: u64,
         blocks_per_extent: u64,
         extent_count: u64,
     ) -> Self {
-        use candidate_datasets::dsl as candidate_datasets_dsl;
         use schema::region;
+        use shuffled_candidate_datasets::dsl as shuffled_candidate_datasets_dsl;
 
         let volume_id = volume_id.into_sql::<sql_types::Uuid>();
         let block_size = (block_size as i64).into_sql::<sql_types::BigInt>();
@@ -195,20 +191,22 @@ impl CandidateRegions {
         let extent_count =
             (extent_count as i64).into_sql::<sql_types::BigInt>();
         Self {
-            query: Box::new(candidate_datasets.query_source().select((
-                ExpressionAlias::new::<region::id>(gen_random_uuid()),
-                ExpressionAlias::new::<region::time_created>(now()),
-                ExpressionAlias::new::<region::time_modified>(now()),
-                ExpressionAlias::new::<region::dataset_id>(
-                    candidate_datasets_dsl::id,
+            query: Box::new(shuffled_candidate_datasets.query_source().select(
+                (
+                    ExpressionAlias::new::<region::id>(gen_random_uuid()),
+                    ExpressionAlias::new::<region::time_created>(now()),
+                    ExpressionAlias::new::<region::time_modified>(now()),
+                    ExpressionAlias::new::<region::dataset_id>(
+                        shuffled_candidate_datasets_dsl::id,
+                    ),
+                    ExpressionAlias::new::<region::volume_id>(volume_id),
+                    ExpressionAlias::new::<region::block_size>(block_size),
+                    ExpressionAlias::new::<region::blocks_per_extent>(
+                        blocks_per_extent,
+                    ),
+                    ExpressionAlias::new::<region::extent_count>(extent_count),
                 ),
-                ExpressionAlias::new::<region::volume_id>(volume_id),
-                ExpressionAlias::new::<region::block_size>(block_size),
-                ExpressionAlias::new::<region::blocks_per_extent>(
-                    blocks_per_extent,
-                ),
-                ExpressionAlias::new::<region::extent_count>(extent_count),
-            ))),
+            )),
         }
     }
 }
@@ -285,12 +283,14 @@ struct CandidateZpools {
 }
 
 impl CandidateZpools {
-    fn new(old_zpool_usage: &OldPoolUsage, zpool_size_delta: u64) -> Self {
+    fn new(
+        old_zpool_usage: &OldPoolUsage,
+        zpool_size_delta: u64,
+        seed: u128,
+        distinct_sleds: bool,
+    ) -> Self {
         use schema::zpool::dsl as zpool_dsl;
 
-        let with_zpool = zpool_dsl::zpool
-            .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id));
-
         // Why are we using raw `diesel::dsl::sql` here?
         //
         // When SQL performs the "SUM" operation on "bigint" type, the result
@@ -309,15 +309,40 @@ impl CandidateZpools {
             + diesel::dsl::sql(&zpool_size_delta.to_string()))
         .le(diesel::dsl::sql(zpool_dsl::total_size::NAME));
 
-        Self {
-            query: Box::new(
-                old_zpool_usage
-                    .query_source()
-                    .inner_join(with_zpool)
-                    .filter(it_will_fit)
-                    .select((old_zpool_usage::dsl::pool_id,)),
-            ),
-        }
+        let with_zpool = zpool_dsl::zpool
+            .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id));
+
+        let base_query = old_zpool_usage
+            .query_source()
+            .inner_join(with_zpool)
+            .filter(it_will_fit)
+            .select((old_zpool_usage::dsl::pool_id,));
+
+        let query = if distinct_sleds {
+            let seed_bytes = seed.to_le_bytes();
+
+            let query: Box<dyn CteQuery<SqlType = candidate_zpools::SqlType>> =
+                Box::new(
+                    base_query
+                        .order_by((
+                            zpool_dsl::sled_id,
+                            cockroach_md5::dsl::md5(
+                                CastUuidToBytea::new(zpool_dsl::id)
+                                    .concat(seed_bytes.to_vec()),
+                            ),
+                        ))
+                        .distinct_on(zpool_dsl::sled_id),
+                );
+
+            query
+        } else {
+            let query: Box<dyn CteQuery<SqlType = candidate_zpools::SqlType>> =
+                Box::new(base_query);
+
+            query
+        };
+
+        Self { query }
     }
 }
 
@@ -508,19 +533,47 @@ impl RegionAllocate {
         extent_count: u64,
         allocation_strategy: &RegionAllocationStrategy,
     ) -> Self {
+        let (seed, distinct_sleds) = {
+            let (input_seed, distinct_sleds) = match allocation_strategy {
+                RegionAllocationStrategy::Random { seed } => (seed, false),
+                RegionAllocationStrategy::RandomWithDistinctSleds { seed } => {
+                    (seed, true)
+                }
+            };
+            (
+                input_seed.map_or_else(
+                    || {
+                        std::time::SystemTime::now()
+                            .duration_since(std::time::UNIX_EPOCH)
+                            .unwrap()
+                            .as_nanos()
+                    },
+                    |seed| seed as u128,
+                ),
+                distinct_sleds,
+            )
+        };
+
         let size_delta = block_size * blocks_per_extent * extent_count;
 
         let old_regions = OldRegions::new(volume_id);
 
         let old_pool_usage = OldPoolUsage::new();
-        let candidate_zpools =
-            CandidateZpools::new(&old_pool_usage, size_delta);
+        let candidate_zpools = CandidateZpools::new(
+            &old_pool_usage,
+            size_delta,
+            seed,
+            distinct_sleds,
+        );
 
         let candidate_datasets =
-            CandidateDatasets::new(&allocation_strategy, &candidate_zpools);
+            CandidateDatasets::new(&candidate_zpools, seed);
+
+        let shuffled_candidate_datasets =
+            ShuffledCandidateDatasets::new(&candidate_datasets, seed);
 
         let candidate_regions = CandidateRegions::new(
-            &candidate_datasets,
+            &shuffled_candidate_datasets,
             volume_id,
             block_size,
             blocks_per_extent,
@@ -577,6 +630,7 @@ impl RegionAllocate {
             .add_subquery(old_pool_usage)
             .add_subquery(candidate_zpools)
             .add_subquery(candidate_datasets)
+            .add_subquery(shuffled_candidate_datasets)
             .add_subquery(candidate_regions)
             .add_subquery(proposed_changes)
             .add_subquery(do_insert)
diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml
index f1b20c32a10..1a9afbc6bdc 100644
--- a/nexus/examples/config.toml
+++ b/nexus/examples/config.toml
@@ -92,3 +92,14 @@ dns_external.max_concurrent_server_updates = 5
 # certificates it will take _other_ Nexus instances to notice and stop serving
 # them (on a sunny day).
 external_endpoints.period_secs = 60
+
+[default_region_allocation_strategy]
+# allocate regions on 3 random distinct zpools, located on 3 distinct sleds.
+type = "random_with_distinct_sleds"
+
+# the same as random_with_distinct_sleds, but without requiring distinct sleds
+# type = "random"
+
+# setting `seed` to a fixed value makes every region allocation shuffle the
+# candidate datasets in the same order; omit it to use a new seed each time.
+# seed = 0
diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs
index 99ed75f14ba..7a03caca404 100644
--- a/nexus/src/app/mod.rs
+++ b/nexus/src/app/mod.rs
@@ -23,6 +23,7 @@ use omicron_common::address::DENDRITE_PORT;
 use omicron_common::address::MGS_PORT;
 use omicron_common::api::external::Error;
 use omicron_common::api::internal::shared::SwitchLocation;
+use omicron_common::nexus_config::RegionAllocationStrategy;
 use slog::Logger;
 use std::collections::HashMap;
 use std::net::Ipv6Addr;
@@ -152,6 +153,9 @@ pub struct Nexus {
 
     /// Background tasks
     background_tasks: background::BackgroundTasks,
+
+    /// Default Crucible region allocation strategy
+    default_region_allocation_strategy: RegionAllocationStrategy,
 }
 
 impl Nexus {
@@ -324,6 +328,10 @@ impl Nexus {
             external_resolver,
             dpd_clients,
             background_tasks,
+            default_region_allocation_strategy: config
+                .pkg
+                .default_region_allocation_strategy
+                .clone(),
         };
 
         // TODO-cleanup all the extra Arcs here seems wrong
diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs
index ff1d0b7174d..ef0eb79b75c 100644
--- a/nexus/src/app/sagas/disk_create.rs
+++ b/nexus/src/app/sagas/disk_create.rs
@@ -16,7 +16,6 @@ use crate::db::identity::{Asset, Resource};
 use crate::db::lookup::LookupPath;
 use crate::external_api::params;
 use crate::{authn, authz, db};
-use nexus_db_queries::db::datastore::RegionAllocationStrategy;
 use omicron_common::api::external::DiskState;
 use omicron_common::api::external::Error;
 use rand::{rngs::StdRng, RngCore, SeedableRng};
@@ -251,6 +250,9 @@ async fn sdc_alloc_regions(
         &sagactx,
         &params.serialized_authn,
     );
+
+    let strategy = &osagactx.nexus().default_region_allocation_strategy;
+
     let datasets_and_regions = osagactx
         .datastore()
         .region_allocate(
@@ -258,7 +260,7 @@ async fn sdc_alloc_regions(
             volume_id,
             &params.create_params.disk_source,
             params.create_params.size,
-            &RegionAllocationStrategy::Random(None),
+            &strategy,
         )
         .await
         .map_err(ActionError::action_failed)?;
diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs
index 81212571e25..8d34bd7a58a 100644
--- a/nexus/src/app/sagas/snapshot_create.rs
+++ b/nexus/src/app/sagas/snapshot_create.rs
@@ -107,7 +107,6 @@ use crate::{authn, authz, db};
 use anyhow::anyhow;
 use crucible_agent_client::{types::RegionId, Client as CrucibleAgentClient};
 use nexus_db_model::Generation;
-use nexus_db_queries::db::datastore::RegionAllocationStrategy;
 use omicron_common::api::external;
 use omicron_common::api::external::Error;
 use rand::{rngs::StdRng, RngCore, SeedableRng};
@@ -328,6 +327,8 @@ async fn ssc_alloc_regions(
         .await
         .map_err(ActionError::action_failed)?;
 
+    let strategy = &osagactx.nexus().default_region_allocation_strategy;
+
     let datasets_and_regions = osagactx
         .datastore()
         .region_allocate(
@@ -340,7 +341,7 @@ async fn ssc_alloc_regions(
                 .map_err(|e| ActionError::action_failed(e.to_string()))?,
             },
             external::ByteCount::from(disk.size),
-            &RegionAllocationStrategy::Random(None),
+            &strategy,
         )
         .await
         .map_err(ActionError::action_failed)?;
diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml
index 6eeacceaedd..1b1ae2c9129 100644
--- a/nexus/tests/config.test.toml
+++ b/nexus/tests/config.test.toml
@@ -89,3 +89,8 @@ dns_external.max_concurrent_server_updates = 5
 # certificates it will take _other_ Nexus instances to notice and stop serving
 # them (on a sunny day).
 external_endpoints.period_secs = 60
+
+[default_region_allocation_strategy]
+# we only have one sled in the test environment, so we need to use the
+# `Random` strategy, instead of `RandomWithDistinctSleds`
+type = "random"
\ No newline at end of file
diff --git a/smf/nexus/config-partial.toml b/smf/nexus/config-partial.toml
index b29727c4aa5..2dfee81d026 100644
--- a/smf/nexus/config-partial.toml
+++ b/smf/nexus/config-partial.toml
@@ -38,3 +38,8 @@ dns_external.max_concurrent_server_updates = 5
 # certificates it will take _other_ Nexus instances to notice and stop serving
 # them (on a sunny day).
 external_endpoints.period_secs = 60
+
+[default_region_allocation_strategy]
+# by default, allocate across 3 distinct sleds
+# seed is omitted so a new seed will be chosen with every allocation.
+type = "random_with_distinct_sleds"
\ No newline at end of file