From 10ab943e20f0ae15dfebf5dc48f8d8802465dcc1 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 13 Feb 2024 20:27:57 -0800 Subject: [PATCH 01/14] kinda working? --- nexus/db-macros/src/lib.rs | 4 +- nexus/db-model/src/dataset.rs | 1 + nexus/db-model/src/region.rs | 1 + nexus/db-queries/Cargo.toml | 1 + nexus/db-queries/src/db/datastore/region.rs | 23 ++- .../src/db/queries/region_allocation.rs | 194 ++++++++++++++++++ 6 files changed, 216 insertions(+), 8 deletions(-) diff --git a/nexus/db-macros/src/lib.rs b/nexus/db-macros/src/lib.rs index fd15b59128..30208412a1 100644 --- a/nexus/db-macros/src/lib.rs +++ b/nexus/db-macros/src/lib.rs @@ -280,7 +280,7 @@ fn build_resource_identity( let identity_name = format_ident!("{}Identity", struct_name); quote! { #[doc = #identity_doc] - #[derive(Clone, Debug, PartialEq, Eq, Selectable, Queryable, Insertable, serde::Serialize, serde::Deserialize)] + #[derive(Clone, Debug, PartialEq, Eq, Selectable, Queryable, QueryableByName, Insertable, serde::Serialize, serde::Deserialize)] #[diesel(table_name = #table_name) ] pub struct #identity_name { pub id: ::uuid::Uuid, @@ -322,7 +322,7 @@ fn build_asset_identity( let identity_name = format_ident!("{}Identity", struct_name); quote! { #[doc = #identity_doc] - #[derive(Clone, Debug, PartialEq, Selectable, Queryable, Insertable, serde::Serialize, serde::Deserialize)] + #[derive(Clone, Debug, PartialEq, Selectable, Queryable, QueryableByName, Insertable, serde::Serialize, serde::Deserialize)] #[diesel(table_name = #table_name) ] pub struct #identity_name { pub id: ::uuid::Uuid, diff --git a/nexus/db-model/src/dataset.rs b/nexus/db-model/src/dataset.rs index 65c0070509..0ea465bebc 100644 --- a/nexus/db-model/src/dataset.rs +++ b/nexus/db-model/src/dataset.rs @@ -18,6 +18,7 @@ use uuid::Uuid; /// available to a service on the Sled. #[derive( Queryable, + QueryableByName, Insertable, Debug, Clone, diff --git a/nexus/db-model/src/region.rs b/nexus/db-model/src/region.rs index fefc4f4fce..0ea6f82e65 100644 --- a/nexus/db-model/src/region.rs +++ b/nexus/db-model/src/region.rs @@ -15,6 +15,7 @@ use uuid::Uuid; /// allocated within a volume. #[derive( Queryable, + QueryableByName, Insertable, Debug, Clone, diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index 9cdcc88e6a..d979b01702 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -15,6 +15,7 @@ base64.workspace = true bb8.workspace = true camino.workspace = true chrono.workspace = true +const_format.workspace = true cookie.workspace = true diesel.workspace = true diesel-dtrace.workspace = true diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index b055a3e85c..24298c4ede 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -128,19 +128,30 @@ impl DataStore { let (blocks_per_extent, extent_count) = Self::get_crucible_allocation(&block_size, size); - let dataset_and_regions: Vec<(Dataset, Region)> = - crate::db::queries::region_allocation::RegionAllocate::new( + #[derive(diesel::QueryableByName, Queryable)] + struct DatasetAndRegion { + #[diesel(embed)] + dataset: Dataset, + #[diesel(embed)] + region: Region, + } + + let dataset_and_regions: Vec<_> = + crate::db::queries::region_allocation::allocation_query( volume_id, block_size.to_bytes() as u64, blocks_per_extent, extent_count, allocation_strategy, ) - .get_results_async(&*self.pool_connection_authorized(&opctx).await?) 
+            .get_results_async::<DatasetAndRegion>(
+                &*self.pool_connection_authorized(&opctx).await?,
+            )
             .await
-            .map_err(|e| {
-                crate::db::queries::region_allocation::from_diesel(e)
-            })?;
+            .map_err(|e| crate::db::queries::region_allocation::from_diesel(e))?
+            .into_iter()
+            .map(|DatasetAndRegion { dataset, region }| (dataset, region))
+            .collect::<Vec<_>>();

         Ok(dataset_and_regions)
     }
diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index 3c37bf6b2e..f00b8c4384 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -676,3 +676,197 @@ impl Query for RegionAllocate {
 }

 impl RunQueryDsl<DbConnection> for RegionAllocate {}
+
+struct BindParamCounter(i32);
+impl BindParamCounter {
+    fn new() -> Self {
+        Self(0)
+    }
+    fn next(&mut self) -> i32 {
+        self.0 += 1;
+        self.0
+    }
+}
+
+trait SqlQueryBinds {
+    fn add_bind(self, bind_counter: &mut BindParamCounter) -> Self;
+}
+
+impl<'a, Query> SqlQueryBinds
+    for diesel::query_builder::BoxedSqlQuery<'a, Pg, Query>
+{
+    fn add_bind(self, bind_counter: &mut BindParamCounter) -> Self {
+        self.sql("$").sql(bind_counter.next().to_string())
+    }
+}
+
+pub fn allocation_query(
+    volume_id: uuid::Uuid,
+    block_size: u64,
+    blocks_per_extent: u64,
+    extent_count: u64,
+    allocation_strategy: &RegionAllocationStrategy,
+) -> diesel::query_builder::BoxedSqlQuery<
+    'static,
+    Pg,
+    diesel::query_builder::SqlQuery,
+> {
+    let (seed, distinct_sleds) = {
+        let (input_seed, distinct_sleds) = match allocation_strategy {
+            RegionAllocationStrategy::Random { seed } => (seed, false),
+            RegionAllocationStrategy::RandomWithDistinctSleds { seed } => {
+                (seed, true)
+            }
+        };
+        (
+            input_seed.map_or_else(
+                || {
+                    std::time::SystemTime::now()
+                        .duration_since(std::time::UNIX_EPOCH)
+                        .unwrap()
+                        .as_nanos()
+                },
+                |seed| seed as u128,
+            ),
+            distinct_sleds,
+        )
+    };
+
+    let seed = seed.to_le_bytes().to_vec();
+
+    let size_delta = block_size * blocks_per_extent * extent_count;
+    let redunancy: i64 = i64::try_from(REGION_REDUNDANCY_THRESHOLD).unwrap();
+
+    let mut binds = BindParamCounter::new();
+    let query = diesel::sql_query(
+"WITH
+  old_regions AS
+    (SELECT region.id, region.time_created, region.time_modified, region.dataset_id, region.volume_id, region.block_size, region.blocks_per_extent, region.extent_count FROM region WHERE (region.volume_id = ").into_boxed().add_bind(&mut binds).sql(")),")
+    .bind::<sql_types::Uuid, _>(volume_id)
+    .sql(
+"
+  old_zpool_usage AS
+    (SELECT dataset.pool_id, sum(dataset.size_used) AS size_used FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id),
+");
+
+    // We pick one of these branches, depending on whether or not the sleds
+    // should be distinct.
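+    //
+    // As an illustration of the numbering (a hedged sketch, not part of the
+    // query itself): `add_bind` emits "$1", "$2", ... in call order, so the
+    // text produced so far is roughly
+    //
+    //   WITH old_regions AS (SELECT ... WHERE (region.volume_id = $1)), ...
+    //
+    // and the first `add_bind` in either branch below emits "$2". Each
+    // `.bind(...)` must then be chained in the same order its "$N" was
+    // generated, because Diesel supplies bind values positionally.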
+    if distinct_sleds {
+        query.sql("
+  candidate_zpools AS
+    (SELECT old_zpool_usage.pool_id FROM (old_zpool_usage INNER JOIN (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) ON (zpool.id = old_zpool_usage.pool_id)) WHERE (((old_zpool_usage.size_used + ").add_bind(&mut binds).sql(" ) <= total_size) AND (sled.provision_state = 'provisionable'))),
+")
+        .bind::<sql_types::BigInt, _>(size_delta as i64)
+    } else {
+        query.sql("
+  candidate_zpools AS
+    (SELECT DISTINCT ON (zpool.sled_id) old_zpool_usage.pool_id FROM (old_zpool_usage INNER JOIN (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) ON (zpool.id = old_zpool_usage.pool_id)) WHERE (((old_zpool_usage.size_used + ").add_bind(&mut binds).sql(" ) <= total_size) AND (sled.provision_state = 'provisionable')) ORDER BY zpool.sled_id, md5((CAST(zpool.id as BYTEA) || ").add_bind(&mut binds).sql("))),
+")
+        .bind::<sql_types::BigInt, _>(size_delta as i64)
+        .bind::<sql_types::Bytea, _>(seed.clone())
+    }
+    .sql(
+"
+  candidate_datasets AS
+    (SELECT DISTINCT ON (dataset.pool_id) dataset.id, dataset.pool_id FROM (dataset INNER JOIN candidate_zpools ON (dataset.pool_id = candidate_zpools.pool_id)) WHERE (((dataset.time_deleted IS NULL) AND (dataset.size_used IS NOT NULL)) AND (dataset.kind = 'crucible')) ORDER BY dataset.pool_id, md5((CAST(dataset.id as BYTEA) || ").add_bind(&mut binds).sql("))),
+"
+    ).bind::<sql_types::Bytea, _>(seed.clone())
+    .sql(
+"
+  shuffled_candidate_datasets AS
+    (SELECT candidate_datasets.id, candidate_datasets.pool_id FROM candidate_datasets ORDER BY md5((CAST(candidate_datasets.id as BYTEA) || ").add_bind(&mut binds).sql(")) LIMIT ").add_bind(&mut binds).sql("),
+"
+    ).bind::<sql_types::Bytea, _>(seed)
+    .bind::<sql_types::BigInt, _>(redunancy)
+    .sql(
+"
+  candidate_regions AS
+    (SELECT gen_random_uuid() AS id, now() AS time_created, now() AS time_modified, shuffled_candidate_datasets.id AS dataset_id, ").add_bind(&mut binds).sql(" AS volume_id, ").add_bind(&mut binds).sql(" AS block_size, ").add_bind(&mut binds).sql(" AS blocks_per_extent, ").add_bind(&mut binds).sql(" AS extent_count FROM shuffled_candidate_datasets),
+"
+    ).bind::<sql_types::Uuid, _>(volume_id)
+    .bind::<sql_types::BigInt, _>(block_size as i64)
+    .bind::<sql_types::BigInt, _>(blocks_per_extent as i64)
+    .bind::<sql_types::BigInt, _>(extent_count as i64)
+    .sql(
+"
+  proposed_dataset_changes AS
+    (SELECT candidate_regions.dataset_id AS id, dataset.pool_id AS pool_id, ((candidate_regions.block_size * candidate_regions.blocks_per_extent) * candidate_regions.extent_count) AS size_used_delta FROM (candidate_regions INNER JOIN dataset ON (dataset.id = candidate_regions.dataset_id))),
+  do_insert AS
+    (SELECT (((((SELECT COUNT(*) FROM old_regions LIMIT 1) < ").add_bind(&mut binds).sql(") AND CAST(IF(((SELECT COUNT(*) FROM candidate_zpools LIMIT 1) >= ").add_bind(&mut binds).sql("), 'TRUE', 'Not enough space') AS BOOL)) AND CAST(IF(((SELECT COUNT(*) FROM candidate_regions LIMIT 1) >= ").add_bind(&mut binds).sql("), 'TRUE', 'Not enough datasets') AS BOOL)) AND CAST(IF(((SELECT COUNT(DISTINCT dataset.pool_id) FROM (candidate_regions INNER JOIN dataset ON (candidate_regions.dataset_id = dataset.id)) LIMIT 1) >= ").add_bind(&mut binds).sql("), 'TRUE', 'Not enough unique zpools selected') AS BOOL)) AS insert),
+"
+    ).bind::<sql_types::BigInt, _>(redunancy)
+    .bind::<sql_types::BigInt, _>(redunancy)
+    .bind::<sql_types::BigInt, _>(redunancy)
+    .bind::<sql_types::BigInt, _>(redunancy)
+    .sql(
+"
+  inserted_regions AS
+    (INSERT INTO region (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count) SELECT candidate_regions.id, candidate_regions.time_created, candidate_regions.time_modified, candidate_regions.dataset_id, candidate_regions.volume_id, candidate_regions.block_size, candidate_regions.blocks_per_extent,
candidate_regions.extent_count FROM candidate_regions WHERE (SELECT do_insert.insert FROM do_insert LIMIT 1) RETURNING region.id, region.time_created, region.time_modified, region.dataset_id, region.volume_id, region.block_size, region.blocks_per_extent, region.extent_count), + updated_datasets AS + (UPDATE dataset SET size_used = (dataset.size_used + (SELECT proposed_dataset_changes.size_used_delta FROM proposed_dataset_changes WHERE (proposed_dataset_changes.id = dataset.id) LIMIT 1)) WHERE ((dataset.id = ANY(SELECT proposed_dataset_changes.id FROM proposed_dataset_changes)) AND (SELECT do_insert.insert FROM do_insert LIMIT 1)) RETURNING dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used) +(SELECT dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used, old_regions.id, old_regions.time_created, old_regions.time_modified, old_regions.dataset_id, old_regions.volume_id, old_regions.block_size, old_regions.blocks_per_extent, old_regions.extent_count FROM (old_regions INNER JOIN dataset ON (old_regions.dataset_id = dataset.id))) UNION (SELECT updated_datasets.id, updated_datasets.time_created, updated_datasets.time_modified, updated_datasets.time_deleted, updated_datasets.rcgen, updated_datasets.pool_id, updated_datasets.ip, updated_datasets.port, updated_datasets.kind, updated_datasets.size_used, inserted_regions.id, inserted_regions.time_created, inserted_regions.time_modified, inserted_regions.dataset_id, inserted_regions.volume_id, inserted_regions.block_size, inserted_regions.blocks_per_extent, inserted_regions.extent_count FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id)))" + ) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::db::explain::ExplainableAsync; + use omicron_test_utils::dev; + use nexus_test_utils::db::test_setup_database; + use uuid::Uuid; + + #[test] + fn raw_sql() { + let volume_id = Uuid::new_v4(); + let block_size = 0; + let blocks_per_extent = 1; + let extent_count = 2; + let allocation_strategy = + RegionAllocationStrategy::RandomWithDistinctSleds { seed: None }; + + let region_allocate = RegionAllocate::new( + volume_id, + block_size, + blocks_per_extent, + extent_count, + &allocation_strategy, + ); + + let query = diesel::debug_query::(®ion_allocate); + + assert_eq!(query.to_string(), "foobar"); + } + + #[tokio::test] + async fn explainable() { + let logctx = dev::test_setup_log("explainable"); + let log = logctx.log.new(o!()); + let mut db = test_setup_database(&log).await; + let cfg = crate::db::Config { url: db.pg_config().clone() }; + let pool = crate::db::Pool::new(&logctx.log, &cfg); + let conn = pool.pool().get().await.unwrap(); + + let volume_id = Uuid::new_v4(); + let block_size = 0; + let blocks_per_extent = 1; + let extent_count = 2; + let allocation_strategy = + RegionAllocationStrategy::RandomWithDistinctSleds { seed: None }; + let region_allocate = allocation_query( + volume_id, + block_size, + blocks_per_extent, + extent_count, + &allocation_strategy, + ); + + let _ = region_allocate + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} From 47bf04a3495f5ee615c5ae7a8547aa6b1aff733e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 13 Feb 2024 
23:33:05 -0800 Subject: [PATCH 02/14] Removed old query --- Cargo.lock | 21 + Cargo.toml | 1 + nexus/db-model/src/queries/mod.rs | 1 - .../db-model/src/queries/region_allocation.rs | 195 ---- nexus/db-queries/src/db/cast_uuid_as_bytea.rs | 62 -- nexus/db-queries/src/db/mod.rs | 1 - .../src/db/queries/region_allocation.rs | 880 ++++-------------- 7 files changed, 225 insertions(+), 936 deletions(-) delete mode 100644 nexus/db-model/src/queries/region_allocation.rs delete mode 100644 nexus/db-queries/src/db/cast_uuid_as_bytea.rs diff --git a/Cargo.lock b/Cargo.lock index caddb6d8ea..388799ff59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1090,6 +1090,26 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f" +[[package]] +name = "const_format" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a214c7af3d04997541b18d432afaff4c455e79e2029079647e72fc2bd27673" +dependencies = [ + "const_format_proc_macros", +] + +[[package]] +name = "const_format_proc_macros" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f6ff08fd20f4f299298a28e2dfa8a8ba1036e6cd2460ac1de7b425d76f2500" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + [[package]] name = "constant_time_eq" version = "0.2.6" @@ -4360,6 +4380,7 @@ dependencies = [ "camino", "camino-tempfile", "chrono", + "const_format", "cookie", "db-macros", "diesel", diff --git a/Cargo.toml b/Cargo.toml index d33758fc06..048ac8bc60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -174,6 +174,7 @@ ciborium = "0.2.2" cfg-if = "1.0" chrono = { version = "0.4", features = [ "serde" ] } clap = { version = "4.4", features = ["cargo", "derive", "env", "wrap_help"] } +const_format = "0.2.32" cookie = "0.18" criterion = { version = "0.5.1", features = [ "async_tokio" ] } crossbeam = "0.8" diff --git a/nexus/db-model/src/queries/mod.rs b/nexus/db-model/src/queries/mod.rs index 7724d48bab..e138508f84 100644 --- a/nexus/db-model/src/queries/mod.rs +++ b/nexus/db-model/src/queries/mod.rs @@ -4,5 +4,4 @@ //! Subqueries used in CTEs. -pub mod region_allocation; pub mod virtual_provisioning_collection_update; diff --git a/nexus/db-model/src/queries/region_allocation.rs b/nexus/db-model/src/queries/region_allocation.rs deleted file mode 100644 index a1b9e0373a..0000000000 --- a/nexus/db-model/src/queries/region_allocation.rs +++ /dev/null @@ -1,195 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Describes subqueries which may be issues as a part of CTEs. -//! -//! When possible, it's preferable to define subqueries close to their -//! usage. However, certain Diesel traits (such as those enabling joins) -//! require the table structures to be defined in the same crate. - -// TODO: We're currently piggy-backing on the table macro for convenience. -// We actually do not want to generate an entire table for each subquery - we'd -// like to have a query source (which we can use to generate SELECT statements, -// JOIN, etc), but we don't want this to be an INSERT/UPDATE/DELETE target. -// -// Similarly, we don't want to force callers to supply a "primary key". 
-// -// I've looked into Diesel's `alias!` macro for this purpose, but unfortunately -// that implementation is too opinionated about the output QueryFragment. -// It expects to use the form: -// -// " as ", which is actually the opposite of what we want in -// a CTE (where we want the alias name to come first). - -use crate::schema::dataset; -use crate::schema::sled; -use crate::schema::zpool; - -table! { - old_regions { - id -> Uuid, - time_created -> Timestamptz, - time_modified -> Timestamptz, - - dataset_id -> Uuid, - volume_id -> Uuid, - - block_size -> Int8, - blocks_per_extent -> Int8, - extent_count -> Int8, - } -} - -table! { - candidate_datasets { - id -> Uuid, - pool_id -> Uuid, - } -} - -table! { - shuffled_candidate_datasets { - id -> Uuid, - pool_id -> Uuid, - } -} - -table! { - candidate_regions { - id -> Uuid, - time_created -> Timestamptz, - time_modified -> Timestamptz, - - dataset_id -> Uuid, - volume_id -> Uuid, - - block_size -> Int8, - blocks_per_extent -> Int8, - extent_count -> Int8, - } -} - -table! { - proposed_dataset_changes { - id -> Uuid, - pool_id -> Uuid, - size_used_delta -> Int8, - } -} - -table! { - old_zpool_usage (pool_id) { - pool_id -> Uuid, - size_used -> Numeric, - } -} - -table! { - candidate_zpools (pool_id) { - pool_id -> Uuid - } -} - -table! { - do_insert (insert) { - insert -> Bool, - } -} - -table! { - one_zpool_per_sled (pool_id) { - pool_id -> Uuid - } -} - -table! { - one_dataset_per_zpool { - id -> Uuid, - pool_id -> Uuid - } -} - -table! { - inserted_regions { - id -> Uuid, - time_created -> Timestamptz, - time_modified -> Timestamptz, - - dataset_id -> Uuid, - volume_id -> Uuid, - - block_size -> Int8, - blocks_per_extent -> Int8, - extent_count -> Int8, - } -} - -table! { - updated_datasets (id) { - id -> Uuid, - time_created -> Timestamptz, - time_modified -> Timestamptz, - time_deleted -> Nullable, - rcgen -> Int8, - - pool_id -> Uuid, - - ip -> Inet, - port -> Int4, - - kind -> crate::DatasetKindEnum, - size_used -> Nullable, - } -} - -diesel::allow_tables_to_appear_in_same_query!( - proposed_dataset_changes, - dataset, -); - -diesel::allow_tables_to_appear_in_same_query!( - do_insert, - candidate_regions, - dataset, - zpool, -); - -diesel::allow_tables_to_appear_in_same_query!( - old_zpool_usage, - zpool, - sled, - proposed_dataset_changes, -); - -diesel::allow_tables_to_appear_in_same_query!(old_regions, dataset,); -diesel::allow_tables_to_appear_in_same_query!(old_regions, zpool,); - -diesel::allow_tables_to_appear_in_same_query!( - inserted_regions, - updated_datasets, -); - -diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, dataset,); -diesel::allow_tables_to_appear_in_same_query!(candidate_zpools, zpool,); -diesel::allow_tables_to_appear_in_same_query!(candidate_datasets, dataset); - -// == Needed for random region allocation == - -pub mod cockroach_md5 { - pub mod functions { - use diesel::sql_types::*; - diesel::sql_function!(fn md5(x: Bytea) -> Bytea); - } - - pub mod helper_types { - pub type Md5 = super::functions::md5::HelperType; - } - - pub mod dsl { - pub use super::functions::*; - pub use super::helper_types::*; - } -} - -// == End random region allocation dependencies == diff --git a/nexus/db-queries/src/db/cast_uuid_as_bytea.rs b/nexus/db-queries/src/db/cast_uuid_as_bytea.rs deleted file mode 100644 index c50c88971f..0000000000 --- a/nexus/db-queries/src/db/cast_uuid_as_bytea.rs +++ /dev/null @@ -1,62 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. 
If a copy of the MPL was not distributed with this
-// file, You can obtain one at https://mozilla.org/MPL/2.0/.
-
-//! Cast UUID to BYTES
-
-use diesel::expression::ValidGrouping;
-use diesel::pg::Pg;
-use diesel::query_builder::AstPass;
-use diesel::query_builder::QueryFragment;
-use diesel::query_builder::QueryId;
-use diesel::Expression;
-use diesel::SelectableExpression;
-
-/// Cast an expression which evaluates to a Uuid and cast it to a Bytea. It's
-/// that simple!
-#[derive(ValidGrouping, QueryId)]
-pub struct CastUuidToBytea<E> {
-    expression: E,
-}
-
-impl<E> CastUuidToBytea<E>
-where
-    E: Expression<SqlType = diesel::sql_types::Uuid>,
-{
-    pub const fn new(expression: E) -> Self {
-        Self { expression }
-    }
-}
-
-impl<E> Expression for CastUuidToBytea<E>
-where
-    E: Expression<SqlType = diesel::sql_types::Uuid>,
-{
-    type SqlType = diesel::sql_types::Bytea;
-}
-
-impl<E, QS> diesel::AppearsOnTable<QS> for CastUuidToBytea<E> where
-    E: diesel::AppearsOnTable<QS>
-{
-}
-
-impl<E, QS> SelectableExpression<QS> for CastUuidToBytea<E> where
-    E: SelectableExpression<QS>
-{
-}
-
-impl<E> QueryFragment<Pg> for CastUuidToBytea<E>
-where
-    E: QueryFragment<Pg>,
-{
-    fn walk_ast<'a>(
-        &'a self,
-        mut out: AstPass<'_, 'a, Pg>,
-    ) -> diesel::QueryResult<()> {
-        out.push_sql("CAST(");
-        self.expression.walk_ast(out.reborrow())?;
-        out.push_sql(" as BYTEA)");
-
-        Ok(())
-    }
-}
diff --git a/nexus/db-queries/src/db/mod.rs b/nexus/db-queries/src/db/mod.rs
index e21ba2e3a8..6cd3911efe 100644
--- a/nexus/db-queries/src/db/mod.rs
+++ b/nexus/db-queries/src/db/mod.rs
@@ -5,7 +5,6 @@
 //! Facilities for working with the Omicron database

 pub(crate) mod alias;
-pub(crate) mod cast_uuid_as_bytea;
 // This is not intended to be public, but this is necessary to use it from
 // doctests
 pub mod collection_attach;
diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index f00b8c4384..24ace6467e 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -4,31 +4,15 @@
 //! Implementation of queries for provisioning regions.
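 // The rewrite in this hunk replaces the typed CTE builder with a single raw
 // SQL string assembled by a small QueryBuilder: `sql()` appends only
 // `&'static str` fragments, `param()` emits the next numbered "$N"
 // placeholder, and `bind()` attaches the corresponding value.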
-use crate::db::alias::ExpressionAlias;
-use crate::db::cast_uuid_as_bytea::CastUuidToBytea;
 use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD;
-use crate::db::model::{Dataset, DatasetKind, Region};
-use crate::db::pool::DbConnection;
-use crate::db::subquery::{AsQuerySource, Cte, CteBuilder, CteQuery};
-use crate::db::true_or_cast_error::{matches_sentinel, TrueOrCastError};
-use db_macros::Subquery;
+use crate::db::true_or_cast_error::matches_sentinel;
+use const_format::concatcp;
 use diesel::pg::Pg;
-use diesel::query_builder::{AstPass, Query, QueryFragment, QueryId};
 use diesel::result::Error as DieselError;
-use diesel::PgBinaryExpressionMethods;
-use diesel::{
-    sql_types, BoolExpressionMethods, Column, CombineDsl, ExpressionMethods,
-    Insertable, IntoSql, JoinOnDsl, NullableExpressionMethods, QueryDsl,
-    RunQueryDsl,
-};
-use nexus_db_model::queries::region_allocation::{
-    candidate_datasets, candidate_regions, candidate_zpools, cockroach_md5,
-    do_insert, inserted_regions, old_regions, old_zpool_usage,
-    proposed_dataset_changes, shuffled_candidate_datasets, updated_datasets,
-};
-use nexus_db_model::schema;
+use diesel::sql_types;
 use omicron_common::api::external;
 use omicron_common::nexus_config::RegionAllocationStrategy;
+use std::cell::Cell;

 const NOT_ENOUGH_DATASETS_SENTINEL: &'static str = "Not enough datasets";
 const NOT_ENOUGH_ZPOOL_SPACE_SENTINEL: &'static str = "Not enough space";
@@ -74,632 +58,77 @@ pub fn from_diesel(e: DieselError) -> external::Error {
     error::public_error_from_diesel(e, error::ErrorHandler::Server)
 }

-/// A subquery to find all old regions associated with a particular volume.
-#[derive(Subquery, QueryId)]
-#[subquery(name = old_regions)]
-struct OldRegions {
-    query: Box<dyn CteQuery<SqlType = old_regions::SqlType>>,
-}
-
-impl OldRegions {
-    fn new(volume_id: uuid::Uuid) -> Self {
-        use crate::db::schema::region::dsl;
-        Self {
-            query: Box::new(dsl::region.filter(dsl::volume_id.eq(volume_id))),
-        }
-    }
-}
-
-/// A subquery to find datasets which could be used for provisioning regions.
-///
-/// We only consider datasets which are already allocated as "Crucible".
-/// This implicitly distinguishes between "M.2s" and "U.2s" -- Nexus needs to
-/// determine during dataset provisioning which devices should be considered for
-/// usage as Crucible storage.
-///
-/// We select only one dataset from each zpool.
-#[derive(Subquery, QueryId)]
-#[subquery(name = candidate_datasets)]
-struct CandidateDatasets {
-    query: Box<dyn CteQuery<SqlType = candidate_datasets::SqlType>>,
-}
-
-impl CandidateDatasets {
-    fn new(candidate_zpools: &CandidateZpools, seed: u128) -> Self {
-        use crate::db::schema::dataset::dsl as dataset_dsl;
-        use candidate_zpools::dsl as candidate_zpool_dsl;
-
-        let seed_bytes = seed.to_le_bytes();
-
-        let query: Box<dyn CteQuery<SqlType = candidate_datasets::SqlType>> =
-            Box::new(
-                dataset_dsl::dataset
-                    .inner_join(candidate_zpools.query_source().on(
-                        dataset_dsl::pool_id.eq(candidate_zpool_dsl::pool_id),
-                    ))
-                    .filter(dataset_dsl::time_deleted.is_null())
-                    .filter(dataset_dsl::size_used.is_not_null())
-                    .filter(dataset_dsl::kind.eq(DatasetKind::Crucible))
-                    .distinct_on(dataset_dsl::pool_id)
-                    .order_by((
-                        dataset_dsl::pool_id,
-                        cockroach_md5::dsl::md5(
-                            CastUuidToBytea::new(dataset_dsl::id)
-                                .concat(seed_bytes.to_vec()),
-                        ),
-                    ))
-                    .select((dataset_dsl::id, dataset_dsl::pool_id)),
-            );
-        Self { query }
+struct BindParamCounter(Cell<i32>);
+impl BindParamCounter {
+    fn new() -> Self {
+        Self(0.into())
     }
-}
+    fn next(&self) -> i32 {
+        self.0.set(self.0.get() + 1);
+        self.0.get()
+    }
+}

+trait SqlQueryBinds {
+    fn add_bind(self, bind_counter: &BindParamCounter) -> Self;
+}

+impl<'a, Query> SqlQueryBinds
+    for diesel::query_builder::BoxedSqlQuery<'a, Pg, Query>
+{
+    fn add_bind(self, bind_counter: &BindParamCounter) -> Self {
+        self.sql("$").sql(bind_counter.next().to_string())
+    }
+}

-/// Shuffle the candidate datasets, and select REGION_REDUNDANCY_THRESHOLD
-/// regions from it.
-#[derive(Subquery, QueryId)] -#[subquery(name = shuffled_candidate_datasets)] -struct ShuffledCandidateDatasets { - query: Box>, -} - -impl ShuffledCandidateDatasets { - fn new(candidate_datasets: &CandidateDatasets, seed: u128) -> Self { - use candidate_datasets::dsl as candidate_datasets_dsl; - - let seed_bytes = seed.to_le_bytes(); - - let query: Box> = - Box::new( - candidate_datasets - .query_source() - // We order by md5 to shuffle the ordering of the datasets. - // md5 has a uniform output distribution so it does the job. - .order(cockroach_md5::dsl::md5( - CastUuidToBytea::new(candidate_datasets_dsl::id) - .concat(seed_bytes.to_vec()), - )) - .select(( - candidate_datasets_dsl::id, - candidate_datasets_dsl::pool_id, - )) - .limit(REGION_REDUNDANCY_THRESHOLD.try_into().unwrap()), - ); - Self { query } + fn next(&self) -> i32 { + self.0.set(self.0.get() + 1); + self.0.get() } } -/// A subquery to create the regions-to-be-inserted for the volume. -#[derive(Subquery, QueryId)] -#[subquery(name = candidate_regions)] -struct CandidateRegions { - query: Box>, -} - -diesel::sql_function!(fn gen_random_uuid() -> Uuid); -diesel::sql_function!(fn now() -> Timestamptz); - -impl CandidateRegions { - fn new( - shuffled_candidate_datasets: &ShuffledCandidateDatasets, - volume_id: uuid::Uuid, - block_size: u64, - blocks_per_extent: u64, - extent_count: u64, - ) -> Self { - use schema::region; - use shuffled_candidate_datasets::dsl as shuffled_candidate_datasets_dsl; - - let volume_id = volume_id.into_sql::(); - let block_size = (block_size as i64).into_sql::(); - let blocks_per_extent = - (blocks_per_extent as i64).into_sql::(); - let extent_count = - (extent_count as i64).into_sql::(); - Self { - query: Box::new(shuffled_candidate_datasets.query_source().select( - ( - ExpressionAlias::new::(gen_random_uuid()), - ExpressionAlias::new::(now()), - ExpressionAlias::new::(now()), - ExpressionAlias::new::( - shuffled_candidate_datasets_dsl::id, - ), - ExpressionAlias::new::(volume_id), - ExpressionAlias::new::(block_size), - ExpressionAlias::new::( - blocks_per_extent, - ), - ExpressionAlias::new::(extent_count), - ), - )), - } - } -} - -/// A subquery which summarizes the changes we intend to make, showing: -/// -/// 1. Which datasets will have size adjustments -/// 2. Which pools those datasets belong to -/// 3. The delta in size-used -#[derive(Subquery, QueryId)] -#[subquery(name = proposed_dataset_changes)] -struct ProposedChanges { - query: Box>, +trait SqlQueryBinds { + fn add_bind(self, bind_counter: &BindParamCounter) -> Self; } -impl ProposedChanges { - fn new(candidate_regions: &CandidateRegions) -> Self { - use crate::db::schema::dataset::dsl as dataset_dsl; - use candidate_regions::dsl as candidate_regions_dsl; - Self { - query: Box::new( - candidate_regions.query_source() - .inner_join( - dataset_dsl::dataset.on(dataset_dsl::id.eq(candidate_regions_dsl::dataset_id)) - ) - .select(( - ExpressionAlias::new::(candidate_regions_dsl::dataset_id), - ExpressionAlias::new::(dataset_dsl::pool_id), - ExpressionAlias::new::( - candidate_regions_dsl::block_size * - candidate_regions_dsl::blocks_per_extent * - candidate_regions_dsl::extent_count - ), - )) - ), - } +impl<'a, Query> SqlQueryBinds + for diesel::query_builder::BoxedSqlQuery<'a, Pg, Query> +{ + fn add_bind(self, bind_counter: &BindParamCounter) -> Self { + self.sql("$").sql(bind_counter.next().to_string()) } } -/// A subquery which calculates the old size being used by zpools -/// under consideration as targets for region allocation. 
-#[derive(Subquery, QueryId)] -#[subquery(name = old_zpool_usage)] -struct OldPoolUsage { - query: Box>, +// A small wrapper around [diesel::query_builder::BoxedSqlQuery] which +// assists with counting bind parameters and recommends avoiding the usage of +// any non-static strings in query construction. +struct QueryBuilder { + query: diesel::query_builder::BoxedSqlQuery<'static, Pg, diesel::query_builder::SqlQuery>, + bind_counter: BindParamCounter, } -impl OldPoolUsage { +impl QueryBuilder { fn new() -> Self { - use crate::db::schema::dataset::dsl as dataset_dsl; Self { - query: Box::new( - dataset_dsl::dataset - .group_by(dataset_dsl::pool_id) - .filter(dataset_dsl::size_used.is_not_null()) - .filter(dataset_dsl::time_deleted.is_null()) - .select(( - dataset_dsl::pool_id, - ExpressionAlias::new::( - diesel::dsl::sum(dataset_dsl::size_used) - .assume_not_null(), - ), - )), - ), + query: diesel::sql_query("").into_boxed(), + bind_counter: BindParamCounter::new(), } } -} - -/// A subquery which identifies zpools with enough space for a region allocation. -#[derive(Subquery, QueryId)] -#[subquery(name = candidate_zpools)] -struct CandidateZpools { - query: Box>, -} - -impl CandidateZpools { - fn new( - old_zpool_usage: &OldPoolUsage, - zpool_size_delta: u64, - seed: u128, - distinct_sleds: bool, - ) -> Self { - use schema::sled::dsl as sled_dsl; - use schema::zpool::dsl as zpool_dsl; - - // Why are we using raw `diesel::dsl::sql` here? - // - // When SQL performs the "SUM" operation on "bigint" type, the result - // is promoted to "numeric" (see: old_zpool_usage::dsl::size_used). - // - // However, we'd like to compare that value with a different value - // (zpool_dsl::total_size) which is still a "bigint". This comparison - // is safe (after all, we basically want to promote "total_size" to a - // Numeric too) but Diesel demands that the input and output SQL types - // of expression methods like ".le" match exactly. - // - // For similar reasons, we use `diesel::dsl::sql` with zpool_size_delta. - // We would like to add it, but diesel only permits us to `to_sql()` it - // into a BigInt, not a Numeric. I welcome a better solution. - let it_will_fit = (old_zpool_usage::dsl::size_used - + diesel::dsl::sql(&zpool_size_delta.to_string())) - .le(diesel::dsl::sql(zpool_dsl::total_size::NAME)); - - // We need to join on the sled table to access provision_state. - let with_sled = sled_dsl::sled.on(zpool_dsl::sled_id.eq(sled_dsl::id)); - let with_zpool = zpool_dsl::zpool - .on(zpool_dsl::id.eq(old_zpool_usage::dsl::pool_id)) - .inner_join(with_sled); - - let sled_is_provisionable = sled_dsl::provision_state - .eq(crate::db::model::SledProvisionState::Provisionable); - - let base_query = old_zpool_usage - .query_source() - .inner_join(with_zpool) - .filter(it_will_fit) - .filter(sled_is_provisionable) - .select((old_zpool_usage::dsl::pool_id,)); - - let query = if distinct_sleds { - let seed_bytes = seed.to_le_bytes(); - - let query: Box> = - Box::new( - base_query - .order_by(( - zpool_dsl::sled_id, - cockroach_md5::dsl::md5( - CastUuidToBytea::new(zpool_dsl::id) - .concat(seed_bytes.to_vec()), - ), - )) - .distinct_on(zpool_dsl::sled_id), - ); - - query - } else { - let query: Box> = - Box::new(base_query); - - query - }; - - Self { query } - } -} - -diesel::sql_function! { - #[aggregate] - fn bool_and(b: sql_types::Bool) -> sql_types::Bool; -} - -/// A subquery which confirms whether or not the insertion and updates should -/// occur. 
-/// -/// This subquery additionally exits the CTE early with an error if either: -/// 1. Not enough datasets exist to provision regions with our required -/// redundancy, or -/// 2. Not enough space exists on zpools to perform the provisioning. -#[derive(Subquery, QueryId)] -#[subquery(name = do_insert)] -struct DoInsert { - query: Box>, -} - -impl DoInsert { - fn new( - old_regions: &OldRegions, - candidate_regions: &CandidateRegions, - candidate_zpools: &CandidateZpools, - ) -> Self { - let redundancy = REGION_REDUNDANCY_THRESHOLD as i64; - let not_allocated_yet = old_regions - .query_source() - .count() - .single_value() - .assume_not_null() - .lt(redundancy); - - let enough_candidate_zpools = candidate_zpools - .query_source() - .count() - .single_value() - .assume_not_null() - .ge(redundancy); - - let enough_candidate_regions = candidate_regions - .query_source() - .count() - .single_value() - .assume_not_null() - .ge(redundancy); - - // We want to ensure that we do not allocate on two datasets in the same - // zpool, for two reasons - // - Data redundancy: If a drive fails it should only take one of the 3 - // regions with it - // - Risk of overallocation: We only check that each zpool as enough - // room for one region, so we should not allocate more than one region - // to it. - // - // Selecting two datasets on the same zpool will not initially be - // possible, as at the time of writing each zpool only has one dataset. - // Additionally, we intend to modify the allocation strategy to select - // from 3 distinct sleds, removing the possibility entirely. But, if we - // introduce a change that adds another crucible dataset to zpools - // before we improve the allocation strategy, this check will make sure - // we don't violate drive redundancy, and generate an error instead. - use crate::db::schema::dataset::dsl as dataset_dsl; - use candidate_regions::dsl as candidate_dsl; - let enough_unique_candidate_zpools = candidate_regions - .query_source() - .inner_join( - dataset_dsl::dataset - .on(candidate_dsl::dataset_id.eq(dataset_dsl::id)), - ) - .select(diesel::dsl::count_distinct(dataset_dsl::pool_id)) - .single_value() - .assume_not_null() - .ge(redundancy); + // Identifies that a bind parameter should exist in this location within + // the SQL string. + fn param(self) -> Self { Self { - query: Box::new(diesel::select((ExpressionAlias::new::< - do_insert::insert, - >( - not_allocated_yet - .and(TrueOrCastError::new( - enough_candidate_zpools, - NOT_ENOUGH_ZPOOL_SPACE_SENTINEL, - )) - .and(TrueOrCastError::new( - enough_candidate_regions, - NOT_ENOUGH_DATASETS_SENTINEL, - )) - .and(TrueOrCastError::new( - enough_unique_candidate_zpools, - NOT_ENOUGH_UNIQUE_ZPOOLS_SENTINEL, - )), - ),))), + query: self.query.sql("$").sql(self.bind_counter.next().to_string()), + bind_counter: self.bind_counter, } } -} - -/// A subquery which actually inserts the regions. -#[derive(Subquery, QueryId)] -#[subquery(name = inserted_regions)] -struct InsertRegions { - query: Box>, -} - -impl InsertRegions { - fn new(do_insert: &DoInsert, candidate_regions: &CandidateRegions) -> Self { - use crate::db::schema::region; + // Slightly more strict than the "sql" method of Diesel's SqlQuery. + // Only permits "&'static str" intentionally to limit susceptibility to + // SQL injection. 
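+    //
+    // For instance, an attempt to splice a runtime-built string
+    // (hypothetical fragment, shown for illustration):
+    //
+    //   let table = format!("region_{}", suffix);
+    //   builder.sql(&table); // rejected: `&table` is not `&'static str`
+    //
+    // fails to compile, while runtime values still reach the query safely
+    // through `param()` and `bind()` as numbered placeholders.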
+    fn sql(self, s: &'static str) -> Self {
+        Self {
+            query: self.query.sql(s),
+            bind_counter: self.bind_counter,
+        }
+    }
+
+    fn bind<BindSt, Value>(self, b: Value) -> Self
+    where
+        Pg: sql_types::HasSqlType<BindSt>,
+        Value: diesel::serialize::ToSql<BindSt, Pg> + Send + 'static,
+        BindSt: Send + 'static
+    {
+        Self {
+            query: self.query.bind(b),
+            bind_counter: self.bind_counter,
+        }
+    }
+}

-/// A subquery which actually inserts the regions.
-#[derive(Subquery, QueryId)]
-#[subquery(name = inserted_regions)]
-struct InsertRegions {
-    query: Box<dyn CteQuery<SqlType = inserted_regions::SqlType>>,
-}
-
-impl InsertRegions {
-    fn new(do_insert: &DoInsert, candidate_regions: &CandidateRegions) -> Self {
-        use crate::db::schema::region;
-
-        Self {
-            query: Box::new(
-                candidate_regions
-                    .query_source()
-                    .select(candidate_regions::all_columns)
-                    .filter(
-                        do_insert
-                            .query_source()
-                            .select(do_insert::insert)
-                            .single_value()
-                            .assume_not_null(),
-                    )
-                    .insert_into(region::table)
-                    .returning(region::all_columns),
-            ),
-        }
-    }
-}
-
-/// A subquery which updates dataset size usage based on inserted regions.
-#[derive(Subquery, QueryId)]
-#[subquery(name = updated_datasets)]
-struct UpdateDatasets {
-    query: Box<dyn CteQuery<SqlType = updated_datasets::SqlType>>,
-}
-
-impl UpdateDatasets {
-    fn new(
-        do_insert: &DoInsert,
-        proposed_dataset_changes: &ProposedChanges,
-    ) -> Self {
-        use crate::db::schema::dataset::dsl as dataset_dsl;
-
-        let datasets_with_updates = proposed_dataset_changes
-            .query_source()
-            .select(proposed_dataset_changes::columns::id)
-            .into_boxed();
-
-        Self {
-            query: Box::new(
-                diesel::update(
-                    dataset_dsl::dataset.filter(
-                        dataset_dsl::id.eq_any(datasets_with_updates)
-                    )
-                )
-                .filter(
-                    do_insert.query_source()
-                        .select(do_insert::insert)
-                        .single_value()
-                        .assume_not_null()
-                )
-                .set(
-                    dataset_dsl::size_used.eq(
-                        dataset_dsl::size_used + proposed_dataset_changes.query_source()
-                            .filter(proposed_dataset_changes::columns::id.eq(dataset_dsl::id))
-                            .select(proposed_dataset_changes::columns::size_used_delta)
-                            .single_value()
-                    )
-                )
-                .returning(crate::db::schema::dataset::all_columns)
-            )
-        }
-    }
-}
-
-/// Constructs a CTE for allocating new regions, and updating the datasets to
-/// which those regions belong.
-#[derive(QueryId)]
-pub struct RegionAllocate {
-    cte: Cte,
-}
-
-impl RegionAllocate {
-    pub fn new(
-        volume_id: uuid::Uuid,
-        block_size: u64,
-        blocks_per_extent: u64,
-        extent_count: u64,
-        allocation_strategy: &RegionAllocationStrategy,
-    ) -> Self {
-        let (seed, distinct_sleds) = {
-            let (input_seed, distinct_sleds) = match allocation_strategy {
-                RegionAllocationStrategy::Random { seed } => (seed, false),
-                RegionAllocationStrategy::RandomWithDistinctSleds { seed } => {
-                    (seed, true)
-                }
-            };
-            (
-                input_seed.map_or_else(
-                    || {
-                        std::time::SystemTime::now()
-                            .duration_since(std::time::UNIX_EPOCH)
-                            .unwrap()
-                            .as_nanos()
-                    },
-                    |seed| seed as u128,
-                ),
-                distinct_sleds,
-            )
-        };
-
-        let size_delta = block_size * blocks_per_extent * extent_count;
-
-        let old_regions = OldRegions::new(volume_id);
-
-        let old_pool_usage = OldPoolUsage::new();
-        let candidate_zpools = CandidateZpools::new(
-            &old_pool_usage,
-            size_delta,
-            seed,
-            distinct_sleds,
-        );
-
-        let candidate_datasets =
-            CandidateDatasets::new(&candidate_zpools, seed);
-
-        let shuffled_candidate_datasets =
-            ShuffledCandidateDatasets::new(&candidate_datasets, seed);
-
-        let candidate_regions = CandidateRegions::new(
-            &shuffled_candidate_datasets,
-            volume_id,
-            block_size,
-            blocks_per_extent,
-            extent_count,
-        );
-        let proposed_changes = ProposedChanges::new(&candidate_regions);
-        let do_insert =
-            DoInsert::new(&old_regions, &candidate_regions, &candidate_zpools);
-        let insert_regions = InsertRegions::new(&do_insert, &candidate_regions);
-        let updated_datasets =
-            UpdateDatasets::new(&do_insert, &proposed_changes);
-
-        // Gather together all "(dataset, region)" rows for all regions which
-        // are allocated to the volume.
-        //
-        // This roughly translates to:
-        //
-        //   old_regions INNER JOIN old_datasets
-        //   UNION
-        //   new_regions INNER JOIN updated_datasets
-        //
-        // Note that we cannot simply JOIN the old + new regions, and query for
-        // their associated datasets: doing so would return the pre-UPDATE
-        // values of datasets that are updated by this CTE.
-        let final_select = Box::new(
-            old_regions
-                .query_source()
-                .inner_join(
-                    crate::db::schema::dataset::dsl::dataset
-                        .on(old_regions::dataset_id
-                            .eq(crate::db::schema::dataset::dsl::id)),
-                )
-                .select((
-                    crate::db::schema::dataset::all_columns,
-                    old_regions::all_columns,
-                ))
-                .union(
-                    insert_regions
-                        .query_source()
-                        .inner_join(
-                            updated_datasets::dsl::updated_datasets
-                                .on(inserted_regions::dataset_id
-                                    .eq(updated_datasets::id)),
-                        )
-                        .select((
-                            updated_datasets::all_columns,
-                            inserted_regions::all_columns,
-                        )),
-                ),
-        );
-
-        let cte = CteBuilder::new()
-            .add_subquery(old_regions)
-            .add_subquery(old_pool_usage)
-            .add_subquery(candidate_zpools)
-            .add_subquery(candidate_datasets)
-            .add_subquery(shuffled_candidate_datasets)
-            .add_subquery(candidate_regions)
-            .add_subquery(proposed_changes)
-            .add_subquery(do_insert)
-            .add_subquery(insert_regions)
-            .add_subquery(updated_datasets)
-            .build(final_select);
-
-        Self { cte }
-    }
-}
-
-impl QueryFragment<Pg> for RegionAllocate {
-    fn walk_ast<'a>(
-        &'a self,
-        mut out: AstPass<'_, 'a, Pg>,
-    ) -> diesel::QueryResult<()> {
-        out.unsafe_to_cache_prepared();
-
-        self.cte.walk_ast(out.reborrow())?;
-        Ok(())
-    }
-}
-
-type SelectableSql<T> = <
-    <T as diesel::Selectable<Pg>>::SelectExpression as diesel::Expression
->::SqlType;
-
-impl Query for RegionAllocate {
-    type SqlType = (SelectableSql<Dataset>, SelectableSql<Region>);
-}
-
-impl RunQueryDsl<DbConnection> for RegionAllocate {}
-
-struct BindParamCounter(i32);
-impl BindParamCounter {
-    fn new() -> Self {
-        Self(0)
-    }
-    fn next(&mut self) -> i32 {
-        self.0 += 1;
-        self.0
-    }
-}
-
-trait SqlQueryBinds {
-    fn add_bind(self, bind_counter: &mut BindParamCounter) -> Self;
-}
-
-impl<'a, Query> SqlQueryBinds
-    for diesel::query_builder::BoxedSqlQuery<'a, Pg, Query>
-{
-    fn add_bind(self, bind_counter: &mut BindParamCounter) -> Self {
-        self.sql("$").sql(bind_counter.next().to_string())
-    }
-}
-
 pub fn allocation_query(
     volume_id: uuid::Uuid,
     block_size: u64,
     blocks_per_extent: u64,
     extent_count: u64,
     allocation_strategy: &RegionAllocationStrategy,
@@ -737,107 +166,190 @@ pub fn allocation_query(
     let size_delta = block_size * blocks_per_extent * extent_count;
     let redunancy: i64 = i64::try_from(REGION_REDUNDANCY_THRESHOLD).unwrap();

-    let mut binds = BindParamCounter::new();
-    let query = diesel::sql_query(
+    let builder = QueryBuilder::new().sql(
+        // Find all old regions associated with a particular volume
 "WITH
   old_regions AS
-    (SELECT region.id, region.time_created, region.time_modified, region.dataset_id, region.volume_id, region.block_size, region.blocks_per_extent, region.extent_count FROM region WHERE (region.volume_id = ").into_boxed().add_bind(&mut binds).sql(")),")
+    (SELECT
+      region.id,
+      region.time_created,
+      region.time_modified,
+      region.dataset_id,
+      region.volume_id,
+      region.block_size,
+      region.blocks_per_extent,
+      region.extent_count
+    FROM region WHERE (region.volume_id = ").param().sql(")),")
     .bind::<sql_types::Uuid, _>(volume_id)
-    .sql(
-"
-  old_zpool_usage AS
-    (SELECT dataset.pool_id, sum(dataset.size_used) AS size_used FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id),
-");
-    // We pick one of these branches, depending on whether or not the sleds
-    // should be distinct.
+    // Calculates the old size being used by zpools under consideration as targets for region
+    // allocation.
+    .sql("
+  old_zpool_usage AS
+    (SELECT
+      dataset.pool_id,
+      sum(dataset.size_used) AS size_used
+    FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id),");
+
+    // Identifies zpools with enough space for region allocation.
+    //
+    // NOTE: This changes the format of the underlying SQL query, as it uses
+    // distinct bind parameters depending on the conditional branch.
     if distinct_sleds {
-        query.sql("
+        builder.sql("
   candidate_zpools AS
-    (SELECT old_zpool_usage.pool_id FROM (old_zpool_usage INNER JOIN (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) ON (zpool.id = old_zpool_usage.pool_id)) WHERE (((old_zpool_usage.size_used + ").add_bind(&mut binds).sql(" ) <= total_size) AND (sled.provision_state = 'provisionable'))),
-")
+    (SELECT
+      old_zpool_usage.pool_id
+    FROM (old_zpool_usage INNER JOIN (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) ON (zpool.id = old_zpool_usage.pool_id))
+    WHERE (((old_zpool_usage.size_used + ").param().sql(" ) <= total_size) AND (sled.provision_state = 'provisionable'))),")
        .bind::<sql_types::BigInt, _>(size_delta as i64)
     } else {
-        query.sql("
+        builder.sql("
   candidate_zpools AS
-    (SELECT DISTINCT ON (zpool.sled_id) old_zpool_usage.pool_id FROM (old_zpool_usage INNER JOIN (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) ON (zpool.id = old_zpool_usage.pool_id)) WHERE (((old_zpool_usage.size_used + ").add_bind(&mut binds).sql(" ) <= total_size) AND (sled.provision_state = 'provisionable')) ORDER BY zpool.sled_id, md5((CAST(zpool.id as BYTEA) || ").add_bind(&mut binds).sql("))),
-")
+    (SELECT DISTINCT ON (zpool.sled_id)
+      old_zpool_usage.pool_id
+    FROM (old_zpool_usage INNER JOIN (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) ON (zpool.id = old_zpool_usage.pool_id))
+    WHERE (((old_zpool_usage.size_used + ").param().sql(" ) <= total_size) AND (sled.provision_state = 'provisionable'))
+    ORDER BY zpool.sled_id, md5((CAST(zpool.id as BYTEA) || ").param().sql("))),")
        .bind::<sql_types::BigInt, _>(size_delta as i64)
        .bind::<sql_types::Bytea, _>(seed.clone())
     }
+    // Find datasets which could be used for provisioning regions.
+    //
+    // We only consider datasets which are already allocated as "Crucible".
+    // This implicitly distinguishes between "M.2s" and "U.2s" -- Nexus needs to
+    // determine during dataset provisioning which devices should be considered for
+    // usage as Crucible storage.
+    //
+    // We select only one dataset from each zpool.
     .sql(
 "
   candidate_datasets AS
-    (SELECT DISTINCT ON (dataset.pool_id) dataset.id, dataset.pool_id FROM (dataset INNER JOIN candidate_zpools ON (dataset.pool_id = candidate_zpools.pool_id)) WHERE (((dataset.time_deleted IS NULL) AND (dataset.size_used IS NOT NULL)) AND (dataset.kind = 'crucible')) ORDER BY dataset.pool_id, md5((CAST(dataset.id as BYTEA) || ").add_bind(&mut binds).sql("))),
-"
-    ).bind::<sql_types::Bytea, _>(seed.clone())
+    (SELECT DISTINCT ON (dataset.pool_id)
+      dataset.id,
+      dataset.pool_id
+    FROM (dataset INNER JOIN candidate_zpools ON (dataset.pool_id = candidate_zpools.pool_id))
+    WHERE (((dataset.time_deleted IS NULL) AND (dataset.size_used IS NOT NULL)) AND (dataset.kind = 'crucible'))
+    ORDER BY dataset.pool_id, md5((CAST(dataset.id as BYTEA) || ").param().sql("))),")
+    .bind::<sql_types::Bytea, _>(seed.clone())
+    // We order by md5 to shuffle the ordering of the datasets.
+    // md5 has a uniform output distribution so it does the job.
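+    //
+    // As a sketch of the technique: for a row with id X and per-query seed
+    // bytes S, the sort key is md5(CAST(X AS BYTEA) || S), so the resulting
+    // order is stable for a fixed seed (deterministic under test) yet
+    // effectively random across seeds.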
    .sql(
"
  shuffled_candidate_datasets AS
    (SELECT
      candidate_datasets.id,
      candidate_datasets.pool_id
    FROM candidate_datasets
    ORDER BY md5((CAST(candidate_datasets.id as BYTEA) || ").param().sql(")) LIMIT ").param().sql("),")
    .bind::<sql_types::Bytea, _>(seed)
    .bind::<sql_types::BigInt, _>(redunancy)
    // Create the regions-to-be-inserted for the volume.
    .sql(
"
  candidate_regions AS
    (SELECT
      gen_random_uuid() AS id,
      now() AS time_created,
      now() AS time_modified,
      shuffled_candidate_datasets.id AS dataset_id,
      ").param().sql(" AS volume_id,
      ").param().sql(" AS block_size,
      ").param().sql(" AS blocks_per_extent,
      ").param().sql(" AS extent_count
    FROM shuffled_candidate_datasets),")
    .bind::<sql_types::Uuid, _>(volume_id)
    .bind::<sql_types::BigInt, _>(block_size as i64)
    .bind::<sql_types::BigInt, _>(blocks_per_extent as i64)
    .bind::<sql_types::BigInt, _>(extent_count as i64)
    // A subquery which summarizes the changes we intend to make, showing:
    //
    // 1. Which datasets will have size adjustments
    // 2. Which pools those datasets belong to
    // 3. The delta in size-used
    .sql(
"
  proposed_dataset_changes AS
    (SELECT
      candidate_regions.dataset_id AS id,
      dataset.pool_id AS pool_id,
      ((candidate_regions.block_size * candidate_regions.blocks_per_extent) * candidate_regions.extent_count) AS size_used_delta
    FROM (candidate_regions INNER JOIN dataset ON (dataset.id = candidate_regions.dataset_id))),")
    // Confirms whether or not the insertion and updates should
    // occur.
    //
    // This subquery additionally exits the CTE early with an error if either:
    // 1. Not enough datasets exist to provision regions with our required
    //    redundancy, or
    // 2. Not enough space exists on zpools to perform the provisioning.
    //
    // We want to ensure that we do not allocate on two datasets in the same
    // zpool, for two reasons
    // - Data redundancy: If a drive fails it should only take one of the 3
    //   regions with it
    // - Risk of overallocation: We only check that each zpool has enough
    //   room for one region, so we should not allocate more than one region
    //   to it.
    //
    // Selecting two datasets on the same zpool will not initially be
    // possible, as at the time of writing each zpool only has one dataset.
    // Additionally, we intend to modify the allocation strategy to select
    // from 3 distinct sleds, removing the possibility entirely. But, if we
    // introduce a change that adds another crucible dataset to zpools
    // before we improve the allocation strategy, this check will make sure
    // we don't violate drive redundancy, and generate an error instead.
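    //
    // The sentinel mechanics, roughly: CAST(IF(<condition>, 'TRUE',
    // '<sentinel>') AS BOOL) evaluates cleanly when the condition holds, and
    // otherwise forces the database to cast a non-boolean string, failing the
    // statement with an error carrying the sentinel text. `from_diesel` then
    // recognizes that text via `matches_sentinel` and maps it back to a
    // specific external error instead of a generic server error.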
+    .sql("
  do_insert AS
    (SELECT
      (((((SELECT COUNT(*) FROM old_regions LIMIT 1) < ").param().sql(") AND CAST(IF(((SELECT COUNT(*) FROM candidate_zpools LIMIT 1) >= ").param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_ZPOOL_SPACE_SENTINEL, "') AS BOOL)) AND CAST(IF(((SELECT COUNT(*) FROM candidate_regions LIMIT 1) >= ")).param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_DATASETS_SENTINEL, "') AS BOOL)) AND CAST(IF(((SELECT COUNT(DISTINCT dataset.pool_id) FROM (candidate_regions INNER JOIN dataset ON (candidate_regions.dataset_id = dataset.id)) LIMIT 1) >= ")).param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_UNIQUE_ZPOOLS_SENTINEL, "') AS BOOL)) AS insert),"))
    .bind::<sql_types::BigInt, _>(redunancy)
    .bind::<sql_types::BigInt, _>(redunancy)
    .bind::<sql_types::BigInt, _>(redunancy)
    .bind::<sql_types::BigInt, _>(redunancy)
    .sql(
"
  inserted_regions AS
    (INSERT INTO region
      (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count)
    SELECT
      candidate_regions.id,
      candidate_regions.time_created,
      candidate_regions.time_modified,
      candidate_regions.dataset_id,
      candidate_regions.volume_id,
      candidate_regions.block_size,
      candidate_regions.blocks_per_extent,
      candidate_regions.extent_count
    FROM candidate_regions
    WHERE
      (SELECT do_insert.insert FROM do_insert LIMIT 1)
    RETURNING
      region.id,
      region.time_created,
      region.time_modified,
      region.dataset_id,
      region.volume_id,
      region.block_size,
      region.blocks_per_extent,
      region.extent_count
    ),
  updated_datasets AS
    (UPDATE dataset SET size_used = (dataset.size_used + (SELECT proposed_dataset_changes.size_used_delta FROM proposed_dataset_changes WHERE (proposed_dataset_changes.id = dataset.id) LIMIT 1)) WHERE ((dataset.id = ANY(SELECT proposed_dataset_changes.id FROM proposed_dataset_changes)) AND (SELECT do_insert.insert FROM do_insert LIMIT 1)) RETURNING dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used)
(SELECT dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used, old_regions.id, old_regions.time_created, old_regions.time_modified, old_regions.dataset_id, old_regions.volume_id,
old_regions.block_size, old_regions.blocks_per_extent, old_regions.extent_count FROM (old_regions INNER JOIN dataset ON (old_regions.dataset_id = dataset.id))) UNION (SELECT updated_datasets.id, updated_datasets.time_created, updated_datasets.time_modified, updated_datasets.time_deleted, updated_datasets.rcgen, updated_datasets.pool_id, updated_datasets.ip, updated_datasets.port, updated_datasets.kind, updated_datasets.size_used, inserted_regions.id, inserted_regions.time_created, inserted_regions.time_modified, inserted_regions.dataset_id, inserted_regions.volume_id, inserted_regions.block_size, inserted_regions.blocks_per_extent, inserted_regions.extent_count FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id)))" - ) + ).query } #[cfg(test)] mod test { use super::*; use crate::db::explain::ExplainableAsync; - use omicron_test_utils::dev; use nexus_test_utils::db::test_setup_database; + use omicron_test_utils::dev; use uuid::Uuid; - #[test] - fn raw_sql() { - let volume_id = Uuid::new_v4(); - let block_size = 0; - let blocks_per_extent = 1; - let extent_count = 2; - let allocation_strategy = - RegionAllocationStrategy::RandomWithDistinctSleds { seed: None }; - - let region_allocate = RegionAllocate::new( - volume_id, - block_size, - blocks_per_extent, - extent_count, - &allocation_strategy, - ); - - let query = diesel::debug_query::(®ion_allocate); - - assert_eq!(query.to_string(), "foobar"); - } - + // Explain the possible forms of the SQL query to ensure that it + // creates a valid SQL string. #[tokio::test] async fn explainable() { let logctx = dev::test_setup_log("explainable"); @@ -848,19 +360,33 @@ mod test { let conn = pool.pool().get().await.unwrap(); let volume_id = Uuid::new_v4(); - let block_size = 0; - let blocks_per_extent = 1; - let extent_count = 2; - let allocation_strategy = - RegionAllocationStrategy::RandomWithDistinctSleds { seed: None }; + let block_size = 512; + let blocks_per_extent = 4; + let extent_count = 8; + + // First structure: Explain the query with "RandomWithDistinctSleds" + let region_allocate = allocation_query( volume_id, block_size, blocks_per_extent, extent_count, - &allocation_strategy, + &RegionAllocationStrategy::RandomWithDistinctSleds { seed: None }, ); + let _ = region_allocate + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + // Second structure: Explain the query with "Random" + let region_allocate = allocation_query( + volume_id, + block_size, + blocks_per_extent, + extent_count, + &RegionAllocationStrategy::Random { seed: None }, + ); let _ = region_allocate .explain_async(&conn) .await From 1c74b04ba3f8b1e2fde652ae8a8379b60bc0883d Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 13 Feb 2024 23:38:08 -0800 Subject: [PATCH 03/14] better formatting --- .../src/db/queries/region_allocation.rs | 61 ++++++++++++++++++- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index 24ace6467e..8dbc23c712 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -335,8 +335,65 @@ pub fn allocation_query( region.extent_count ), updated_datasets AS - (UPDATE dataset SET size_used = (dataset.size_used + (SELECT proposed_dataset_changes.size_used_delta FROM proposed_dataset_changes WHERE (proposed_dataset_changes.id = dataset.id) LIMIT 1)) WHERE ((dataset.id = ANY(SELECT 
proposed_dataset_changes.id FROM proposed_dataset_changes)) AND (SELECT do_insert.insert FROM do_insert LIMIT 1)) RETURNING dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used) -(SELECT dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used, old_regions.id, old_regions.time_created, old_regions.time_modified, old_regions.dataset_id, old_regions.volume_id, old_regions.block_size, old_regions.blocks_per_extent, old_regions.extent_count FROM (old_regions INNER JOIN dataset ON (old_regions.dataset_id = dataset.id))) UNION (SELECT updated_datasets.id, updated_datasets.time_created, updated_datasets.time_modified, updated_datasets.time_deleted, updated_datasets.rcgen, updated_datasets.pool_id, updated_datasets.ip, updated_datasets.port, updated_datasets.kind, updated_datasets.size_used, inserted_regions.id, inserted_regions.time_created, inserted_regions.time_modified, inserted_regions.dataset_id, inserted_regions.volume_id, inserted_regions.block_size, inserted_regions.blocks_per_extent, inserted_regions.extent_count FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id)))" + (UPDATE dataset SET + size_used = (dataset.size_used + (SELECT proposed_dataset_changes.size_used_delta FROM proposed_dataset_changes WHERE (proposed_dataset_changes.id = dataset.id) LIMIT 1)) + WHERE ( + (dataset.id = ANY(SELECT proposed_dataset_changes.id FROM proposed_dataset_changes)) AND + (SELECT do_insert.insert FROM do_insert LIMIT 1)) + RETURNING + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used + ) +(SELECT + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used, + old_regions.id, + old_regions.time_created, + old_regions.time_modified, + old_regions.dataset_id, + old_regions.volume_id, + old_regions.block_size, + old_regions.blocks_per_extent, + old_regions.extent_count +FROM + (old_regions INNER JOIN dataset ON (old_regions.dataset_id = dataset.id)) +) UNION +(SELECT + updated_datasets.id, + updated_datasets.time_created, + updated_datasets.time_modified, + updated_datasets.time_deleted, + updated_datasets.rcgen, + updated_datasets.pool_id, + updated_datasets.ip, + updated_datasets.port, + updated_datasets.kind, + updated_datasets.size_used, + inserted_regions.id, + inserted_regions.time_created, + inserted_regions.time_modified, + inserted_regions.dataset_id, + inserted_regions.volume_id, + inserted_regions.block_size, + inserted_regions.blocks_per_extent, + inserted_regions.extent_count +FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id)))" ).query } From 25820cc26882ce18d3ae92605e1ec5465d117d8f Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 14 Feb 2024 00:04:31 -0800 Subject: [PATCH 04/14] Fix tests --- .../src/db/queries/region_allocation.rs | 113 +++++++++++------- 1 file changed, 69 insertions(+), 44 deletions(-) diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index 8dbc23c712..d953e05b0e 100644 --- 
a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -85,7 +85,11 @@ impl<'a, Query> SqlQueryBinds
 // assists with counting bind parameters and recommends avoiding the usage of
 // any non-static strings in query construction.
 struct QueryBuilder {
-    query: diesel::query_builder::BoxedSqlQuery<'static, Pg, diesel::query_builder::SqlQuery>,
+    query: diesel::query_builder::BoxedSqlQuery<
+        'static,
+        Pg,
+        diesel::query_builder::SqlQuery,
+    >,
     bind_counter: BindParamCounter,
 }
 
@@ -101,7 +105,10 @@ impl QueryBuilder {
     // the SQL string.
     fn param(self) -> Self {
         Self {
-            query: self.query.sql("$").sql(self.bind_counter.next().to_string()),
+            query: self
+                .query
+                .sql("$")
+                .sql(self.bind_counter.next().to_string()),
             bind_counter: self.bind_counter,
         }
     }
@@ -110,22 +117,16 @@ impl QueryBuilder {
     // Only permits "&'static str" intentionally to limit susceptibility to
     // SQL injection.
     fn sql(self, s: &'static str) -> Self {
-        Self {
-            query: self.query.sql(s),
-            bind_counter: self.bind_counter,
-        }
+        Self { query: self.query.sql(s), bind_counter: self.bind_counter }
     }
 
     fn bind<BindSt, Value>(self, b: Value) -> Self
     where
         Pg: sql_types::HasSqlType<BindSt>,
         Value: diesel::serialize::ToSql<BindSt, Pg> + Send + 'static,
-        BindSt: Send + 'static
+        BindSt: Send + 'static,
     {
-        Self {
-            query: self.query.bind(b),
-            bind_counter: self.bind_counter,
-        }
+        Self { query: self.query.bind(b), bind_counter: self.bind_counter }
     }
 }
 
@@ -166,12 +167,11 @@ pub fn allocation_query(
     let size_delta = block_size * blocks_per_extent * extent_count;
     let redunancy: i64 = i64::try_from(REGION_REDUNDANCY_THRESHOLD).unwrap();
 
-
     let builder = QueryBuilder::new().sql(
 // Find all old regions associated with a particular volume
 "WITH
-  old_regions AS
-    (SELECT
+  old_regions AS (
+    SELECT
       region.id,
       region.time_created,
       region.time_modified,
@@ -186,8 +186,8 @@ pub fn allocation_query(
 // Calculates the old size being used by zpools under consideration as targets for region
 // allocation.
     .sql("
-  old_zpool_usage AS
-    (SELECT
+  old_zpool_usage AS (
+    SELECT
       dataset.pool_id,
       sum(dataset.size_used) AS size_used
     FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id),");
 
@@ -198,22 +198,40 @@ pub fn allocation_query(
     // distinct bind parameters depending on the conditional branch.
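    // (Illustrative aside, not part of the original patch: placeholders are
    // numbered in the order `param()` is called, and the binds are attached
    // in that same order, so a hypothetical two-parameter fragment such as
    //
    //     QueryBuilder::new()
    //         .sql("SELECT ").param().sql(" + ").param()
    //         .bind::<sql_types::BigInt, _>(1)
    //         .bind::<sql_types::BigInt, _>(2)
    //
    // renders as "SELECT $1 + $2". This is why each branch below must call
    // `param()` and `bind()` in lockstep.)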
if distinct_sleds {
        builder.sql("
-  candidate_zpools AS
-    (SELECT
+  candidate_zpools AS (
+    SELECT DISTINCT ON (zpool.sled_id)
       old_zpool_usage.pool_id
-    FROM (old_zpool_usage INNER JOIN (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) ON (zpool.id = old_zpool_usage.pool_id))
-    WHERE (((old_zpool_usage.size_used + ").param().sql(" ) <= total_size) AND (sled.provision_state = 'provisionable'))),")
+    FROM (
+        old_zpool_usage
+        INNER JOIN
+        (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) ON (zpool.id = old_zpool_usage.pool_id)
+    )
+    WHERE (
+      ((old_zpool_usage.size_used + ").param().sql(" ) <= total_size)
+      AND
+      (sled.provision_state = 'provisionable')
+    )
+    ORDER BY zpool.sled_id, md5((CAST(zpool.id as BYTEA) || ").param().sql("))),")
        .bind::<sql_types::BigInt, _>(size_delta as i64)
+        .bind::<sql_types::Bytea, _>(seed.clone())
    } else {
        builder.sql("
-  candidate_zpools AS
-    (SELECT DISTINCT ON (zpool.sled_id)
+  candidate_zpools AS (
+    SELECT
       old_zpool_usage.pool_id
-    FROM (old_zpool_usage INNER JOIN (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) ON (zpool.id = old_zpool_usage.pool_id))
-    WHERE (((old_zpool_usage.size_used + ").param().sql(" ) <= total_size) AND (sled.provision_state = 'provisionable'))
-    ORDER BY zpool.sled_id, md5((CAST(zpool.id as BYTEA) || ").param().sql("))),")
+    FROM (
+        old_zpool_usage
+        INNER JOIN
+        (zpool INNER JOIN sled ON (zpool.sled_id = sled.id))
+        ON (zpool.id = old_zpool_usage.pool_id)
+    )
+    WHERE (
+      ((old_zpool_usage.size_used + ").param().sql(" ) <= total_size)
+      AND
+      (sled.provision_state = 'provisionable')
+    )
+    ),")
        .bind::<sql_types::BigInt, _>(size_delta as i64)
-        .bind::<sql_types::Bytea, _>(seed.clone())
    }
    // Find datasets which could be used for provisioning regions.
    //
@@ -225,20 +243,24 @@ pub fn allocation_query(
    // We select only one dataset from each zpool.
    .sql(
"
-  candidate_datasets AS
-    (SELECT DISTINCT ON (dataset.pool_id)
+  candidate_datasets AS (
+    SELECT DISTINCT ON (dataset.pool_id)
       dataset.id,
       dataset.pool_id
     FROM (dataset INNER JOIN candidate_zpools ON (dataset.pool_id = candidate_zpools.pool_id))
-    WHERE (((dataset.time_deleted IS NULL) AND (dataset.size_used IS NOT NULL)) AND (dataset.kind = 'crucible'))
+    WHERE (
+      ((dataset.time_deleted IS NULL) AND
+      (dataset.size_used IS NOT NULL)) AND
+      (dataset.kind = 'crucible')
+    )
     ORDER BY dataset.pool_id, md5((CAST(dataset.id as BYTEA) || ").param().sql("))),")
    .bind::<sql_types::Bytea, _>(seed.clone())
    // We order by md5 to shuffle the ordering of the datasets.
    // md5 has a uniform output distribution so it does the job.
    .sql(
"
-  shuffled_candidate_datasets AS
-    (SELECT
+  shuffled_candidate_datasets AS (
+    SELECT
       candidate_datasets.id,
       candidate_datasets.pool_id
     FROM candidate_datasets
@@ -248,8 +270,8 @@ pub fn allocation_query(
    // Create the regions-to-be-inserted for the volume.
    .sql(
"
-  candidate_regions AS
-    (SELECT
+  candidate_regions AS (
+    SELECT
       gen_random_uuid() AS id,
       now() AS time_created,
       now() AS time_modified,
@@ -268,10 +290,9 @@ pub fn allocation_query(
    // 1. Which datasets will have size adjustments
    // 2. Which pools those datasets belong to
    // 3. The delta in size-used
-    .sql(
-"
-  proposed_dataset_changes AS
-    (SELECT
+    .sql("
+  proposed_dataset_changes AS (
+    SELECT
       candidate_regions.dataset_id AS id,
       dataset.pool_id AS pool_id,
       ((candidate_regions.block_size * candidate_regions.blocks_per_extent) * candidate_regions.extent_count) AS size_used_delta
@@ -300,17 +321,21 @@ pub fn allocation_query(
    // before we improve the allocation strategy, this check will make sure
    // we don't violate drive redundancy, and generate an error instead.
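    // (Illustrative aside, not part of the original patch: the guard below
    // leans on the "CAST(IF(<condition>, 'TRUE', '<sentinel>') AS BOOL)"
    // trick. When the condition is false, casting the sentinel string to
    // BOOL fails, and the database error carries the sentinel text; a
    // hypothetical session would look like:
    //
    //     > SELECT CAST(IF(2 >= 3, 'TRUE', 'Not enough datasets') AS BOOL);
    //     ERROR: could not parse "Not enough datasets" as type bool
    //
    // `from_diesel` then recognizes these sentinels via `matches_sentinel`
    // and maps them to user-facing allocation errors.)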
.sql(" - do_insert AS - (SELECT - (((((SELECT COUNT(*) FROM old_regions LIMIT 1) < ").param().sql(") AND CAST(IF(((SELECT COUNT(*) FROM candidate_zpools LIMIT 1) >= ").param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_ZPOOL_SPACE_SENTINEL, "') AS BOOL)) AND CAST(IF(((SELECT COUNT(*) FROM candidate_regions LIMIT 1) >= ")).param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_DATASETS_SENTINEL, "') AS BOOL)) AND CAST(IF(((SELECT COUNT(DISTINCT dataset.pool_id) FROM (candidate_regions INNER JOIN dataset ON (candidate_regions.dataset_id = dataset.id)) LIMIT 1) >= ")).param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_UNIQUE_ZPOOLS_SENTINEL, "') AS BOOL)) AS insert),")) + do_insert AS ( + SELECT ((( + ((SELECT COUNT(*) FROM old_regions LIMIT 1) < ").param().sql(") AND + CAST(IF(((SELECT COUNT(*) FROM candidate_zpools LIMIT 1) >= ").param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_ZPOOL_SPACE_SENTINEL, "') AS BOOL)) AND + CAST(IF(((SELECT COUNT(*) FROM candidate_regions LIMIT 1) >= ")).param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_DATASETS_SENTINEL, "') AS BOOL)) AND + CAST(IF(((SELECT COUNT(DISTINCT dataset.pool_id) FROM (candidate_regions INNER JOIN dataset ON (candidate_regions.dataset_id = dataset.id)) LIMIT 1) >= ")).param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_UNIQUE_ZPOOLS_SENTINEL, "') AS BOOL) + ) AS insert),")) .bind::(redunancy) .bind::(redunancy) .bind::(redunancy) .bind::(redunancy) .sql( " - inserted_regions AS - (INSERT INTO region + inserted_regions AS ( + INSERT INTO region (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count) SELECT candidate_regions.id, @@ -334,8 +359,8 @@ pub fn allocation_query( region.blocks_per_extent, region.extent_count ), - updated_datasets AS - (UPDATE dataset SET + updated_datasets AS ( + UPDATE dataset SET size_used = (dataset.size_used + (SELECT proposed_dataset_changes.size_used_delta FROM proposed_dataset_changes WHERE (proposed_dataset_changes.id = dataset.id) LIMIT 1)) WHERE ( (dataset.id = ANY(SELECT proposed_dataset_changes.id FROM proposed_dataset_changes)) AND From cd9b5480a02e2eb226f2ed3427866d437c128f2d Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 14 Feb 2024 01:59:41 -0800 Subject: [PATCH 05/14] Support for 'allcolumns' --- nexus/db-queries/src/db/column_walker.rs | 27 ++- nexus/db-queries/src/db/mod.rs | 1 + .../src/db/queries/region_allocation.rs | 212 ++++-------------- nexus/db-queries/src/db/raw_query_builder.rs | 151 +++++++++++++ 4 files changed, 221 insertions(+), 170 deletions(-) create mode 100644 nexus/db-queries/src/db/raw_query_builder.rs diff --git a/nexus/db-queries/src/db/column_walker.rs b/nexus/db-queries/src/db/column_walker.rs index 64c3b450c8..6e21e0d899 100644 --- a/nexus/db-queries/src/db/column_walker.rs +++ b/nexus/db-queries/src/db/column_walker.rs @@ -4,6 +4,7 @@ //! CTE utility for iterating over all columns in a table. +use crate::db::raw_query_builder::TrustedStr; use diesel::prelude::*; use std::marker::PhantomData; @@ -17,14 +18,30 @@ pub(crate) struct ColumnWalker { remaining: PhantomData, } +pub type AllColumnsOf = ColumnWalker<::AllColumns>; + impl ColumnWalker { - pub fn new() -> Self { + pub const fn new() -> Self { Self { remaining: PhantomData } } } macro_rules! 
impl_column_walker {
     ( $len:literal $($column:ident)+ ) => (
+        #[allow(dead_code)]
+        impl<$($column: Column),+> ColumnWalker<($($column,)+)> {
+            pub fn with_prefix(prefix: &'static str) -> TrustedStr {
+                // This string is derived from:
+                // - The "table" type, with associated columns, which
+                // are not controlled by an arbitrary user, and
+                // - The "prefix" type, which is a "&'static str" (AKA,
+                // hopefully known at compile-time, and not leaked).
+                TrustedStr::this_string_wont_cause_sql_injections(
+                    [$([prefix, $column::NAME].join("."),)+].join(", ")
+                )
+            }
+        }
+
         impl<$($column: Column),+> IntoIterator for ColumnWalker<($($column,)+)> {
             type Item = &'static str;
             type IntoIter = std::array::IntoIter<Self::Item, $len>;
@@ -109,4 +126,12 @@ mod test {
         assert_eq!(iter.next(), Some("value"));
         assert_eq!(iter.next(), None);
     }
+
+    #[test]
+    fn test_all_columns_with_prefix() {
+        assert_eq!(
+            AllColumnsOf::<my_table::table>::with_prefix("foo").as_str(),
+            "foo.id, foo.value, foo.time_deleted"
+        );
+    }
 }
diff --git a/nexus/db-queries/src/db/mod.rs b/nexus/db-queries/src/db/mod.rs
index 6cd3911efe..86778fa2ec 100644
--- a/nexus/db-queries/src/db/mod.rs
+++ b/nexus/db-queries/src/db/mod.rs
@@ -27,6 +27,7 @@ mod pool_connection;
 // This is marked public because the error types are used elsewhere, e.g., in
 // sagas.
 pub mod queries;
+mod raw_query_builder;
 mod saga_recovery;
 mod sec_store;
 pub mod subquery;
diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index d953e05b0e..dd7431d723 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -4,7 +4,10 @@
 
 //! Implementation of queries for provisioning regions.
 
+use crate::db::column_walker::AllColumnsOf;
 use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD;
+use crate::db::raw_query_builder::QueryBuilder;
+use crate::db::schema;
 use crate::db::true_or_cast_error::matches_sentinel;
 use const_format::concatcp;
 use diesel::pg::Pg;
@@ -12,7 +15,9 @@ use diesel::result::Error as DieselError;
 use diesel::sql_types;
 use omicron_common::api::external;
 use omicron_common::nexus_config::RegionAllocationStrategy;
-use std::cell::Cell;
+
+type AllColumnsOfRegion = AllColumnsOf<schema::region::table>;
+type AllColumnsOfDataset = AllColumnsOf<schema::dataset::table>;
 
 const NOT_ENOUGH_DATASETS_SENTINEL: &'static str = "Not enough datasets";
 const NOT_ENOUGH_ZPOOL_SPACE_SENTINEL: &'static str = "Not enough space";
@@ -58,78 +63,6 @@ pub fn from_diesel(e: DieselError) -> external::Error {
     error::public_error_from_diesel(e, error::ErrorHandler::Server)
 }
 
-struct BindParamCounter(Cell<i32>);
-impl BindParamCounter {
-    fn new() -> Self {
-        Self(0.into())
-    }
-    fn next(&self) -> i32 {
-        self.0.set(self.0.get() + 1);
-        self.0.get()
-    }
-}
-
-trait SqlQueryBinds {
-    fn add_bind(self, bind_counter: &BindParamCounter) -> Self;
-}
-
-impl<'a, Query> SqlQueryBinds
-    for diesel::query_builder::BoxedSqlQuery<'a, Pg, Query>
-{
-    fn add_bind(self, bind_counter: &BindParamCounter) -> Self {
-        self.sql("$").sql(bind_counter.next().to_string())
-    }
-}
-
-// A small wrapper around [diesel::query_builder::BoxedSqlQuery] which
-// assists with counting bind parameters and recommends avoiding the usage of
-// any non-static strings in query construction.
-struct QueryBuilder {
-    query: diesel::query_builder::BoxedSqlQuery<
-        'static,
-        Pg,
-        diesel::query_builder::SqlQuery,
-    >,
-    bind_counter: BindParamCounter,
-}
-
-impl QueryBuilder {
-    fn new() -> Self {
-        Self {
-            query: diesel::sql_query("").into_boxed(),
-            bind_counter: BindParamCounter::new(),
-        }
-    }
-
-    // Identifies that a bind parameter should exist in this location within
-    // the SQL string.
-    fn param(self) -> Self {
-        Self {
-            query: self
-                .query
-                .sql("$")
-                .sql(self.bind_counter.next().to_string()),
-            bind_counter: self.bind_counter,
-        }
-    }
-
-    // Slightly more strict than the "sql" method of Diesel's SqlQuery.
-    // Only permits "&'static str" intentionally to limit susceptibility to
-    // SQL injection.
-    fn sql(self, s: &'static str) -> Self {
-        Self { query: self.query.sql(s), bind_counter: self.bind_counter }
-    }
-
-    fn bind<BindSt, Value>(self, b: Value) -> Self
-    where
-        Pg: sql_types::HasSqlType<BindSt>,
-        Value: diesel::serialize::ToSql<BindSt, Pg> + Send + 'static,
-        BindSt: Send + 'static,
-    {
-        Self { query: self.query.bind(b), bind_counter: self.bind_counter }
-    }
-}
-
 pub fn allocation_query(
     volume_id: uuid::Uuid,
     block_size: u64,
@@ -171,15 +104,7 @@ pub fn allocation_query(
 // Find all old regions associated with a particular volume
 "WITH
   old_regions AS (
-    SELECT
-      region.id,
-      region.time_created,
-      region.time_modified,
-      region.dataset_id,
-      region.volume_id,
-      region.block_size,
-      region.blocks_per_extent,
-      region.extent_count
+    SELECT ").sql(AllColumnsOfRegion::with_prefix("region")).sql("
     FROM region WHERE (region.volume_id = ").param().sql(")),")
     .bind::<sql_types::Uuid, _>(volume_id)
@@ -241,8 +166,7 @@ pub fn allocation_query(
     // usage as Crucible storage.
     //
     // We select only one dataset from each zpool.
-    .sql(
-"
+    .sql("
   candidate_datasets AS (
     SELECT DISTINCT ON (dataset.pool_id)
       dataset.id,
       dataset.pool_id
     FROM (dataset INNER JOIN candidate_zpools ON (dataset.pool_id = candidate_zpools.pool_id))
     WHERE (
       ((dataset.time_deleted IS NULL) AND
       (dataset.size_used IS NOT NULL)) AND
       (dataset.kind = 'crucible')
     )
-    ORDER BY dataset.pool_id, md5((CAST(dataset.id as BYTEA) || ").param().sql("))),")
+    ORDER BY dataset.pool_id, md5((CAST(dataset.id as BYTEA) || ").param().sql("))
+  ),")
    .bind::<sql_types::Bytea, _>(seed.clone())
    // We order by md5 to shuffle the ordering of the datasets.
    // md5 has a uniform output distribution so it does the job.
-    .sql(
-"
+    .sql("
   shuffled_candidate_datasets AS (
     SELECT
       candidate_datasets.id,
       candidate_datasets.pool_id
     FROM candidate_datasets
-    ORDER BY md5((CAST(candidate_datasets.id as BYTEA) || ").param().sql(")) LIMIT ").param().sql("),")
+    ORDER BY md5((CAST(candidate_datasets.id as BYTEA) || ").param().sql(")) LIMIT ").param().sql("
+  ),")
    .bind::<sql_types::Bytea, _>(seed)
    .bind::<sql_types::BigInt, _>(redunancy)
    // Create the regions-to-be-inserted for the volume.
-    .sql(
-"
+    .sql("
   candidate_regions AS (
     SELECT
       gen_random_uuid() AS id,
       now() AS time_created,
       now() AS time_modified,
       shuffled_candidate_datasets.id AS dataset_id,
       ").param().sql(" AS volume_id,
       ").param().sql(" AS block_size,
       ").param().sql(" AS blocks_per_extent,
       ").param().sql(" AS extent_count
-  FROM shuffled_candidate_datasets),")
+  FROM shuffled_candidate_datasets
+  ),")
    .bind::<sql_types::Uuid, _>(volume_id)
    .bind::<sql_types::BigInt, _>(block_size as i64)
    .bind::<sql_types::BigInt, _>(blocks_per_extent as i64)
@@ -296,7 +221,8 @@ pub fn allocation_query(
       candidate_regions.dataset_id AS id,
       dataset.pool_id AS pool_id,
       ((candidate_regions.block_size * candidate_regions.blocks_per_extent) * candidate_regions.extent_count) AS size_used_delta
-    FROM (candidate_regions INNER JOIN dataset ON (dataset.id = candidate_regions.dataset_id))),")
+    FROM (candidate_regions INNER JOIN dataset ON (dataset.id = candidate_regions.dataset_id))
+  ),")
    // Confirms whether or not the insertion and updates should
    // occur.
    //
@@ -327,99 +253,47 @@ pub fn allocation_query(
       CAST(IF(((SELECT COUNT(*) FROM candidate_zpools LIMIT 1) >= ").param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_ZPOOL_SPACE_SENTINEL, "') AS BOOL)) AND
       CAST(IF(((SELECT COUNT(*) FROM candidate_regions LIMIT 1) >= ")).param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_DATASETS_SENTINEL, "') AS BOOL)) AND
       CAST(IF(((SELECT COUNT(DISTINCT dataset.pool_id) FROM (candidate_regions INNER JOIN dataset ON (candidate_regions.dataset_id = dataset.id)) LIMIT 1) >= ")).param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_UNIQUE_ZPOOLS_SENTINEL, "') AS BOOL)
-    ) AS insert),"))
+    ) AS insert
+  ),"))
    .bind::<sql_types::BigInt, _>(redunancy)
    .bind::<sql_types::BigInt, _>(redunancy)
    .bind::<sql_types::BigInt, _>(redunancy)
    .bind::<sql_types::BigInt, _>(redunancy)
-    .sql(
-"
+    .sql("
   inserted_regions AS (
     INSERT INTO region
       (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count)
-    SELECT
-      candidate_regions.id,
-      candidate_regions.time_created,
-      candidate_regions.time_modified,
-      candidate_regions.dataset_id,
-      candidate_regions.volume_id,
-      candidate_regions.block_size,
-      candidate_regions.blocks_per_extent,
-      candidate_regions.extent_count
+    SELECT ").sql(AllColumnsOfRegion::with_prefix("candidate_regions")).sql("
     FROM candidate_regions
     WHERE
       (SELECT do_insert.insert FROM do_insert LIMIT 1)
-    RETURNING
-      region.id,
-      region.time_created,
-      region.time_modified,
-      region.dataset_id,
-      region.volume_id,
-      region.block_size,
-      region.blocks_per_extent,
-      region.extent_count
-  ),
+    RETURNING ").sql(AllColumnsOfRegion::with_prefix("region")).sql("
+  ),
   updated_datasets AS (
     UPDATE dataset SET
       size_used = (dataset.size_used + (SELECT proposed_dataset_changes.size_used_delta FROM proposed_dataset_changes WHERE (proposed_dataset_changes.id = dataset.id) LIMIT 1))
     WHERE (
      (dataset.id = ANY(SELECT proposed_dataset_changes.id FROM proposed_dataset_changes)) AND
      (SELECT do_insert.insert FROM do_insert LIMIT 1))
-    RETURNING
-      dataset.id,
-      dataset.time_created,
-      dataset.time_modified,
-      dataset.time_deleted,
-      dataset.rcgen,
-      dataset.pool_id,
-      dataset.ip,
-      dataset.port,
-      dataset.kind,
-      dataset.size_used
-  )
-(SELECT
-  dataset.id,
-  dataset.time_created,
-  dataset.time_modified,
-  dataset.time_deleted,
-  dataset.rcgen,
-  dataset.pool_id,
-  dataset.ip,
-  dataset.port,
-  dataset.kind,
-  dataset.size_used,
-  old_regions.id,
-  old_regions.time_created,
-  old_regions.time_modified,
-  old_regions.dataset_id,
-  old_regions.volume_id,
-  old_regions.block_size,
-  old_regions.blocks_per_extent,
-  old_regions.extent_count
-FROM
-  (old_regions INNER JOIN dataset ON (old_regions.dataset_id = dataset.id))
-) UNION
-(SELECT
-  updated_datasets.id,
- 
updated_datasets.time_created,
-  updated_datasets.time_modified,
-  updated_datasets.time_deleted,
-  updated_datasets.rcgen,
-  updated_datasets.pool_id,
-  updated_datasets.ip,
-  updated_datasets.port,
-  updated_datasets.kind,
-  updated_datasets.size_used,
-  inserted_regions.id,
-  inserted_regions.time_created,
-  inserted_regions.time_modified,
-  inserted_regions.dataset_id,
-  inserted_regions.volume_id,
-  inserted_regions.block_size,
-  inserted_regions.blocks_per_extent,
-  inserted_regions.extent_count
-FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id)))"
-    ).query
+    RETURNING ").sql(AllColumnsOfDataset::with_prefix("dataset")).sql("
+  )
+(
+  SELECT ")
+        .sql(AllColumnsOfDataset::with_prefix("dataset"))
+        .sql(", ")
+        .sql(AllColumnsOfRegion::with_prefix("old_regions")).sql("
+  FROM
+    (old_regions INNER JOIN dataset ON (old_regions.dataset_id = dataset.id))
+)
+UNION
+(
+  SELECT ")
+        .sql(AllColumnsOfDataset::with_prefix("updated_datasets"))
+        .sql(", ")
+        .sql(AllColumnsOfRegion::with_prefix("inserted_regions")).sql("
+  FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id))
+)"
+    ).query()
 }
 
 #[cfg(test)]
diff --git a/nexus/db-queries/src/db/raw_query_builder.rs b/nexus/db-queries/src/db/raw_query_builder.rs
new file mode 100644
index 0000000000..a303378c0c
--- /dev/null
+++ b/nexus/db-queries/src/db/raw_query_builder.rs
@@ -0,0 +1,151 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Utilities for building string-based queries.
+//!
+//! These largely side-step Diesel's type system,
+//! and are recommended for more complex CTEs
+
+use diesel::pg::Pg;
+use diesel::sql_types;
+use std::cell::Cell;
+
+// Keeps a counter to "how many bind parameters have been used" to
+// aid in the construction of the query string.
+struct BindParamCounter(Cell<i32>);
+impl BindParamCounter {
+    fn new() -> Self {
+        Self(0.into())
+    }
+    fn next(&self) -> i32 {
+        self.0.set(self.0.get() + 1);
+        self.0.get()
+    }
+}
+
+/// A "trusted" string, which can be used to construct SQL queries even
+/// though it isn't static. We use "trust" to refer to "protection from
+/// SQL injections".
+///
+/// This is basically a workaround for cases where we haven't yet been
+/// able to construct a query at compile-time.
+pub enum TrustedStr {
+    Static(&'static str),
+    ValidatedExplicitly(String),
+}
+
+impl TrustedStr {
+    /// Explicitly constructs a string, with a name that hopefully
+    /// gives callers some pause when calling this API.
+    ///
+    /// If arbitrary user input is provided here, this string COULD
+    /// cause SQL injection attacks, so each call-site should have a
+    /// justification for "why it's safe".
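+    ///
+    /// As a purely illustrative example (not part of this commit), the kind
+    /// of call this is meant for builds the string from compile-time pieces
+    /// only, as `ColumnWalker::with_prefix` does in `column_walker.rs`:
+    ///
+    /// ```text
+    /// let cols = TrustedStr::this_string_wont_cause_sql_injections(
+    ///     ["foo", "id"].join("."),
+    /// );
+    /// ```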
+    pub fn this_string_wont_cause_sql_injections(s: String) -> Self {
+        Self::ValidatedExplicitly(s)
+    }
+
+    #[cfg(test)]
+    pub fn as_str(&self) -> &str {
+        match self {
+            Self::Static(s) => s,
+            Self::ValidatedExplicitly(s) => s.as_str(),
+        }
+    }
+}
+
+impl From<&'static str> for TrustedStr {
+    fn from(s: &'static str) -> Self {
+        Self::Static(s)
+    }
+}
+
+trait SqlQueryBinds {
+    fn add_bind(self, bind_counter: &BindParamCounter) -> Self;
+}
+
+impl<'a, Query> SqlQueryBinds
+    for diesel::query_builder::BoxedSqlQuery<'a, Pg, Query>
+{
+    fn add_bind(self, bind_counter: &BindParamCounter) -> Self {
+        self.sql("$").sql(bind_counter.next().to_string())
+    }
+}
+
+type BoxedQuery = diesel::query_builder::BoxedSqlQuery<
+    'static,
+    Pg,
+    diesel::query_builder::SqlQuery,
+>;
+
+/// A small wrapper around [diesel::query_builder::BoxedSqlQuery] which
+/// assists with counting bind parameters and recommends avoiding the usage of
+/// any non-static strings in query construction.
+// NOTE: I'd really like to eventually be able to construct SQL statements
+// entirely at compile-time, but the combination of "const generics" and "const
+// fns" in stable Rust just isn't there yet.
+//
+// It's definitely possible to create static string builders that operate
+// entirely at compile-time, like:
+// https://play.rust-lang.org/?version=nightly&mode=debug&edition=2021&gist=26d0276648c3315f285372a19d0d492f
+//
+// But this relies on nightly features.
+pub struct QueryBuilder {
+    query: BoxedQuery,
+    bind_counter: BindParamCounter,
+}
+
+impl QueryBuilder {
+    pub fn new() -> Self {
+        Self {
+            query: diesel::sql_query("").into_boxed(),
+            bind_counter: BindParamCounter::new(),
+        }
+    }
+
+    /// Identifies that a bind parameter should exist in this location within
+    /// the SQL string.
+    ///
+    /// This should be called the same number of times as [Self::bind]. It is,
+    /// however, a distinct method, as "identifying bind params" should be
+    /// decoupled from "using bind parameters" to have an efficient statement
+    /// cache.
+    pub fn param(self) -> Self {
+        Self {
+            query: self
+                .query
+                .sql("$")
+                .sql(self.bind_counter.next().to_string()),
+            bind_counter: self.bind_counter,
+        }
+    }
+
+    /// Slightly more strict than the "sql" method of Diesel's SqlQuery.
+    /// Only permits strings which have been validated intentionally to limit
+    /// susceptibility to SQL injection.
+    ///
+    /// See the documentation of [TrustedStr] for more details.
+    pub fn sql<S: Into<TrustedStr>>(self, s: S) -> Self {
+        let query = match s.into() {
+            TrustedStr::Static(s) => self.query.sql(s),
+            TrustedStr::ValidatedExplicitly(s) => self.query.sql(s),
+        };
+        Self { query, bind_counter: self.bind_counter }
+    }
+
+    /// A call-through function to [diesel::query_builder::BoxedSqlQuery].
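+    ///
+    /// Illustrative only: every `bind` should pair with an earlier
+    /// [Self::param], so a hypothetical fragment reads
+    ///
+    /// ```text
+    /// builder.sql("WHERE region.volume_id = ").param()
+    ///     .bind::<sql_types::Uuid, _>(volume_id)
+    /// ```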
+    pub fn bind<BindSt, Value>(self, b: Value) -> Self
+    where
+        Pg: sql_types::HasSqlType<BindSt>,
+        Value: diesel::serialize::ToSql<BindSt, Pg> + Send + 'static,
+        BindSt: Send + 'static,
+    {
+        Self { query: self.query.bind(b), bind_counter: self.bind_counter }
+    }
+
+    /// Takes the final boxed query
+    pub fn query(self) -> BoxedQuery {
+        self.query
+    }
+}

From a756c26c5ebd44587abc2dd738041f5b909869c7 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Wed, 14 Feb 2024 13:44:54 -0800
Subject: [PATCH 06/14] Improve logging to help me debug

---
 nexus/db-queries/src/db/datastore/region.rs |  6 +++++
 nexus/src/app/sagas/mod.rs                  | 23 +++++++++++------
 nexus/src/app/sagas/snapshot_create.rs      | 28 +++++++++++++++++----
 3 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs
index 24298c4ede..b334ac1085 100644
--- a/nexus/db-queries/src/db/datastore/region.rs
+++ b/nexus/db-queries/src/db/datastore/region.rs
@@ -153,6 +153,12 @@ impl DataStore {
         .map(|DatasetAndRegion { dataset, region }| (dataset, region))
         .collect::<Vec<_>>();
 
+        info!(
+            self.log,
+            "Allocated regions for volume";
+            "volume_id" => %volume_id,
+            "datasets_and_regions" => ?dataset_and_regions,
+        );
         Ok(dataset_and_regions)
     }
 
diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs
index e9f800c61b..7edb3a10ec 100644
--- a/nexus/src/app/sagas/mod.rs
+++ b/nexus/src/app/sagas/mod.rs
@@ -358,8 +358,8 @@ where
             Err(progenitor_client::Error::CommunicationError(e)) => {
                 warn!(
                     log,
-                    "saw transient communication error {}, retrying...",
-                    e,
+                    "saw transient communication error, retrying...";
+                    "error" => %e,
                 );
 
                 Err(backoff::BackoffError::transient(
@@ -382,16 +382,23 @@ where
                     }
 
                     // Anything else is a permanent error
-                    _ => Err(backoff::BackoffError::Permanent(
-                        progenitor_client::Error::ErrorResponse(
-                            response_value,
-                        ),
-                    )),
+                    _ => {
+                        warn!(
+                            log,
+                            "saw response implying permanent error, aborting";
+                            "response" => ?response_value
+                        );
+                        Err(backoff::BackoffError::Permanent(
+                            progenitor_client::Error::ErrorResponse(
+                                response_value,
+                            ),
+                        ))
+                    },
                 }
             }
             Err(e) => {
-                warn!(log, "saw permanent error {}, aborting", e,);
+                warn!(log, "saw permanent error, aborting"; "error" => %e);
 
                 Err(backoff::BackoffError::Permanent(e))
             }
diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs
index e017ab377b..3d2dfbe458 100644
--- a/nexus/src/app/sagas/snapshot_create.rs
+++ b/nexus/src/app/sagas/snapshot_create.rs
@@ -1175,7 +1175,7 @@ async fn ssc_start_running_snapshot(
     );
 
     let snapshot_id = sagactx.lookup::<Uuid>("snapshot_id")?;
-    info!(log, "starting running snapshot for {snapshot_id}");
+    info!(log, "starting running snapshot"; "snapshot_id" => %snapshot_id);
 
     let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
         .disk_id(params.disk_id)
@@ -1198,7 +1198,13 @@ async fn ssc_start_running_snapshot(
         let url = format!("http://{}", dataset.address());
         let client = CrucibleAgentClient::new(&url);
 
-        info!(log, "dataset {:?} region {:?} url {}", dataset, region, url);
+        info!(
+            log,
+            "contacting crucible agent to confirm region exists";
+            "dataset" => ?dataset,
+            "region" => ?region,
+            "url" => url,
+        );
 
         // Validate with the Crucible agent that the snapshot exists
         let crucible_region = retry_until_known_result(log, || async {
@@ -1208,7 +1214,11 @@ async fn ssc_start_running_snapshot(
         .map_err(|e| e.to_string())
         .map_err(ActionError::action_failed)?;
 
-        info!(log, "crucible region {:?}", crucible_region);
+        info!(
+            log,
+            "confirmed the region 
exists with crucible agent";
+            "crucible region" => ?crucible_region
+        );
 
         let crucible_snapshot = retry_until_known_result(log, || async {
             client
@@ -1222,7 +1232,11 @@ async fn ssc_start_running_snapshot(
         .map_err(|e| e.to_string())
         .map_err(ActionError::action_failed)?;
 
-        info!(log, "crucible snapshot {:?}", crucible_snapshot);
+        info!(
+            log,
+            "successfully accessed crucible snapshot";
+            "crucible snapshot" => ?crucible_snapshot
+        );
 
         // Start the snapshot running
         let crucible_running_snapshot =
@@ -1238,7 +1252,11 @@ async fn ssc_start_running_snapshot(
         .map_err(|e| e.to_string())
         .map_err(ActionError::action_failed)?;
 
-        info!(log, "crucible running snapshot {:?}", crucible_running_snapshot);
+        info!(
+            log,
+            "successfully started running region snapshot";
+            "crucible running snapshot" => ?crucible_running_snapshot
+        );
 
         // Map from the region to the snapshot
         let region_addr = format!(

From 1ec1c8193c20748e95e0980cc840cecb8ea0a9c5 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Wed, 14 Feb 2024 13:53:14 -0800
Subject: [PATCH 07/14] Add expectorate test

---
 .../src/db/queries/region_allocation.rs       | 35 +++++++
 .../output/region_allocate_distinct_sleds.sql | 96 ++++++++++++++++++
 .../output/region_allocate_random_sleds.sql   | 97 +++++++++++++++++++
 3 files changed, 228 insertions(+)
 create mode 100644 nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql
 create mode 100644 nexus/db-queries/tests/output/region_allocate_random_sleds.sql

diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index dd7431d723..9169f619a8 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -304,6 +304,41 @@ mod test {
     use omicron_test_utils::dev;
     use uuid::Uuid;
 
+    // This test is a bit of a "change detector", but it's here to help with
+    // debugging too. If you change this query, it can be useful to see exactly
+    // how the output SQL has been altered.
+    #[tokio::test]
+    async fn expectorate_query() {
+        let volume_id = Uuid::new_v4();
+        let block_size = 512;
+        let blocks_per_extent = 4;
+        let extent_count = 8;
+
+        // First structure: "RandomWithDistinctSleds"
+
+        let region_allocate = allocation_query(
+            volume_id,
+            block_size,
+            blocks_per_extent,
+            extent_count,
+            &RegionAllocationStrategy::RandomWithDistinctSleds { seed: Some(1) },
+        );
+        let s = diesel::debug_query::<Pg, _>(&region_allocate).to_string();
+        expectorate::assert_contents("tests/output/region_allocate_distinct_sleds.sql", &s);
+
+        // Second structure: "Random"
+
+        let region_allocate = allocation_query(
+            volume_id,
+            block_size,
+            blocks_per_extent,
+            extent_count,
+            &RegionAllocationStrategy::Random { seed: Some(1) },
+        );
+        let s = diesel::debug_query::<Pg, _>(&region_allocate).to_string();
+        expectorate::assert_contents("tests/output/region_allocate_random_sleds.sql", &s);
+    }
+
     // Explain the possible forms of the SQL query to ensure that it
     // creates a valid SQL string.
#[tokio::test] diff --git a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql new file mode 100644 index 0000000000..50b1637753 --- /dev/null +++ b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql @@ -0,0 +1,96 @@ +WITH + old_regions AS ( + SELECT region.id, region.time_created, region.time_modified, region.dataset_id, region.volume_id, region.block_size, region.blocks_per_extent, region.extent_count + FROM region WHERE (region.volume_id = $1)), + old_zpool_usage AS ( + SELECT + dataset.pool_id, + sum(dataset.size_used) AS size_used + FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id), + candidate_zpools AS ( + SELECT DISTINCT ON (zpool.sled_id) + old_zpool_usage.pool_id + FROM ( + old_zpool_usage + INNER JOIN + (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) ON (zpool.id = old_zpool_usage.pool_id) + ) + WHERE ( + ((old_zpool_usage.size_used + $2 ) <= total_size) + AND + (sled.provision_state = 'provisionable') + ) + ORDER BY zpool.sled_id, md5((CAST(zpool.id as BYTEA) || $3))), + candidate_datasets AS ( + SELECT DISTINCT ON (dataset.pool_id) + dataset.id, + dataset.pool_id + FROM (dataset INNER JOIN candidate_zpools ON (dataset.pool_id = candidate_zpools.pool_id)) + WHERE ( + ((dataset.time_deleted IS NULL) AND + (dataset.size_used IS NOT NULL)) AND + (dataset.kind = 'crucible') + ) + ORDER BY dataset.pool_id, md5((CAST(dataset.id as BYTEA) || $4)) + ), + shuffled_candidate_datasets AS ( + SELECT + candidate_datasets.id, + candidate_datasets.pool_id + FROM candidate_datasets + ORDER BY md5((CAST(candidate_datasets.id as BYTEA) || $5)) LIMIT $6 + ), + candidate_regions AS ( + SELECT + gen_random_uuid() AS id, + now() AS time_created, + now() AS time_modified, + shuffled_candidate_datasets.id AS dataset_id, + $7 AS volume_id, + $8 AS block_size, + $9 AS blocks_per_extent, + $10 AS extent_count + FROM shuffled_candidate_datasets + ), + proposed_dataset_changes AS ( + SELECT + candidate_regions.dataset_id AS id, + dataset.pool_id AS pool_id, + ((candidate_regions.block_size * candidate_regions.blocks_per_extent) * candidate_regions.extent_count) AS size_used_delta + FROM (candidate_regions INNER JOIN dataset ON (dataset.id = candidate_regions.dataset_id)) + ), + do_insert AS ( + SELECT ((( + ((SELECT COUNT(*) FROM old_regions LIMIT 1) < $11) AND + CAST(IF(((SELECT COUNT(*) FROM candidate_zpools LIMIT 1) >= $12), 'TRUE', 'Not enough space') AS BOOL)) AND + CAST(IF(((SELECT COUNT(*) FROM candidate_regions LIMIT 1) >= $13), 'TRUE', 'Not enough datasets') AS BOOL)) AND + CAST(IF(((SELECT COUNT(DISTINCT dataset.pool_id) FROM (candidate_regions INNER JOIN dataset ON (candidate_regions.dataset_id = dataset.id)) LIMIT 1) >= $14), 'TRUE', 'Not enough unique zpools selected') AS BOOL) + ) AS insert + ), + inserted_regions AS ( + INSERT INTO region + (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count) + SELECT candidate_regions.id, candidate_regions.time_created, candidate_regions.time_modified, candidate_regions.dataset_id, candidate_regions.volume_id, candidate_regions.block_size, candidate_regions.blocks_per_extent, candidate_regions.extent_count + FROM candidate_regions + WHERE + (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING region.id, region.time_created, region.time_modified, region.dataset_id, region.volume_id, region.block_size, region.blocks_per_extent, 
region.extent_count + ), + updated_datasets AS ( + UPDATE dataset SET + size_used = (dataset.size_used + (SELECT proposed_dataset_changes.size_used_delta FROM proposed_dataset_changes WHERE (proposed_dataset_changes.id = dataset.id) LIMIT 1)) + WHERE ( + (dataset.id = ANY(SELECT proposed_dataset_changes.id FROM proposed_dataset_changes)) AND + (SELECT do_insert.insert FROM do_insert LIMIT 1)) + RETURNING dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used + ) +( + SELECT dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used, old_regions.id, old_regions.time_created, old_regions.time_modified, old_regions.dataset_id, old_regions.volume_id, old_regions.block_size, old_regions.blocks_per_extent, old_regions.extent_count + FROM + (old_regions INNER JOIN dataset ON (old_regions.dataset_id = dataset.id)) +) +UNION +( + SELECT updated_datasets.id, updated_datasets.time_created, updated_datasets.time_modified, updated_datasets.time_deleted, updated_datasets.rcgen, updated_datasets.pool_id, updated_datasets.ip, updated_datasets.port, updated_datasets.kind, updated_datasets.size_used, inserted_regions.id, inserted_regions.time_created, inserted_regions.time_modified, inserted_regions.dataset_id, inserted_regions.volume_id, inserted_regions.block_size, inserted_regions.blocks_per_extent, inserted_regions.extent_count + FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id)) +) -- binds: [b1aa0a63-9afa-4fc3-b61e-0ebdc7854840, 16384, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3, b1aa0a63-9afa-4fc3-b61e-0ebdc7854840, 512, 4, 8, 3, 3, 3, 3] \ No newline at end of file diff --git a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql new file mode 100644 index 0000000000..e40caee93e --- /dev/null +++ b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql @@ -0,0 +1,97 @@ +WITH + old_regions AS ( + SELECT region.id, region.time_created, region.time_modified, region.dataset_id, region.volume_id, region.block_size, region.blocks_per_extent, region.extent_count + FROM region WHERE (region.volume_id = $1)), + old_zpool_usage AS ( + SELECT + dataset.pool_id, + sum(dataset.size_used) AS size_used + FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id), + candidate_zpools AS ( + SELECT + old_zpool_usage.pool_id + FROM ( + old_zpool_usage + INNER JOIN + (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) + ON (zpool.id = old_zpool_usage.pool_id) + ) + WHERE ( + ((old_zpool_usage.size_used + $2 ) <= total_size) + AND + (sled.provision_state = 'provisionable') + ) + ), + candidate_datasets AS ( + SELECT DISTINCT ON (dataset.pool_id) + dataset.id, + dataset.pool_id + FROM (dataset INNER JOIN candidate_zpools ON (dataset.pool_id = candidate_zpools.pool_id)) + WHERE ( + ((dataset.time_deleted IS NULL) AND + (dataset.size_used IS NOT NULL)) AND + (dataset.kind = 'crucible') + ) + ORDER BY dataset.pool_id, md5((CAST(dataset.id as BYTEA) || $3)) + ), + shuffled_candidate_datasets AS ( + SELECT + candidate_datasets.id, + candidate_datasets.pool_id + FROM candidate_datasets + ORDER BY 
md5((CAST(candidate_datasets.id as BYTEA) || $4)) LIMIT $5 + ), + candidate_regions AS ( + SELECT + gen_random_uuid() AS id, + now() AS time_created, + now() AS time_modified, + shuffled_candidate_datasets.id AS dataset_id, + $6 AS volume_id, + $7 AS block_size, + $8 AS blocks_per_extent, + $9 AS extent_count + FROM shuffled_candidate_datasets + ), + proposed_dataset_changes AS ( + SELECT + candidate_regions.dataset_id AS id, + dataset.pool_id AS pool_id, + ((candidate_regions.block_size * candidate_regions.blocks_per_extent) * candidate_regions.extent_count) AS size_used_delta + FROM (candidate_regions INNER JOIN dataset ON (dataset.id = candidate_regions.dataset_id)) + ), + do_insert AS ( + SELECT ((( + ((SELECT COUNT(*) FROM old_regions LIMIT 1) < $10) AND + CAST(IF(((SELECT COUNT(*) FROM candidate_zpools LIMIT 1) >= $11), 'TRUE', 'Not enough space') AS BOOL)) AND + CAST(IF(((SELECT COUNT(*) FROM candidate_regions LIMIT 1) >= $12), 'TRUE', 'Not enough datasets') AS BOOL)) AND + CAST(IF(((SELECT COUNT(DISTINCT dataset.pool_id) FROM (candidate_regions INNER JOIN dataset ON (candidate_regions.dataset_id = dataset.id)) LIMIT 1) >= $13), 'TRUE', 'Not enough unique zpools selected') AS BOOL) + ) AS insert + ), + inserted_regions AS ( + INSERT INTO region + (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count) + SELECT candidate_regions.id, candidate_regions.time_created, candidate_regions.time_modified, candidate_regions.dataset_id, candidate_regions.volume_id, candidate_regions.block_size, candidate_regions.blocks_per_extent, candidate_regions.extent_count + FROM candidate_regions + WHERE + (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING region.id, region.time_created, region.time_modified, region.dataset_id, region.volume_id, region.block_size, region.blocks_per_extent, region.extent_count + ), + updated_datasets AS ( + UPDATE dataset SET + size_used = (dataset.size_used + (SELECT proposed_dataset_changes.size_used_delta FROM proposed_dataset_changes WHERE (proposed_dataset_changes.id = dataset.id) LIMIT 1)) + WHERE ( + (dataset.id = ANY(SELECT proposed_dataset_changes.id FROM proposed_dataset_changes)) AND + (SELECT do_insert.insert FROM do_insert LIMIT 1)) + RETURNING dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used + ) +( + SELECT dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used, old_regions.id, old_regions.time_created, old_regions.time_modified, old_regions.dataset_id, old_regions.volume_id, old_regions.block_size, old_regions.blocks_per_extent, old_regions.extent_count + FROM + (old_regions INNER JOIN dataset ON (old_regions.dataset_id = dataset.id)) +) +UNION +( + SELECT updated_datasets.id, updated_datasets.time_created, updated_datasets.time_modified, updated_datasets.time_deleted, updated_datasets.rcgen, updated_datasets.pool_id, updated_datasets.ip, updated_datasets.port, updated_datasets.kind, updated_datasets.size_used, inserted_regions.id, inserted_regions.time_created, inserted_regions.time_modified, inserted_regions.dataset_id, inserted_regions.volume_id, inserted_regions.block_size, inserted_regions.blocks_per_extent, inserted_regions.extent_count + FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id)) +) -- binds: 
[b1aa0a63-9afa-4fc3-b61e-0ebdc7854840, 16384, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3, b1aa0a63-9afa-4fc3-b61e-0ebdc7854840, 512, 4, 8, 3, 3, 3, 3]
\ No newline at end of file

From b2bdaff6d3f5bc1eda3d8c110ac11c285feb88dc Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Wed, 14 Feb 2024 14:47:40 -0800
Subject: [PATCH 08/14] Spelling, stable EXPECTORATE tests

---
 .../db-queries/src/db/queries/region_allocation.rs | 14 +++++++-------
 .../output/region_allocate_distinct_sleds.sql      |  2 +-
 .../tests/output/region_allocate_random_sleds.sql  |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index 9169f619a8..7314c47591 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -98,7 +98,7 @@ pub fn allocation_query(
     let seed = seed.to_le_bytes().to_vec();
 
     let size_delta = block_size * blocks_per_extent * extent_count;
-    let redunancy: i64 = i64::try_from(REGION_REDUNDANCY_THRESHOLD).unwrap();
+    let redundancy: i64 = i64::try_from(REGION_REDUNDANCY_THRESHOLD).unwrap();
 
@@ -191,7 +191,7 @@ pub fn allocation_query(
     ORDER BY md5((CAST(candidate_datasets.id as BYTEA) || ").param().sql(")) LIMIT ").param().sql("
   ),")
     .bind::<sql_types::Bytea, _>(seed)
-    .bind::<sql_types::BigInt, _>(redunancy)
+    .bind::<sql_types::BigInt, _>(redundancy)
     // Create the regions-to-be-inserted for the volume.
     .sql("
@@ -255,10 +255,10 @@ pub fn allocation_query(
       CAST(IF(((SELECT COUNT(DISTINCT dataset.pool_id) FROM (candidate_regions INNER JOIN dataset ON (candidate_regions.dataset_id = dataset.id)) LIMIT 1) >= ")).param().sql(concatcp!("), 'TRUE', '", NOT_ENOUGH_UNIQUE_ZPOOLS_SENTINEL, "') AS BOOL)
     ) AS insert
   ),"))
-    .bind::<sql_types::BigInt, _>(redunancy)
-    .bind::<sql_types::BigInt, _>(redunancy)
-    .bind::<sql_types::BigInt, _>(redunancy)
-    .bind::<sql_types::BigInt, _>(redunancy)
+    .bind::<sql_types::BigInt, _>(redundancy)
+    .bind::<sql_types::BigInt, _>(redundancy)
+    .bind::<sql_types::BigInt, _>(redundancy)
+    .bind::<sql_types::BigInt, _>(redundancy)
     .sql("
   inserted_regions AS (
     INSERT INTO region
@@ -309,7 +309,7 @@ mod test {
     // This test is a bit of a "change detector", but it's here to help with
     // debugging too. If you change this query, it can be useful to see exactly
     // how the output SQL has been altered.
#[tokio::test] async fn expectorate_query() { - let volume_id = Uuid::new_v4(); + let volume_id = Uuid::nil(); let block_size = 512; let blocks_per_extent = 4; let extent_count = 8; diff --git a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql index 50b1637753..c971d9bdf2 100644 --- a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql @@ -93,4 +93,4 @@ UNION ( SELECT updated_datasets.id, updated_datasets.time_created, updated_datasets.time_modified, updated_datasets.time_deleted, updated_datasets.rcgen, updated_datasets.pool_id, updated_datasets.ip, updated_datasets.port, updated_datasets.kind, updated_datasets.size_used, inserted_regions.id, inserted_regions.time_created, inserted_regions.time_modified, inserted_regions.dataset_id, inserted_regions.volume_id, inserted_regions.block_size, inserted_regions.blocks_per_extent, inserted_regions.extent_count FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id)) -) -- binds: [b1aa0a63-9afa-4fc3-b61e-0ebdc7854840, 16384, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3, b1aa0a63-9afa-4fc3-b61e-0ebdc7854840, 512, 4, 8, 3, 3, 3, 3] \ No newline at end of file +) -- binds: [00000000-0000-0000-0000-000000000000, 16384, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3, 00000000-0000-0000-0000-000000000000, 512, 4, 8, 3, 3, 3, 3] \ No newline at end of file diff --git a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql index e40caee93e..d5b8e5f5fb 100644 --- a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql @@ -94,4 +94,4 @@ UNION ( SELECT updated_datasets.id, updated_datasets.time_created, updated_datasets.time_modified, updated_datasets.time_deleted, updated_datasets.rcgen, updated_datasets.pool_id, updated_datasets.ip, updated_datasets.port, updated_datasets.kind, updated_datasets.size_used, inserted_regions.id, inserted_regions.time_created, inserted_regions.time_modified, inserted_regions.dataset_id, inserted_regions.volume_id, inserted_regions.block_size, inserted_regions.blocks_per_extent, inserted_regions.extent_count FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id)) -) -- binds: [b1aa0a63-9afa-4fc3-b61e-0ebdc7854840, 16384, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3, b1aa0a63-9afa-4fc3-b61e-0ebdc7854840, 512, 4, 8, 3, 3, 3, 3] \ No newline at end of file +) -- binds: [00000000-0000-0000-0000-000000000000, 16384, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3, 00000000-0000-0000-0000-000000000000, 512, 4, 8, 3, 3, 3, 3] \ No newline at end of file From 31f697be7e549abab0905cf867ccb0bb2ad270f4 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 14 Feb 2024 14:57:06 -0800 Subject: [PATCH 09/14] Better string validation visibility --- nexus/db-queries/src/db/column_walker.rs | 2 +- nexus/db-queries/src/db/raw_query_builder.rs | 30 +++++++++++--------- 2 files changed, 18 insertions(+), 14 deletions(-) 
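(Illustrative note, not part of the commit: with this rename, a hypothetical
call site - here `cols` is built only from compile-time column names and a
static prefix - would read:

    let cols = TrustedStr::i_take_responsibility_for_validating_this_string(
        ["foo", "id"].join("."),
    );
    let query = QueryBuilder::new().sql("SELECT ").sql(cols).sql(" FROM foo");

The deliberately loud method name forces each call site to state why the
string is safe, matching the doc comment's requirement.)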
diff --git a/nexus/db-queries/src/db/column_walker.rs b/nexus/db-queries/src/db/column_walker.rs index 6e21e0d899..cace2ba5fb 100644 --- a/nexus/db-queries/src/db/column_walker.rs +++ b/nexus/db-queries/src/db/column_walker.rs @@ -36,7 +36,7 @@ macro_rules! impl_column_walker { // are not controlled by an arbitrary user, and // - The "prefix" type, which is a "&'static str" (AKA, // hopefully known at compile-time, and not leaked). - TrustedStr::this_string_wont_cause_sql_injections( + TrustedStr::i_take_responsibility_for_validating_this_string( [$([prefix, $column::NAME].join("."),)+].join(", ") ) } diff --git a/nexus/db-queries/src/db/raw_query_builder.rs b/nexus/db-queries/src/db/raw_query_builder.rs index a303378c0c..b0094f09ce 100644 --- a/nexus/db-queries/src/db/raw_query_builder.rs +++ b/nexus/db-queries/src/db/raw_query_builder.rs @@ -30,10 +30,7 @@ impl BindParamCounter { /// /// This is basically a workaround for cases where we haven't yet been /// able to construct a query at compile-time. -pub enum TrustedStr { - Static(&'static str), - ValidatedExplicitly(String), -} +pub struct TrustedStr(TrustedStrVariants); impl TrustedStr { /// Explicitly constructs a string, with a name that hopefully @@ -42,25 +39,32 @@ impl TrustedStr { /// If arbitrary user input is provided here, this string COULD /// cause SQL injection attacks, so each call-site should have a /// justification for "why it's safe". - pub fn this_string_wont_cause_sql_injections(s: String) -> Self { - Self::ValidatedExplicitly(s) + pub fn i_take_responsibility_for_validating_this_string(s: String) -> Self { + Self(TrustedStrVariants::ValidatedExplicitly(s)) } #[cfg(test)] pub fn as_str(&self) -> &str { - match self { - Self::Static(s) => s, - Self::ValidatedExplicitly(s) => s.as_str(), + match &self.0 { + TrustedStrVariants::Static(s) => s, + TrustedStrVariants::ValidatedExplicitly(s) => s.as_str(), } } } impl From<&'static str> for TrustedStr { fn from(s: &'static str) -> Self { - Self::Static(s) + Self(TrustedStrVariants::Static(s)) } } +// This enum should be kept non-pub to make it harder to accidentally +// construct a "ValidatedExplicitly" variant. +enum TrustedStrVariants { + Static(&'static str), + ValidatedExplicitly(String), +} + trait SqlQueryBinds { fn add_bind(self, bind_counter: &BindParamCounter) -> Self; } @@ -127,9 +131,9 @@ impl QueryBuilder { /// /// See the documentation of [TrustedStr] for more details. 
pub fn sql<S: Into<TrustedStr>>(self, s: S) -> Self {
-        let query = match s.into() {
-            TrustedStr::Static(s) => self.query.sql(s),
-            TrustedStr::ValidatedExplicitly(s) => self.query.sql(s),
+        let query = match s.into().0 {
+            TrustedStrVariants::Static(s) => self.query.sql(s),
+            TrustedStrVariants::ValidatedExplicitly(s) => self.query.sql(s),
         };
         Self { query, bind_counter: self.bind_counter }
     }

From 2060750c2adbf0a8c45cd65431d97fa86e5db48a Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Wed, 14 Feb 2024 15:41:57 -0800
Subject: [PATCH 10/14] fmt

---
 .../db-queries/src/db/queries/region_allocation.rs | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index 7314c47591..407f2df865 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -321,10 +321,15 @@ mod test {
             block_size,
             blocks_per_extent,
             extent_count,
-            &RegionAllocationStrategy::RandomWithDistinctSleds { seed: Some(1) },
+            &RegionAllocationStrategy::RandomWithDistinctSleds {
+                seed: Some(1),
+            },
         );
         let s = diesel::debug_query::<Pg, _>(&region_allocate).to_string();
-        expectorate::assert_contents("tests/output/region_allocate_distinct_sleds.sql", &s);
+        expectorate::assert_contents(
+            "tests/output/region_allocate_distinct_sleds.sql",
+            &s,
+        );
 
         // Second structure: "Random"
 
@@ -336,7 +341,10 @@ mod test {
             &RegionAllocationStrategy::Random { seed: Some(1) },
         );
         let s = diesel::debug_query::<Pg, _>(&region_allocate).to_string();
-        expectorate::assert_contents("tests/output/region_allocate_random_sleds.sql", &s);
+        expectorate::assert_contents(
+            "tests/output/region_allocate_random_sleds.sql",
+            &s,
+        );
     }

From 291791deba716e796f3e0837ad05cbb483e74736 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Wed, 14 Feb 2024 16:27:27 -0800
Subject: [PATCH 11/14] Add test which is breaking in a more succinct way

---
 nexus/db-queries/src/db/datastore/mod.rs | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs
index f9e0be81c1..bc20d7a6cc 100644
--- a/nexus/db-queries/src/db/datastore/mod.rs
+++ b/nexus/db-queries/src/db/datastore/mod.rs
@@ -877,10 +877,21 @@ mod test {
         assert_eq!(expected_region_count, dataset_and_regions.len());
         let mut disk_datasets = HashSet::new();
         let mut disk_zpools = HashSet::new();
+        let mut regions = HashSet::new();
         for (dataset, region) in dataset_and_regions {
             // Must be 3 unique datasets
             assert!(disk_datasets.insert(dataset.id()));
 
+            // All regions should be unique
+            assert!(regions.insert(region.id()));
+
+            // Check there's no cross contamination between returned UUIDs
+            //
+            // This is a little goofy, but it catches a bug that has
+            // happened before. The returned columns share names (like
+            // "id"), so we need to process them in-order.
+            assert!(regions.get(&dataset.id()).is_none());
+            assert!(disk_datasets.get(&region.id()).is_none());
 
             // Dataset must not be non-provisionable.
assert_ne!(dataset.id(), non_provisionable_dataset_id);

From c868f715ca284b6d3d61a1cfe09c82447ab67b02 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Wed, 14 Feb 2024 16:58:14 -0800
Subject: [PATCH 12/14] arcane chicanery which bends the type system to our
 will

---
 nexus/db-macros/src/lib.rs                    |  4 +-
 nexus/db-model/src/dataset.rs                 |  1 -
 nexus/db-model/src/region.rs                  |  1 -
 nexus/db-queries/src/db/datastore/region.rs   | 21 +++------
 .../src/db/queries/region_allocation.rs       | 13 +++---
 nexus/db-queries/src/db/raw_query_builder.rs  | 44 ++++++++++++++++++-
 6 files changed, 56 insertions(+), 28 deletions(-)

diff --git a/nexus/db-macros/src/lib.rs b/nexus/db-macros/src/lib.rs
index 30208412a1..fd15b59128 100644
--- a/nexus/db-macros/src/lib.rs
+++ b/nexus/db-macros/src/lib.rs
@@ -280,7 +280,7 @@ fn build_resource_identity(
     let identity_name = format_ident!("{}Identity", struct_name);
     quote! {
         #[doc = #identity_doc]
-        #[derive(Clone, Debug, PartialEq, Eq, Selectable, Queryable, QueryableByName, Insertable, serde::Serialize, serde::Deserialize)]
+        #[derive(Clone, Debug, PartialEq, Eq, Selectable, Queryable, Insertable, serde::Serialize, serde::Deserialize)]
         #[diesel(table_name = #table_name) ]
         pub struct #identity_name {
             pub id: ::uuid::Uuid,
@@ -322,7 +322,7 @@ fn build_asset_identity(
     let identity_name = format_ident!("{}Identity", struct_name);
     quote! {
         #[doc = #identity_doc]
-        #[derive(Clone, Debug, PartialEq, Selectable, Queryable, QueryableByName, Insertable, serde::Serialize, serde::Deserialize)]
+        #[derive(Clone, Debug, PartialEq, Selectable, Queryable, Insertable, serde::Serialize, serde::Deserialize)]
         #[diesel(table_name = #table_name) ]
         pub struct #identity_name {
             pub id: ::uuid::Uuid,
diff --git a/nexus/db-model/src/dataset.rs b/nexus/db-model/src/dataset.rs
index 0ea465bebc..65c0070509 100644
--- a/nexus/db-model/src/dataset.rs
+++ b/nexus/db-model/src/dataset.rs
@@ -18,7 +18,6 @@ use uuid::Uuid;
 /// available to a service on the Sled.
 #[derive(
     Queryable,
-    QueryableByName,
     Insertable,
     Debug,
     Clone,
diff --git a/nexus/db-model/src/region.rs b/nexus/db-model/src/region.rs
index 0ea6f82e65..fefc4f4fce 100644
--- a/nexus/db-model/src/region.rs
+++ b/nexus/db-model/src/region.rs
@@ -15,7 +15,6 @@ use uuid::Uuid;
 /// allocated within a volume.
 #[derive(
     Queryable,
-    QueryableByName,
     Insertable,
     Debug,
     Clone,
diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs
index b334ac1085..c7b98b9a82 100644
--- a/nexus/db-queries/src/db/datastore/region.rs
+++ b/nexus/db-queries/src/db/datastore/region.rs
@@ -128,15 +128,7 @@ impl DataStore {
         let (blocks_per_extent, extent_count) =
             Self::get_crucible_allocation(&block_size, size);
 
-        #[derive(diesel::QueryableByName, Queryable)]
-        struct DatasetAndRegion {
-            #[diesel(embed)]
-            dataset: Dataset,
-            #[diesel(embed)]
-            region: Region,
-        }
-
-        let dataset_and_regions: Vec<_> =
+        let dataset_and_regions: Vec<(Dataset, Region)> =
             crate::db::queries::region_allocation::allocation_query(
                 volume_id,
                 block_size.to_bytes() as u64,
@@ -144,14 +136,11 @@ impl DataStore {
                 extent_count,
                 allocation_strategy,
             )
-            .get_results_async::<DatasetAndRegion>(
-                &*self.pool_connection_authorized(&opctx).await?,
-            )
+            .get_results_async(&*self.pool_connection_authorized(&opctx).await?)
             .await
-            .map_err(|e| crate::db::queries::region_allocation::from_diesel(e))?
-            .into_iter()
-            .map(|DatasetAndRegion { dataset, region }| (dataset, region))
-            .collect::<Vec<_>>();
+            .map_err(|e| {
+                crate::db::queries::region_allocation::from_diesel(e)
+            })?;

         info!(
             self.log,
diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index 407f2df865..3d39808831 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -6,7 +6,8 @@

 use crate::db::column_walker::AllColumnsOf;
 use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD;
-use crate::db::raw_query_builder::QueryBuilder;
+use crate::db::model::{Dataset, Region};
+use crate::db::raw_query_builder::{QueryBuilder, TypedSqlQuery};
 use crate::db::schema;
 use crate::db::true_or_cast_error::matches_sentinel;
 use const_format::concatcp;
@@ -63,17 +64,17 @@ pub fn from_diesel(e: DieselError) -> external::Error {
     error::public_error_from_diesel(e, error::ErrorHandler::Server)
 }

+type SelectableSql<T> = <
+    <T as diesel::Selectable<Pg>>::SelectExpression as diesel::Expression
+>::SqlType;
+
 pub fn allocation_query(
     volume_id: uuid::Uuid,
     block_size: u64,
     blocks_per_extent: u64,
     extent_count: u64,
     allocation_strategy: &RegionAllocationStrategy,
-) -> diesel::query_builder::BoxedSqlQuery<
-    'static,
-    Pg,
-    diesel::query_builder::SqlQuery,
-> {
+) -> TypedSqlQuery<(SelectableSql<Dataset>, SelectableSql<Region>)> {
     let (seed, distinct_sleds) = {
         let (input_seed, distinct_sleds) = match allocation_strategy {
             RegionAllocationStrategy::Random { seed } => (seed, false),
diff --git a/nexus/db-queries/src/db/raw_query_builder.rs b/nexus/db-queries/src/db/raw_query_builder.rs
index b0094f09ce..65ff22f86f 100644
--- a/nexus/db-queries/src/db/raw_query_builder.rs
+++ b/nexus/db-queries/src/db/raw_query_builder.rs
@@ -7,9 +7,13 @@
 //! These largely side-step Diesel's type system,
 //! and are recommended for more complex CTEs

+use crate::db::pool::DbConnection;
 use diesel::pg::Pg;
+use diesel::query_builder::{AstPass, Query, QueryFragment, QueryId};
 use diesel::sql_types;
+use diesel::RunQueryDsl;
 use std::cell::Cell;
+use std::marker::PhantomData;

 // Keeps a counter to "how many bind parameters have been used" to
 // aid in the construction of the query string.
@@ -149,7 +153,43 @@ impl QueryBuilder {
     }

     /// Takes the final boxed query
-    pub fn query(self) -> BoxedQuery {
-        self.query
+    pub fn query<T>(self) -> TypedSqlQuery<T> {
+        TypedSqlQuery { inner: self.query, _phantom: PhantomData }
     }
 }
+
+/// Diesel's [diesel::query_builder::BoxedSqlQuery] has a few drawbacks that
+/// make wrapper more palatable:
+///
+/// - It always implements "Query" with SqlType = Untyped, so a caller could try to
+/// execute this query and get back any type.
+/// - It forces the usage of "QueryableByName", which acts wrong if we're
+/// returning multiple columns with the same name (this is normal! If you want
+/// to UNION two objects that both have "id" columns, this happens).
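+///
+/// As a sketch of the intended usage (illustrative only: the table and
+/// column types below are hypothetical, and `QueryBuilder::new()` is
+/// assumed to be the builder's usual constructor):
+///
+/// ```ignore
+/// // Pins the query's SqlType up front, so callers can only decode
+/// // results as (Uuid, Text) rows -- by position rather than by name.
+/// let query: TypedSqlQuery<(sql_types::Uuid, sql_types::Text)> =
+///     QueryBuilder::new()
+///         .sql("SELECT id, name FROM my_table")
+///         .query();
+/// ```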
+#[derive(QueryId)]
+pub struct TypedSqlQuery<T> {
+    inner: diesel::query_builder::BoxedSqlQuery<
+        'static,
+        Pg,
+        diesel::query_builder::SqlQuery,
+    >,
+    _phantom: PhantomData<T>,
+}
+
+impl<T> QueryFragment<Pg> for TypedSqlQuery<T> {
+    fn walk_ast<'a>(
+        &'a self,
+        mut out: AstPass<'_, 'a, Pg>,
+    ) -> diesel::QueryResult<()> {
+        out.unsafe_to_cache_prepared();
+
+        self.inner.walk_ast(out.reborrow())?;
+        Ok(())
+    }
+}
+
+impl<T> RunQueryDsl<DbConnection> for TypedSqlQuery<T> {}
+
+impl<T> Query for TypedSqlQuery<T> {
+    type SqlType = T;
+}

From e7c71d7b63caac7046fc1eae3e0f11b71272bf9d Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Thu, 15 Feb 2024 12:32:36 -0800
Subject: [PATCH 13/14] Auto-format SQL

---
 .../src/db/queries/region_allocation.rs       |  12 +-
 .../output/region_allocate_distinct_sleds.sql | 335 ++++++++++++-----
 .../output/region_allocate_random_sleds.sql   | 336 +++++++++++++-----
 test-utils/src/dev/db.rs                      |  49 +++
 4 files changed, 550 insertions(+), 182 deletions(-)

diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index 3d39808831..31bb277af6 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -326,7 +326,11 @@ mod test {
                 seed: Some(1),
             },
         );
-        let s = diesel::debug_query::<Pg, _>(&region_allocate).to_string();
+        let s = dev::db::format_sql(
+            &diesel::debug_query::<Pg, _>(&region_allocate).to_string(),
+        )
+        .await
+        .unwrap();
         expectorate::assert_contents(
             "tests/output/region_allocate_distinct_sleds.sql",
             &s,
@@ -341,7 +345,11 @@ mod test {
             extent_count,
             &RegionAllocationStrategy::Random { seed: Some(1) },
         );
-        let s = diesel::debug_query::<Pg, _>(&region_allocate).to_string();
+        let s = dev::db::format_sql(
+            &diesel::debug_query::<Pg, _>(&region_allocate).to_string(),
+        )
+        .await
+        .unwrap();
         expectorate::assert_contents(
             "tests/output/region_allocate_random_sleds.sql",
             &s,
diff --git a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql
index c971d9bdf2..be22f92ed1 100644
--- a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql
+++ b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql
@@ -1,96 +1,253 @@
 WITH
-  old_regions AS (
-    SELECT region.id, region.time_created, region.time_modified, region.dataset_id, region.volume_id, region.block_size, region.blocks_per_extent, region.extent_count
-    FROM region WHERE (region.volume_id = $1)),
-  old_zpool_usage AS (
-    SELECT
-      dataset.pool_id,
-      sum(dataset.size_used) AS size_used
-    FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id),
-  candidate_zpools AS (
-    SELECT DISTINCT ON (zpool.sled_id)
-      old_zpool_usage.pool_id
-    FROM (
+  old_regions
+    AS (
+      SELECT
+        region.id,
+        region.time_created,
+        region.time_modified,
+        region.dataset_id,
+        region.volume_id,
+        region.block_size,
+        region.blocks_per_extent,
+        region.extent_count
+      FROM
+        region
+      WHERE
+        region.volume_id = $1
+    ),
+  old_zpool_usage
+    AS (
+      SELECT
+        dataset.pool_id, sum(dataset.size_used) AS size_used
+      FROM
+        dataset
+      WHERE
+        (dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)
+      GROUP BY
+        dataset.pool_id
+    ),
+  candidate_zpools
+    AS (
+      SELECT
+        DISTINCT ON (zpool.sled_id) old_zpool_usage.pool_id
+      FROM
         old_zpool_usage
-      INNER JOIN
-      (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) ON (zpool.id = old_zpool_usage.pool_id)
-    )
-    WHERE (
-      ((old_zpool_usage.size_used + $2 ) <= total_size)
-      AND
-      
(sled.provision_state = 'provisionable') - ) - ORDER BY zpool.sled_id, md5((CAST(zpool.id as BYTEA) || $3))), - candidate_datasets AS ( - SELECT DISTINCT ON (dataset.pool_id) - dataset.id, - dataset.pool_id - FROM (dataset INNER JOIN candidate_zpools ON (dataset.pool_id = candidate_zpools.pool_id)) - WHERE ( - ((dataset.time_deleted IS NULL) AND - (dataset.size_used IS NOT NULL)) AND - (dataset.kind = 'crucible') + INNER JOIN (zpool INNER JOIN sled ON zpool.sled_id = sled.id) ON + zpool.id = old_zpool_usage.pool_id + WHERE + (old_zpool_usage.size_used + $2) <= total_size AND sled.provision_state = 'provisionable' + ORDER BY + zpool.sled_id, md5(CAST(zpool.id AS BYTES) || $3) + ), + candidate_datasets + AS ( + SELECT + DISTINCT ON (dataset.pool_id) dataset.id, dataset.pool_id + FROM + dataset INNER JOIN candidate_zpools ON dataset.pool_id = candidate_zpools.pool_id + WHERE + ((dataset.time_deleted IS NULL) AND (dataset.size_used IS NOT NULL)) + AND dataset.kind = 'crucible' + ORDER BY + dataset.pool_id, md5(CAST(dataset.id AS BYTES) || $4) + ), + shuffled_candidate_datasets + AS ( + SELECT + candidate_datasets.id, candidate_datasets.pool_id + FROM + candidate_datasets + ORDER BY + md5(CAST(candidate_datasets.id AS BYTES) || $5) + LIMIT + $6 + ), + candidate_regions + AS ( + SELECT + gen_random_uuid() AS id, + now() AS time_created, + now() AS time_modified, + shuffled_candidate_datasets.id AS dataset_id, + $7 AS volume_id, + $8 AS block_size, + $9 AS blocks_per_extent, + $10 AS extent_count + FROM + shuffled_candidate_datasets + ), + proposed_dataset_changes + AS ( + SELECT + candidate_regions.dataset_id AS id, + dataset.pool_id AS pool_id, + candidate_regions.block_size + * candidate_regions.blocks_per_extent + * candidate_regions.extent_count + AS size_used_delta + FROM + candidate_regions INNER JOIN dataset ON dataset.id = candidate_regions.dataset_id + ), + do_insert + AS ( + SELECT + ( + ( + (SELECT count(*) FROM old_regions LIMIT 1) < $11 + AND CAST( + IF( + ((SELECT count(*) FROM candidate_zpools LIMIT 1) >= $12), + 'TRUE', + 'Not enough space' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ((SELECT count(*) FROM candidate_regions LIMIT 1) >= $13), + 'TRUE', + 'Not enough datasets' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ( + ( + SELECT + count(DISTINCT dataset.pool_id) + FROM + candidate_regions + INNER JOIN dataset ON candidate_regions.dataset_id = dataset.id + LIMIT + 1 + ) + >= $14 + ), + 'TRUE', + 'Not enough unique zpools selected' + ) + AS BOOL + ) + AS insert + ), + inserted_regions + AS ( + INSERT + INTO + region + ( + id, + time_created, + time_modified, + dataset_id, + volume_id, + block_size, + blocks_per_extent, + extent_count + ) + SELECT + candidate_regions.id, + candidate_regions.time_created, + candidate_regions.time_modified, + candidate_regions.dataset_id, + candidate_regions.volume_id, + candidate_regions.block_size, + candidate_regions.blocks_per_extent, + candidate_regions.extent_count + FROM + candidate_regions + WHERE + (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + region.id, + region.time_created, + region.time_modified, + region.dataset_id, + region.volume_id, + region.block_size, + region.blocks_per_extent, + region.extent_count + ), + updated_datasets + AS ( + UPDATE + dataset + SET + size_used + = dataset.size_used + + ( + SELECT + proposed_dataset_changes.size_used_delta + FROM + proposed_dataset_changes + WHERE + proposed_dataset_changes.id = dataset.id + LIMIT + 1 + ) + WHERE + dataset.id = ANY (SELECT proposed_dataset_changes.id FROM 
proposed_dataset_changes) + AND (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used ) - ORDER BY dataset.pool_id, md5((CAST(dataset.id as BYTEA) || $4)) - ), - shuffled_candidate_datasets AS ( - SELECT - candidate_datasets.id, - candidate_datasets.pool_id - FROM candidate_datasets - ORDER BY md5((CAST(candidate_datasets.id as BYTEA) || $5)) LIMIT $6 - ), - candidate_regions AS ( - SELECT - gen_random_uuid() AS id, - now() AS time_created, - now() AS time_modified, - shuffled_candidate_datasets.id AS dataset_id, - $7 AS volume_id, - $8 AS block_size, - $9 AS blocks_per_extent, - $10 AS extent_count - FROM shuffled_candidate_datasets - ), - proposed_dataset_changes AS ( - SELECT - candidate_regions.dataset_id AS id, - dataset.pool_id AS pool_id, - ((candidate_regions.block_size * candidate_regions.blocks_per_extent) * candidate_regions.extent_count) AS size_used_delta - FROM (candidate_regions INNER JOIN dataset ON (dataset.id = candidate_regions.dataset_id)) - ), - do_insert AS ( - SELECT ((( - ((SELECT COUNT(*) FROM old_regions LIMIT 1) < $11) AND - CAST(IF(((SELECT COUNT(*) FROM candidate_zpools LIMIT 1) >= $12), 'TRUE', 'Not enough space') AS BOOL)) AND - CAST(IF(((SELECT COUNT(*) FROM candidate_regions LIMIT 1) >= $13), 'TRUE', 'Not enough datasets') AS BOOL)) AND - CAST(IF(((SELECT COUNT(DISTINCT dataset.pool_id) FROM (candidate_regions INNER JOIN dataset ON (candidate_regions.dataset_id = dataset.id)) LIMIT 1) >= $14), 'TRUE', 'Not enough unique zpools selected') AS BOOL) - ) AS insert - ), - inserted_regions AS ( - INSERT INTO region - (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count) - SELECT candidate_regions.id, candidate_regions.time_created, candidate_regions.time_modified, candidate_regions.dataset_id, candidate_regions.volume_id, candidate_regions.block_size, candidate_regions.blocks_per_extent, candidate_regions.extent_count - FROM candidate_regions - WHERE - (SELECT do_insert.insert FROM do_insert LIMIT 1) - RETURNING region.id, region.time_created, region.time_modified, region.dataset_id, region.volume_id, region.block_size, region.blocks_per_extent, region.extent_count - ), - updated_datasets AS ( - UPDATE dataset SET - size_used = (dataset.size_used + (SELECT proposed_dataset_changes.size_used_delta FROM proposed_dataset_changes WHERE (proposed_dataset_changes.id = dataset.id) LIMIT 1)) - WHERE ( - (dataset.id = ANY(SELECT proposed_dataset_changes.id FROM proposed_dataset_changes)) AND - (SELECT do_insert.insert FROM do_insert LIMIT 1)) - RETURNING dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used - ) ( - SELECT dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used, old_regions.id, old_regions.time_created, old_regions.time_modified, old_regions.dataset_id, old_regions.volume_id, old_regions.block_size, old_regions.blocks_per_extent, old_regions.extent_count + SELECT + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used, + old_regions.id, + 
old_regions.time_created, + old_regions.time_modified, + old_regions.dataset_id, + old_regions.volume_id, + old_regions.block_size, + old_regions.blocks_per_extent, + old_regions.extent_count FROM - (old_regions INNER JOIN dataset ON (old_regions.dataset_id = dataset.id)) + old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) UNION -( - SELECT updated_datasets.id, updated_datasets.time_created, updated_datasets.time_modified, updated_datasets.time_deleted, updated_datasets.rcgen, updated_datasets.pool_id, updated_datasets.ip, updated_datasets.port, updated_datasets.kind, updated_datasets.size_used, inserted_regions.id, inserted_regions.time_created, inserted_regions.time_modified, inserted_regions.dataset_id, inserted_regions.volume_id, inserted_regions.block_size, inserted_regions.blocks_per_extent, inserted_regions.extent_count - FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id)) -) -- binds: [00000000-0000-0000-0000-000000000000, 16384, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3, 00000000-0000-0000-0000-000000000000, 512, 4, 8, 3, 3, 3, 3] \ No newline at end of file + ( + SELECT + updated_datasets.id, + updated_datasets.time_created, + updated_datasets.time_modified, + updated_datasets.time_deleted, + updated_datasets.rcgen, + updated_datasets.pool_id, + updated_datasets.ip, + updated_datasets.port, + updated_datasets.kind, + updated_datasets.size_used, + inserted_regions.id, + inserted_regions.time_created, + inserted_regions.time_modified, + inserted_regions.dataset_id, + inserted_regions.volume_id, + inserted_regions.block_size, + inserted_regions.blocks_per_extent, + inserted_regions.extent_count + FROM + inserted_regions + INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id + ) diff --git a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql index d5b8e5f5fb..c5b97df72d 100644 --- a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql @@ -1,97 +1,251 @@ WITH - old_regions AS ( - SELECT region.id, region.time_created, region.time_modified, region.dataset_id, region.volume_id, region.block_size, region.blocks_per_extent, region.extent_count - FROM region WHERE (region.volume_id = $1)), - old_zpool_usage AS ( - SELECT - dataset.pool_id, - sum(dataset.size_used) AS size_used - FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id), - candidate_zpools AS ( - SELECT - old_zpool_usage.pool_id - FROM ( - old_zpool_usage - INNER JOIN - (zpool INNER JOIN sled ON (zpool.sled_id = sled.id)) - ON (zpool.id = old_zpool_usage.pool_id) - ) - WHERE ( - ((old_zpool_usage.size_used + $2 ) <= total_size) - AND - (sled.provision_state = 'provisionable') + old_regions + AS ( + SELECT + region.id, + region.time_created, + region.time_modified, + region.dataset_id, + region.volume_id, + region.block_size, + region.blocks_per_extent, + region.extent_count + FROM + region + WHERE + region.volume_id = $1 + ), + old_zpool_usage + AS ( + SELECT + dataset.pool_id, sum(dataset.size_used) AS size_used + FROM + dataset + WHERE + (dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL) + GROUP BY + dataset.pool_id + ), + candidate_zpools + AS ( + SELECT + old_zpool_usage.pool_id + FROM 
+ old_zpool_usage + INNER JOIN (zpool INNER JOIN sled ON zpool.sled_id = sled.id) ON + zpool.id = old_zpool_usage.pool_id + WHERE + (old_zpool_usage.size_used + $2) <= total_size AND sled.provision_state = 'provisionable' + ), + candidate_datasets + AS ( + SELECT + DISTINCT ON (dataset.pool_id) dataset.id, dataset.pool_id + FROM + dataset INNER JOIN candidate_zpools ON dataset.pool_id = candidate_zpools.pool_id + WHERE + ((dataset.time_deleted IS NULL) AND (dataset.size_used IS NOT NULL)) + AND dataset.kind = 'crucible' + ORDER BY + dataset.pool_id, md5(CAST(dataset.id AS BYTES) || $3) + ), + shuffled_candidate_datasets + AS ( + SELECT + candidate_datasets.id, candidate_datasets.pool_id + FROM + candidate_datasets + ORDER BY + md5(CAST(candidate_datasets.id AS BYTES) || $4) + LIMIT + $5 + ), + candidate_regions + AS ( + SELECT + gen_random_uuid() AS id, + now() AS time_created, + now() AS time_modified, + shuffled_candidate_datasets.id AS dataset_id, + $6 AS volume_id, + $7 AS block_size, + $8 AS blocks_per_extent, + $9 AS extent_count + FROM + shuffled_candidate_datasets + ), + proposed_dataset_changes + AS ( + SELECT + candidate_regions.dataset_id AS id, + dataset.pool_id AS pool_id, + candidate_regions.block_size + * candidate_regions.blocks_per_extent + * candidate_regions.extent_count + AS size_used_delta + FROM + candidate_regions INNER JOIN dataset ON dataset.id = candidate_regions.dataset_id + ), + do_insert + AS ( + SELECT + ( + ( + (SELECT count(*) FROM old_regions LIMIT 1) < $10 + AND CAST( + IF( + ((SELECT count(*) FROM candidate_zpools LIMIT 1) >= $11), + 'TRUE', + 'Not enough space' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ((SELECT count(*) FROM candidate_regions LIMIT 1) >= $12), + 'TRUE', + 'Not enough datasets' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ( + ( + SELECT + count(DISTINCT dataset.pool_id) + FROM + candidate_regions + INNER JOIN dataset ON candidate_regions.dataset_id = dataset.id + LIMIT + 1 + ) + >= $13 + ), + 'TRUE', + 'Not enough unique zpools selected' + ) + AS BOOL + ) + AS insert + ), + inserted_regions + AS ( + INSERT + INTO + region + ( + id, + time_created, + time_modified, + dataset_id, + volume_id, + block_size, + blocks_per_extent, + extent_count + ) + SELECT + candidate_regions.id, + candidate_regions.time_created, + candidate_regions.time_modified, + candidate_regions.dataset_id, + candidate_regions.volume_id, + candidate_regions.block_size, + candidate_regions.blocks_per_extent, + candidate_regions.extent_count + FROM + candidate_regions + WHERE + (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + region.id, + region.time_created, + region.time_modified, + region.dataset_id, + region.volume_id, + region.block_size, + region.blocks_per_extent, + region.extent_count + ), + updated_datasets + AS ( + UPDATE + dataset + SET + size_used + = dataset.size_used + + ( + SELECT + proposed_dataset_changes.size_used_delta + FROM + proposed_dataset_changes + WHERE + proposed_dataset_changes.id = dataset.id + LIMIT + 1 + ) + WHERE + dataset.id = ANY (SELECT proposed_dataset_changes.id FROM proposed_dataset_changes) + AND (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used ) - ), - candidate_datasets AS ( - SELECT DISTINCT ON (dataset.pool_id) - dataset.id, - dataset.pool_id - FROM (dataset INNER JOIN candidate_zpools ON (dataset.pool_id = 
candidate_zpools.pool_id)) - WHERE ( - ((dataset.time_deleted IS NULL) AND - (dataset.size_used IS NOT NULL)) AND - (dataset.kind = 'crucible') - ) - ORDER BY dataset.pool_id, md5((CAST(dataset.id as BYTEA) || $3)) - ), - shuffled_candidate_datasets AS ( - SELECT - candidate_datasets.id, - candidate_datasets.pool_id - FROM candidate_datasets - ORDER BY md5((CAST(candidate_datasets.id as BYTEA) || $4)) LIMIT $5 - ), - candidate_regions AS ( - SELECT - gen_random_uuid() AS id, - now() AS time_created, - now() AS time_modified, - shuffled_candidate_datasets.id AS dataset_id, - $6 AS volume_id, - $7 AS block_size, - $8 AS blocks_per_extent, - $9 AS extent_count - FROM shuffled_candidate_datasets - ), - proposed_dataset_changes AS ( - SELECT - candidate_regions.dataset_id AS id, - dataset.pool_id AS pool_id, - ((candidate_regions.block_size * candidate_regions.blocks_per_extent) * candidate_regions.extent_count) AS size_used_delta - FROM (candidate_regions INNER JOIN dataset ON (dataset.id = candidate_regions.dataset_id)) - ), - do_insert AS ( - SELECT ((( - ((SELECT COUNT(*) FROM old_regions LIMIT 1) < $10) AND - CAST(IF(((SELECT COUNT(*) FROM candidate_zpools LIMIT 1) >= $11), 'TRUE', 'Not enough space') AS BOOL)) AND - CAST(IF(((SELECT COUNT(*) FROM candidate_regions LIMIT 1) >= $12), 'TRUE', 'Not enough datasets') AS BOOL)) AND - CAST(IF(((SELECT COUNT(DISTINCT dataset.pool_id) FROM (candidate_regions INNER JOIN dataset ON (candidate_regions.dataset_id = dataset.id)) LIMIT 1) >= $13), 'TRUE', 'Not enough unique zpools selected') AS BOOL) - ) AS insert - ), - inserted_regions AS ( - INSERT INTO region - (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count) - SELECT candidate_regions.id, candidate_regions.time_created, candidate_regions.time_modified, candidate_regions.dataset_id, candidate_regions.volume_id, candidate_regions.block_size, candidate_regions.blocks_per_extent, candidate_regions.extent_count - FROM candidate_regions - WHERE - (SELECT do_insert.insert FROM do_insert LIMIT 1) - RETURNING region.id, region.time_created, region.time_modified, region.dataset_id, region.volume_id, region.block_size, region.blocks_per_extent, region.extent_count - ), - updated_datasets AS ( - UPDATE dataset SET - size_used = (dataset.size_used + (SELECT proposed_dataset_changes.size_used_delta FROM proposed_dataset_changes WHERE (proposed_dataset_changes.id = dataset.id) LIMIT 1)) - WHERE ( - (dataset.id = ANY(SELECT proposed_dataset_changes.id FROM proposed_dataset_changes)) AND - (SELECT do_insert.insert FROM do_insert LIMIT 1)) - RETURNING dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used - ) ( - SELECT dataset.id, dataset.time_created, dataset.time_modified, dataset.time_deleted, dataset.rcgen, dataset.pool_id, dataset.ip, dataset.port, dataset.kind, dataset.size_used, old_regions.id, old_regions.time_created, old_regions.time_modified, old_regions.dataset_id, old_regions.volume_id, old_regions.block_size, old_regions.blocks_per_extent, old_regions.extent_count + SELECT + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used, + old_regions.id, + old_regions.time_created, + old_regions.time_modified, + old_regions.dataset_id, + old_regions.volume_id, + old_regions.block_size, + 
old_regions.blocks_per_extent,
+      old_regions.extent_count
     FROM
-    (old_regions INNER JOIN dataset ON (old_regions.dataset_id = dataset.id))
+      old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id
 )
 UNION
-(
-  SELECT updated_datasets.id, updated_datasets.time_created, updated_datasets.time_modified, updated_datasets.time_deleted, updated_datasets.rcgen, updated_datasets.pool_id, updated_datasets.ip, updated_datasets.port, updated_datasets.kind, updated_datasets.size_used, inserted_regions.id, inserted_regions.time_created, inserted_regions.time_modified, inserted_regions.dataset_id, inserted_regions.volume_id, inserted_regions.block_size, inserted_regions.blocks_per_extent, inserted_regions.extent_count
-  FROM (inserted_regions INNER JOIN updated_datasets ON (inserted_regions.dataset_id = updated_datasets.id))
-)
--- binds: [00000000-0000-0000-0000-000000000000, 16384, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3, 00000000-0000-0000-0000-000000000000, 512, 4, 8, 3, 3, 3, 3]
\ No newline at end of file
+  (
+    SELECT
+      updated_datasets.id,
+      updated_datasets.time_created,
+      updated_datasets.time_modified,
+      updated_datasets.time_deleted,
+      updated_datasets.rcgen,
+      updated_datasets.pool_id,
+      updated_datasets.ip,
+      updated_datasets.port,
+      updated_datasets.kind,
+      updated_datasets.size_used,
+      inserted_regions.id,
+      inserted_regions.time_created,
+      inserted_regions.time_modified,
+      inserted_regions.dataset_id,
+      inserted_regions.volume_id,
+      inserted_regions.block_size,
+      inserted_regions.blocks_per_extent,
+      inserted_regions.extent_count
+    FROM
+      inserted_regions
+      INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id
+  )
diff --git a/test-utils/src/dev/db.rs b/test-utils/src/dev/db.rs
index da6cc90b03..5766402f7f 100644
--- a/test-utils/src/dev/db.rs
+++ b/test-utils/src/dev/db.rs
@@ -21,6 +21,7 @@ use std::time::Duration;
 use tempfile::tempdir;
 use tempfile::TempDir;
 use thiserror::Error;
+use tokio::io::AsyncWriteExt;
 use tokio_postgres::config::Host;
 use tokio_postgres::config::SslMode;

@@ -497,6 +498,15 @@ pub enum CockroachStartError {
     )]
     TimedOut { pid: u32, time_waited: Duration },

+    #[error("failed to write input to cockroachdb")]
+    FailedToWrite(#[source] std::io::Error),
+
+    #[error("failed to await cockroachdb completing")]
+    FailedToWait(#[source] std::io::Error),
+
+    #[error("Invalid cockroachdb output")]
+    InvalidOutput(#[from] std::string::FromUtf8Error),
+
     #[error("unknown error waiting for cockroach to start")]
     Unknown {
         #[source]
@@ -648,6 +658,45 @@ impl Drop for CockroachInstance {
     }
 }

+/// Uses cockroachdb to run the "sqlfmt" command.
+pub async fn format_sql(input: &str) -> Result<String, CockroachStartError> {
+    let mut cmd = tokio::process::Command::new(COCKROACHDB_BIN);
+    let mut child = cmd
+        .stdin(Stdio::piped())
+        .stdout(Stdio::piped())
+        .args(&[
+            "sqlfmt",
+            "--tab-width",
+            "2",
+            "--use-spaces",
+            "true",
+            "--print-width",
+            "100",
+        ])
+        .spawn()
+        .map_err(|source| CockroachStartError::BadCmd {
+            cmd: COCKROACHDB_BIN.to_string(),
+            source,
+        })?;
+    let stdin = child.stdin.as_mut().unwrap();
+    stdin
+        .write_all(input.as_bytes())
+        .await
+        .map_err(CockroachStartError::FailedToWrite)?;
+    let output = child
+        .wait_with_output()
+        .await
+        .map_err(CockroachStartError::FailedToWait)?;
+
+    if !output.status.success() {
+        return Err(CockroachStartError::Exited {
+            exit_code: output.status.code().unwrap_or_else(|| -1),
+        });
+    }
+
+    Ok(String::from_utf8(output.stdout)?)
+}
+
 /// Verify that CockroachDB has the correct version
 pub async fn check_db_version() -> Result<(), CockroachStartError> {
     let mut cmd = tokio::process::Command::new(COCKROACHDB_BIN);

From c77a16760f38ff69364720e43bd42e74ce8899b8 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Tue, 26 Mar 2024 14:25:29 -0700
Subject: [PATCH 14/14] feedback

---
 nexus/db-queries/src/db/queries/region_allocation.rs | 11 ++++++-----
 nexus/db-queries/src/db/raw_query_builder.rs         |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index 776fc6c34b..2e4f4cd776 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -243,11 +243,12 @@ pub fn allocation_query(
     //
     // Selecting two datasets on the same zpool will not initially be
     // possible, as at the time of writing each zpool only has one dataset.
-    // Additionally, we intend to modify the allocation strategy to select
-    // from 3 distinct sleds, removing the possibility entirely. But, if we
-    // introduce a change that adds another crucible dataset to zpools
-    // before we improve the allocation strategy, this check will make sure
-    // we don't violate drive redundancy, and generate an error instead.
+    // Additionally, the "distinct_sleds" configuration option modifies the
+    // allocation strategy to select from 3 distinct sleds, removing the
+    // possibility entirely. But, if we introduce a change that adds another
+    // crucible dataset to zpools before we improve the allocation strategy,
+    // this check will make sure we don't violate drive redundancy, and
+    // generate an error instead.
     .sql("
   do_insert AS (
     SELECT (((
diff --git a/nexus/db-queries/src/db/raw_query_builder.rs b/nexus/db-queries/src/db/raw_query_builder.rs
index 65ff22f86f..5c803e20ac 100644
--- a/nexus/db-queries/src/db/raw_query_builder.rs
+++ b/nexus/db-queries/src/db/raw_query_builder.rs
@@ -159,7 +159,7 @@ impl QueryBuilder {
 }

 /// Diesel's [diesel::query_builder::BoxedSqlQuery] has a few drawbacks that
-/// make wrapper more palatable:
+/// make this wrapper more palatable:
 ///
 /// - It always implements "Query" with SqlType = Untyped, so a caller could try to
 /// execute this query and get back any type.
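
Taken together, the series leaves the region-allocation call site in roughly
the following shape. This is an illustrative sketch assembled from the hunks
above, not a verbatim excerpt: `conn` stands in for the authorized pool
connection, and the `AsyncRunQueryDsl` import is an assumption about where
`get_results_async` comes from.

    use async_bb8_diesel::AsyncRunQueryDsl; // assumed source of get_results_async
    use crate::db::model::{Dataset, Region};
    use crate::db::queries::region_allocation::{allocation_query, from_diesel};

    // The TypedSqlQuery returned by allocation_query() carries SqlType =
    // (SelectableSql<Dataset>, SelectableSql<Region>), so rows decode
    // positionally via Queryable; the duplicate "id" columns produced by
    // the UNION are therefore harmless.
    let dataset_and_regions: Vec<(Dataset, Region)> = allocation_query(
        volume_id,
        block_size,
        blocks_per_extent,
        extent_count,
        allocation_strategy,
    )
    .get_results_async(&*conn)
    .await
    .map_err(from_diesel)?;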