From d5dace65fa093174a7502e0f6a3dde4ccabe6337 Mon Sep 17 00:00:00 2001
From: David Crespo
Date: Fri, 26 Jan 2024 09:54:23 -0600
Subject: [PATCH] Fix IP pools data migration (#4903)

Closes #4875

## Problem

After the IP pools migrations on the dogfood rack, the `default` pool was not
marked `is_default=true` for the `oxide` silo when it should have been.

## Diagnosis

When checking for silo-scoped default pools overriding a fleet-scoped default,
I neglected to require that the silo-scoped defaults in question were
non-deleted. This means that if there was a deleted pool scoped to a silo
(i.e., with `silo_id` set) and `is_default=true`, it would still be treated as
an overriding default for that silo and leave us with `is_default=false` on
the `default` pool.

Well, I can't check `silo_id` and `is_default` on the pools because those
columns have been dropped, but there is a deleted pool called `oxide-default`
whose description says it was meant as the default pool for only the `oxide`
silo.

```
root@[fd00:1122:3344:105::3]:32221/omicron> select * from omicron.public.ip_pool;
                   id                  |     name      |          description           |         time_created         |        time_modified         |         time_deleted          | rcgen
---------------------------------------+---------------+--------------------------------+------------------------------+------------------------------+-------------------------------+--------
  1efa49a2-3f3a-43ab-97ac-d38658069c39 | oxide-default | oxide silo-only pool - default | 2023-08-31 05:33:00.11079+00 | 2023-08-31 05:33:00.11079+00 | 2023-08-31 06:03:22.426488+00 |     1
```

I think we can be pretty confident this is what got us.

## Fix

Add `AND time_deleted IS NULL` to the subquery. (A minimal before-and-after
sketch of the query behavior appears after the diff below.)

## Mitigation in existing systems

Already done. Dogfood is the only long-running system where the bad migration
ran, and all I had to do there was use the API to set `is_default=true` for
the (`default` pool, `oxide` silo) link.

---
 nexus/tests/integration_tests/schema.rs | 90 ++++++++++++-------------
 schema/crdb/23.0.0/up4.sql              |  7 +-
 2 files changed, 47 insertions(+), 50 deletions(-)

diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs
index c3ba02d5ce..2d496fcd8e 100644
--- a/nexus/tests/integration_tests/schema.rs
+++ b/nexus/tests/integration_tests/schema.rs
@@ -951,9 +951,11 @@ const SILO1: Uuid = Uuid::from_u128(0x111151F0_5c3d_4647_83b0_8f3515da7be1);
 const SILO2: Uuid = Uuid::from_u128(0x222251F0_5c3d_4647_83b0_8f3515da7be1);
 
 // "6001" -> "Pool"
+const POOL0: Uuid = Uuid::from_u128(0x00006001_5c3d_4647_83b0_8f3515da7be1);
 const POOL1: Uuid = Uuid::from_u128(0x11116001_5c3d_4647_83b0_8f3515da7be1);
 const POOL2: Uuid = Uuid::from_u128(0x22226001_5c3d_4647_83b0_8f3515da7be1);
 const POOL3: Uuid = Uuid::from_u128(0x33336001_5c3d_4647_83b0_8f3515da7be1);
+const POOL4: Uuid = Uuid::from_u128(0x44446001_5c3d_4647_83b0_8f3515da7be1);
 
 // "513D" -> "Sled"
 const SLED1: Uuid = Uuid::from_u128(0x1111513d_5c3d_4647_83b0_8f3515da7be1);
@@ -975,9 +977,11 @@ fn before_23_0_0(client: &Client) -> BoxFuture<'_, ()> {
         // no corresponding silo.
         client.batch_execute(&format!("INSERT INTO ip_pool (id, name, description, time_created, time_modified, time_deleted, rcgen, silo_id, is_default) VALUES
+          ('{POOL0}', 'pool2', '', now(), now(), now(), 1, '{SILO2}', true),
           ('{POOL1}', 'pool1', '', now(), now(), NULL, 1, '{SILO1}', true),
           ('{POOL2}', 'pool2', '', now(), now(), NULL, 1, '{SILO2}', false),
-          ('{POOL3}', 'pool3', '', now(), now(), NULL, 1, null, true);
+          ('{POOL3}', 'pool3', '', now(), now(), NULL, 1, null, true),
+          ('{POOL4}', 'pool4', '', now(), now(), NULL, 1, null, false);
         ")).await.expect("Failed to create IP Pool");
     })
 }
@@ -992,56 +996,46 @@ fn after_23_0_0(client: &Client) -> BoxFuture<'_, ()> {
             .expect("Failed to query ip pool resource");
         let ip_pool_resources = process_rows(&rows);
 
-        assert_eq!(ip_pool_resources.len(), 4);
+        assert_eq!(ip_pool_resources.len(), 6);
+
+        fn assert_row(
+            row: &Vec<ColumnValue>,
+            ip_pool_id: Uuid,
+            silo_id: Uuid,
+            is_default: bool,
+        ) {
+            let type_silo = SqlEnum::from(("ip_pool_resource_type", "silo"));
+            assert_eq!(
+                row,
+                &vec![
+                    ColumnValue::new("ip_pool_id", ip_pool_id),
+                    ColumnValue::new("resource_type", type_silo),
+                    ColumnValue::new("resource_id", silo_id),
+                    ColumnValue::new("is_default", is_default),
+                ],
+            );
+        }
 
-        let type_silo = SqlEnum::from(("ip_pool_resource_type", "silo"));
+        // pool1 was default on silo1, so it gets an entry in the join table
+        // reflecting that
+        assert_row(&ip_pool_resources[0].values, POOL1, SILO1, true);
 
-        // pool1, which referenced silo1 in the "ip_pool" table, has a newly
-        // created resource.
-        //
-        // The same relationship is true for pool2 / silo2.
-        assert_eq!(
-            ip_pool_resources[0].values,
-            vec![
-                ColumnValue::new("ip_pool_id", POOL1),
-                ColumnValue::new("resource_type", type_silo.clone()),
-                ColumnValue::new("resource_id", SILO1),
-                ColumnValue::new("is_default", true),
-            ],
-        );
-        assert_eq!(
-            ip_pool_resources[1].values,
-            vec![
-                ColumnValue::new("ip_pool_id", POOL2),
-                ColumnValue::new("resource_type", type_silo.clone()),
-                ColumnValue::new("resource_id", SILO2),
-                ColumnValue::new("is_default", false),
-            ],
-        );
+        // pool2 was linked to silo2 but was not its default, so its entry
+        // has is_default=false
+        assert_row(&ip_pool_resources[1].values, POOL2, SILO2, false);
 
-        // pool3 did not previously have a corresponding silo, so now it's associated
-        // with both silos as a new resource in each.
-        //
-        // Additionally, silo1 already had a default pool (pool1), but silo2 did
-        // not have one. As a result, pool3 becomes the new default pool for silo2.
-        assert_eq!(
-            ip_pool_resources[2].values,
-            vec![
-                ColumnValue::new("ip_pool_id", POOL3),
-                ColumnValue::new("resource_type", type_silo.clone()),
-                ColumnValue::new("resource_id", SILO1),
-                ColumnValue::new("is_default", false),
-            ],
-        );
-        assert_eq!(
-            ip_pool_resources[3].values,
-            vec![
-                ColumnValue::new("ip_pool_id", POOL3),
-                ColumnValue::new("resource_type", type_silo.clone()),
-                ColumnValue::new("resource_id", SILO2),
-                ColumnValue::new("is_default", true),
-            ],
-        );
+        // fleet-scoped pools are a little more complicated
+
+        // pool3 was a fleet-level default, so now it's associated with both
+        // silos. silo1 had its own default pool as well (pool1), so pool3
+        // cannot also be default for silo1. silo2 did not have its own default,
+        // so pool3 is default for silo2.
+        assert_row(&ip_pool_resources[2].values, POOL3, SILO1, false);
+        assert_row(&ip_pool_resources[3].values, POOL3, SILO2, true);
+
+        // a fleet-level pool that was not default becomes non-default on all silos
+        assert_row(&ip_pool_resources[4].values, POOL4, SILO1, false);
+        assert_row(&ip_pool_resources[5].values, POOL4, SILO2, false);
     })
 }
 
diff --git a/schema/crdb/23.0.0/up4.sql b/schema/crdb/23.0.0/up4.sql
index 8fb43f9cf1..2235d0aa01 100644
--- a/schema/crdb/23.0.0/up4.sql
+++ b/schema/crdb/23.0.0/up4.sql
@@ -23,8 +23,11 @@ SELECT
   -- AND NOT EXISTS here causes is_default to be false in row 1 if there is a
   -- conflicting silo default pool. row 2 is inserted in up5.
   p.is_default AND NOT EXISTS (
-    SELECT 1 FROM omicron.public.ip_pool
-    WHERE silo_id = s.id AND is_default
+    SELECT 1
+    FROM omicron.public.ip_pool p0
+    WHERE p0.silo_id = s.id
+      AND p0.is_default
+      AND p0.time_deleted IS NULL
   )
 FROM omicron.public.ip_pool AS p
 -- cross join means we are looking at the cartesian product of all fleet-scoped
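
For illustration, here is a minimal, self-contained sketch of the behavior the
Diagnosis and Fix sections describe. It does not use the real omicron schema or
the real up4/up5 migration queries: the `demo_silo` and `demo_ip_pool` tables
are made-up stand-ins trimmed to the relevant columns. It shows how a deleted
silo-scoped default suppresses the fleet default under the old subquery, and
how `AND time_deleted IS NULL` avoids that.

```
-- sketch only: simplified stand-ins for the real silo and ip_pool tables
CREATE TABLE demo_silo (
    id UUID PRIMARY KEY,
    name TEXT
);

CREATE TABLE demo_ip_pool (
    id UUID PRIMARY KEY,
    name TEXT,
    silo_id UUID,             -- NULL means fleet-scoped
    is_default BOOL,
    time_deleted TIMESTAMPTZ  -- NULL means not deleted
);

INSERT INTO demo_silo VALUES (gen_random_uuid(), 'oxide');

-- a live fleet-scoped default and a *deleted* silo-scoped default,
-- mirroring `default` and `oxide-default` on dogfood
INSERT INTO demo_ip_pool VALUES
  (gen_random_uuid(), 'default', NULL, true, NULL),
  (gen_random_uuid(), 'oxide-default',
   (SELECT id FROM demo_silo WHERE name = 'oxide'), true, now());

-- old subquery shape: the deleted silo-scoped default still matches, so the
-- fleet default is linked to the silo with is_default = false
SELECT p.name AS pool, s.name AS silo,
  p.is_default AND NOT EXISTS (
    SELECT 1 FROM demo_ip_pool
    WHERE silo_id = s.id AND is_default
  ) AS is_default
FROM demo_ip_pool AS p
CROSS JOIN demo_silo AS s
WHERE p.silo_id IS NULL AND p.time_deleted IS NULL;

-- fixed subquery shape: deleted pools are ignored, so `default` stays default
SELECT p.name AS pool, s.name AS silo,
  p.is_default AND NOT EXISTS (
    SELECT 1
    FROM demo_ip_pool p0
    WHERE p0.silo_id = s.id
      AND p0.is_default
      AND p0.time_deleted IS NULL
  ) AS is_default
FROM demo_ip_pool AS p
CROSS JOIN demo_silo AS s
WHERE p.silo_id IS NULL AND p.time_deleted IS NULL;
```

Against these two rows, the first query reports `is_default = false` for the
(`default`, `oxide`) pair, which is exactly the dogfood symptom, while the
fixed query reports `true`.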