From d5dace65fa093174a7502e0f6a3dde4ccabe6337 Mon Sep 17 00:00:00 2001
From: David Crespo
Date: Fri, 26 Jan 2024 09:54:23 -0600
Subject: [PATCH] Fix IP pools data migration (#4903)

Closes #4875

## Problem

After the IP pools migrations on the dogfood rack, the `default` pool was not
marked `is_default=true` for the `oxide` silo when it should have been.

## Diagnosis

When checking for silo-scoped default pools overriding a fleet-scoped default,
I neglected to require that the silo-scoped defaults in question were
non-deleted. This means that if there was a deleted pool scoped to a silo
(i.e., with `silo_id` set) and `is_default=true`, it would still be treated as
an overriding default for that silo and leave us with `is_default=false` on
the `default` pool.

Well, I can't check `silo_id` and `is_default` on the pools because those
columns have been dropped, but there is a deleted pool called `oxide-default`
whose description says it was meant as the default pool for only the `oxide`
silo.

```
root@[fd00:1122:3344:105::3]:32221/omicron> select * from omicron.public.ip_pool;
                   id                  |     name      |          description           |         time_created         |        time_modified         |         time_deleted          | rcgen
---------------------------------------+---------------+--------------------------------+------------------------------+------------------------------+-------------------------------+--------
  1efa49a2-3f3a-43ab-97ac-d38658069c39 | oxide-default | oxide silo-only pool - default | 2023-08-31 05:33:00.11079+00 | 2023-08-31 05:33:00.11079+00 | 2023-08-31 06:03:22.426488+00 |     1
```

I think we can be pretty confident this is what got us.

## Fix

Add `AND time_deleted IS NULL` to the subquery. (A minimal before-and-after
sketch of the query behavior appears after the diff below.)

## Mitigation in existing systems

Already done. Dogfood is the only long-running system where the bad migration
ran, and all I had to do there was use the API to set `is_default=true` for
the (`default` pool, `oxide` silo) link.

---
 nexus/tests/integration_tests/schema.rs | 90 ++++++++++++-------------
 schema/crdb/23.0.0/up4.sql              |  7 +-
 2 files changed, 47 insertions(+), 50 deletions(-)

diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs
index c3ba02d5ce..2d496fcd8e 100644
--- a/nexus/tests/integration_tests/schema.rs
+++ b/nexus/tests/integration_tests/schema.rs
@@ -951,9 +951,11 @@ const SILO1: Uuid = Uuid::from_u128(0x111151F0_5c3d_4647_83b0_8f3515da7be1);
 const SILO2: Uuid = Uuid::from_u128(0x222251F0_5c3d_4647_83b0_8f3515da7be1);
 
 // "6001" -> "Pool"
+const POOL0: Uuid = Uuid::from_u128(0x00006001_5c3d_4647_83b0_8f3515da7be1);
 const POOL1: Uuid = Uuid::from_u128(0x11116001_5c3d_4647_83b0_8f3515da7be1);
 const POOL2: Uuid = Uuid::from_u128(0x22226001_5c3d_4647_83b0_8f3515da7be1);
 const POOL3: Uuid = Uuid::from_u128(0x33336001_5c3d_4647_83b0_8f3515da7be1);
+const POOL4: Uuid = Uuid::from_u128(0x44446001_5c3d_4647_83b0_8f3515da7be1);
 
 // "513D" -> "Sled"
 const SLED1: Uuid = Uuid::from_u128(0x1111513d_5c3d_4647_83b0_8f3515da7be1);
@@ -975,9 +977,11 @@ fn before_23_0_0(client: &Client) -> BoxFuture<'_, ()> {
         // no corresponding silo.
         client.batch_execute(&format!("INSERT INTO ip_pool (id, name, description, time_created, time_modified, time_deleted, rcgen, silo_id, is_default) VALUES
+          ('{POOL0}', 'pool2', '', now(), now(), now(), 1, '{SILO2}', true),
           ('{POOL1}', 'pool1', '', now(), now(), NULL, 1, '{SILO1}', true),
           ('{POOL2}', 'pool2', '', now(), now(), NULL, 1, '{SILO2}', false),
-          ('{POOL3}', 'pool3', '', now(), now(), NULL, 1, null, true);
+          ('{POOL3}', 'pool3', '', now(), now(), NULL, 1, null, true),
+          ('{POOL4}', 'pool4', '', now(), now(), NULL, 1, null, false);
         ")).await.expect("Failed to create IP Pool");
     })
 }
@@ -992,56 +996,46 @@ fn after_23_0_0(client: &Client) -> BoxFuture<'_, ()> {
             .expect("Failed to query ip pool resource");
         let ip_pool_resources = process_rows(&rows);
 
-        assert_eq!(ip_pool_resources.len(), 4);
+        assert_eq!(ip_pool_resources.len(), 6);
+
+        fn assert_row(
+            row: &Vec<ColumnValue>,
+            ip_pool_id: Uuid,
+            silo_id: Uuid,
+            is_default: bool,
+        ) {
+            let type_silo = SqlEnum::from(("ip_pool_resource_type", "silo"));
+            assert_eq!(
+                row,
+                &vec![
+                    ColumnValue::new("ip_pool_id", ip_pool_id),
+                    ColumnValue::new("resource_type", type_silo),
+                    ColumnValue::new("resource_id", silo_id),
+                    ColumnValue::new("is_default", is_default),
+                ],
+            );
+        }
 
-        let type_silo = SqlEnum::from(("ip_pool_resource_type", "silo"));
+        // pool1 was default on silo1, so it gets an entry in the join table
+        // reflecting that
+        assert_row(&ip_pool_resources[0].values, POOL1, SILO1, true);
 
-        // pool1, which referenced silo1 in the "ip_pool" table, has a newly
-        // created resource.
-        //
-        // The same relationship is true for pool2 / silo2.
-        assert_eq!(
-            ip_pool_resources[0].values,
-            vec![
-                ColumnValue::new("ip_pool_id", POOL1),
-                ColumnValue::new("resource_type", type_silo.clone()),
-                ColumnValue::new("resource_id", SILO1),
-                ColumnValue::new("is_default", true),
-            ],
-        );
-        assert_eq!(
-            ip_pool_resources[1].values,
-            vec![
-                ColumnValue::new("ip_pool_id", POOL2),
-                ColumnValue::new("resource_type", type_silo.clone()),
-                ColumnValue::new("resource_id", SILO2),
-                ColumnValue::new("is_default", false),
-            ],
-        );
+        // pool2 was linked to silo2 but was not its default, so its entry
+        // has is_default=false
+        assert_row(&ip_pool_resources[1].values, POOL2, SILO2, false);
 
-        // pool3 did not previously have a corresponding silo, so now it's associated
-        // with both silos as a new resource in each.
-        //
-        // Additionally, silo1 already had a default pool (pool1), but silo2 did
-        // not have one. As a result, pool3 becomes the new default pool for silo2.
-        assert_eq!(
-            ip_pool_resources[2].values,
-            vec![
-                ColumnValue::new("ip_pool_id", POOL3),
-                ColumnValue::new("resource_type", type_silo.clone()),
-                ColumnValue::new("resource_id", SILO1),
-                ColumnValue::new("is_default", false),
-            ],
-        );
-        assert_eq!(
-            ip_pool_resources[3].values,
-            vec![
-                ColumnValue::new("ip_pool_id", POOL3),
-                ColumnValue::new("resource_type", type_silo.clone()),
-                ColumnValue::new("resource_id", SILO2),
-                ColumnValue::new("is_default", true),
-            ],
-        );
+        // fleet-scoped pools are a little more complicated
+
+        // pool3 was a fleet-level default, so now it's associated with both
+        // silos. silo1 had its own default pool as well (pool1), so pool3
+        // cannot also be default for silo1. silo2 did not have its own default,
+        // so pool3 is default for silo2.
+        assert_row(&ip_pool_resources[2].values, POOL3, SILO1, false);
+        assert_row(&ip_pool_resources[3].values, POOL3, SILO2, true);
+
+        // a fleet-level pool that was not default becomes non-default on all silos
+        assert_row(&ip_pool_resources[4].values, POOL4, SILO1, false);
+        assert_row(&ip_pool_resources[5].values, POOL4, SILO2, false);
     })
 }
 
diff --git a/schema/crdb/23.0.0/up4.sql b/schema/crdb/23.0.0/up4.sql
index 8fb43f9cf1..2235d0aa01 100644
--- a/schema/crdb/23.0.0/up4.sql
+++ b/schema/crdb/23.0.0/up4.sql
@@ -23,8 +23,11 @@ SELECT
   -- AND NOT EXISTS here causes is_default to be false in row 1 if there is a
   -- conflicting silo default pool. row 2 is inserted in up5.
   p.is_default AND NOT EXISTS (
-    SELECT 1 FROM omicron.public.ip_pool
-    WHERE silo_id = s.id AND is_default
+    SELECT 1
+    FROM omicron.public.ip_pool p0
+    WHERE p0.silo_id = s.id
+      AND p0.is_default
+      AND p0.time_deleted IS NULL
   )
 FROM omicron.public.ip_pool AS p
 -- cross join means we are looking at the cartesian product of all fleet-scoped
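
For illustration, here is a minimal, self-contained sketch of the behavior the
Diagnosis and Fix sections describe. It does not use the real omicron schema or
the real up4/up5 migration queries: the `demo_silo` and `demo_ip_pool` tables
are made-up stand-ins trimmed to the relevant columns. It shows how a deleted
silo-scoped default suppresses the fleet default under the old subquery, and
how `AND time_deleted IS NULL` avoids that.

```
-- sketch only: simplified stand-ins for the real silo and ip_pool tables
CREATE TABLE demo_silo (
    id UUID PRIMARY KEY,
    name TEXT
);

CREATE TABLE demo_ip_pool (
    id UUID PRIMARY KEY,
    name TEXT,
    silo_id UUID,             -- NULL means fleet-scoped
    is_default BOOL,
    time_deleted TIMESTAMPTZ  -- NULL means not deleted
);

INSERT INTO demo_silo VALUES (gen_random_uuid(), 'oxide');

-- a live fleet-scoped default and a *deleted* silo-scoped default,
-- mirroring `default` and `oxide-default` on dogfood
INSERT INTO demo_ip_pool VALUES
  (gen_random_uuid(), 'default', NULL, true, NULL),
  (gen_random_uuid(), 'oxide-default',
   (SELECT id FROM demo_silo WHERE name = 'oxide'), true, now());

-- old subquery shape: the deleted silo-scoped default still matches, so the
-- fleet default is linked to the silo with is_default = false
SELECT p.name AS pool, s.name AS silo,
  p.is_default AND NOT EXISTS (
    SELECT 1 FROM demo_ip_pool
    WHERE silo_id = s.id AND is_default
  ) AS is_default
FROM demo_ip_pool AS p
CROSS JOIN demo_silo AS s
WHERE p.silo_id IS NULL AND p.time_deleted IS NULL;

-- fixed subquery shape: deleted pools are ignored, so `default` stays default
SELECT p.name AS pool, s.name AS silo,
  p.is_default AND NOT EXISTS (
    SELECT 1
    FROM demo_ip_pool p0
    WHERE p0.silo_id = s.id
      AND p0.is_default
      AND p0.time_deleted IS NULL
  ) AS is_default
FROM demo_ip_pool AS p
CROSS JOIN demo_silo AS s
WHERE p.silo_id IS NULL AND p.time_deleted IS NULL;
```

Against these two rows, the first query reports `is_default = false` for the
(`default`, `oxide`) pair, which is exactly the dogfood symptom, while the
fixed query reports `true`.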