From def1ceb1a4150aa3739517e6b1cc16c8adf19b06 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Thu, 22 Aug 2024 00:37:51 +0000 Subject: [PATCH 01/16] index sled agent instances by propolis id, not instance id --- nexus/src/app/sagas/instance_create.rs | 3 +- openapi/sled-agent.json | 603 ++++++++++++------------- sled-agent/api/src/lib.rs | 60 +-- sled-agent/src/http_entrypoints.rs | 71 ++- sled-agent/src/instance.rs | 53 ++- sled-agent/src/instance_manager.rs | 199 ++++---- sled-agent/src/sim/collection.rs | 29 -- sled-agent/src/sim/http_entrypoints.rs | 94 ++-- sled-agent/src/sim/sled_agent.rs | 114 ++--- sled-agent/src/sim/storage.rs | 4 +- sled-agent/src/sled_agent.rs | 30 +- sled-agent/types/src/instance.rs | 8 +- 12 files changed, 593 insertions(+), 675 deletions(-) diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index d19230892f..0b6d8cc0f8 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -1220,8 +1220,7 @@ pub mod test { } async fn no_instances_or_disks_on_sled(sled_agent: &SledAgent) -> bool { - sled_agent.instance_count().await == 0 - && sled_agent.disk_count().await == 0 + sled_agent.vmm_count().await == 0 && sled_agent.disk_count().await == 0 } pub(crate) async fn verify_clean_slate( diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 4c40fb5da0..1ea29e35dc 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -220,36 +220,17 @@ } } }, - "/instances/{instance_id}": { - "put": { - "operationId": "instance_register", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstanceEnsureBody" - } - } - }, - "required": true - }, + "/inventory": { + "get": { + "summary": "Fetch basic information about this sled", + "operationId": "inventory", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledInstanceState" + "$ref": "#/components/schemas/Inventory" } } } @@ -261,26 +242,20 @@ "$ref": "#/components/responses/Error" } } - }, - "delete": { - "operationId": "instance_unregister", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], + } + }, + "/network-bootstore-config": { + "get": { + "summary": "This API endpoint is only reading the local sled agent's view of the", + "description": "bootstore. The boostore is a distributed data store that is eventually consistent. 
Reads from individual nodes may not represent the latest state.", + "operationId": "read_network_bootstore_config_cache", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceUnregisterResponse" + "$ref": "#/components/schemas/EarlyNetworkConfig" } } } @@ -292,52 +267,22 @@ "$ref": "#/components/responses/Error" } } - } - }, - "/instances/{instance_id}/disks/{disk_id}/snapshot": { - "post": { - "summary": "Take a snapshot of a disk that is attached to an instance", - "operationId": "instance_issue_disk_snapshot_request", - "parameters": [ - { - "in": "path", - "name": "disk_id", - "required": true, - "schema": { - "type": "string", - "format": "uuid" - } - }, - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "type": "string", - "format": "uuid" - } - } - ], + }, + "put": { + "operationId": "write_network_bootstore_config", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceIssueDiskSnapshotRequestBody" + "$ref": "#/components/schemas/EarlyNetworkConfig" } } }, "required": true }, "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstanceIssueDiskSnapshotRequestResponse" - } - } - } + "204": { + "description": "resource updated" }, "4XX": { "$ref": "#/components/responses/Error" @@ -348,33 +293,20 @@ } } }, - "/instances/{instance_id}/external-ip": { - "put": { - "operationId": "instance_put_external_ip", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstanceExternalIpBody" + "/omicron-physical-disks": { + "get": { + "operationId": "omicron_physical_disks_get", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OmicronPhysicalDisksConfig" + } } } }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, "4XX": { "$ref": "#/components/responses/Error" }, @@ -383,31 +315,28 @@ } } }, - "delete": { - "operationId": "instance_delete_external_ip", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], + "put": { + "operationId": "omicron_physical_disks_put", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceExternalIpBody" + "$ref": "#/components/schemas/OmicronPhysicalDisksConfig" } } }, "required": true }, "responses": { - "204": { - "description": "resource updated" + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DisksManagementResult" + } + } + } }, "4XX": { "$ref": "#/components/responses/Error" @@ -418,26 +347,16 @@ } } }, - "/instances/{instance_id}/state": { + "/omicron-zones": { "get": { - "operationId": "instance_get_state", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], + "operationId": "omicron_zones_get", "responses": { "200": { "description": "successful operation", 
"content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledInstanceState" + "$ref": "#/components/schemas/OmicronZonesConfig" } } } @@ -451,37 +370,20 @@ } }, "put": { - "operationId": "instance_put_state", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], + "operationId": "omicron_zones_put", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstancePutStateBody" + "$ref": "#/components/schemas/OmicronZonesConfig" } } }, "required": true }, "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstancePutStateResponse" - } - } - } + "204": { + "description": "resource updated" }, "4XX": { "$ref": "#/components/responses/Error" @@ -492,17 +394,17 @@ } } }, - "/inventory": { + "/sled-identifiers": { "get": { - "summary": "Fetch basic information about this sled", - "operationId": "inventory", + "summary": "Fetch sled identifiers", + "operationId": "sled_identifiers", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Inventory" + "$ref": "#/components/schemas/SledIdentifiers" } } } @@ -516,18 +418,16 @@ } } }, - "/network-bootstore-config": { + "/sled-role": { "get": { - "summary": "This API endpoint is only reading the local sled agent's view of the", - "description": "bootstore. The boostore is a distributed data store that is eventually consistent. Reads from individual nodes may not represent the latest state.", - "operationId": "read_network_bootstore_config_cache", + "operationId": "sled_role_get", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EarlyNetworkConfig" + "$ref": "#/components/schemas/SledRole" } } } @@ -539,14 +439,17 @@ "$ref": "#/components/responses/Error" } } - }, + } + }, + "/sleds": { "put": { - "operationId": "write_network_bootstore_config", + "summary": "Add a sled to a rack that was already initialized via RSS", + "operationId": "sled_add", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EarlyNetworkConfig" + "$ref": "#/components/schemas/AddSledRequest" } } }, @@ -565,16 +468,42 @@ } } }, - "/omicron-physical-disks": { + "/switch-ports": { + "post": { + "operationId": "uplink_ensure", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SwitchPorts" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/timesync": { "get": { - "operationId": "omicron_physical_disks_get", + "operationId": "timesync_get", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OmicronPhysicalDisksConfig" + "$ref": "#/components/schemas/TimeSync" } } } @@ -586,29 +515,24 @@ "$ref": "#/components/responses/Error" } } - }, - "put": { - "operationId": "omicron_physical_disks_put", + } + }, + "/update": { + "post": { + "operationId": "update_artifact", "requestBody": { "content": { "application/json": { "schema": { - "$ref": 
"#/components/schemas/OmicronPhysicalDisksConfig" + "$ref": "#/components/schemas/UpdateArtifactId" } } }, "required": true }, "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/DisksManagementResult" - } - } - } + "204": { + "description": "resource updated" }, "4XX": { "$ref": "#/components/responses/Error" @@ -619,16 +543,21 @@ } } }, - "/omicron-zones": { + "/v2p": { "get": { - "operationId": "omicron_zones_get", + "summary": "List v2p mappings present on sled", + "operationId": "list_v2p", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OmicronZonesConfig" + "title": "Array_of_VirtualNetworkInterfaceHost", + "type": "array", + "items": { + "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" + } } } } @@ -642,12 +571,38 @@ } }, "put": { - "operationId": "omicron_zones_put", + "summary": "Create a mapping from a virtual NIC to a physical host", + "operationId": "set_v2p", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "summary": "Delete a mapping from a virtual NIC to a physical host", + "operationId": "del_v2p", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OmicronZonesConfig" + "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" } } }, @@ -666,17 +621,36 @@ } } }, - "/sled-identifiers": { - "get": { - "summary": "Fetch sled identifiers", - "operationId": "sled_identifiers", + "/vmms/{propolis_id}": { + "put": { + "operationId": "vmm_register", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InstanceEnsureBody" + } + } + }, + "required": true + }, "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledIdentifiers" + "$ref": "#/components/schemas/SledInstanceState" } } } @@ -688,18 +662,26 @@ "$ref": "#/components/responses/Error" } } - } - }, - "/sled-role": { - "get": { - "operationId": "sled_role_get", + }, + "delete": { + "operationId": "vmm_unregister", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledRole" + "$ref": "#/components/schemas/InstanceUnregisterResponse" } } } @@ -713,23 +695,49 @@ } } }, - "/sleds": { - "put": { - "summary": "Add a sled to a rack that was already initialized via RSS", - "operationId": "sled_add", + "/vmms/{propolis_id}/disks/{disk_id}/snapshot": { + "post": { + "summary": "Take a snapshot of a disk that is attached to an instance", + "operationId": "vmm_issue_disk_snapshot_request", + "parameters": [ + { + "in": "path", + "name": "disk_id", + "required": true, + "schema": { + "type": "string", + "format": 
"uuid" + } + }, + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/AddSledRequest" + "$ref": "#/components/schemas/VmmIssueDiskSnapshotRequestBody" } } }, "required": true }, "responses": { - "204": { - "description": "resource updated" + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/VmmIssueDiskSnapshotRequestResponse" + } + } + } }, "4XX": { "$ref": "#/components/responses/Error" @@ -740,14 +748,24 @@ } } }, - "/switch-ports": { - "post": { - "operationId": "uplink_ensure", + "/vmms/{propolis_id}/external-ip": { + "put": { + "operationId": "vmm_put_external_ip", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SwitchPorts" + "$ref": "#/components/schemas/InstanceExternalIpBody" } } }, @@ -764,39 +782,24 @@ "$ref": "#/components/responses/Error" } } - } - }, - "/timesync": { - "get": { - "operationId": "timesync_get", - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/TimeSync" - } - } + }, + "delete": { + "operationId": "vmm_delete_external_ip", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" } - } - } - }, - "/update": { - "post": { - "operationId": "update_artifact", + ], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/UpdateArtifactId" + "$ref": "#/components/schemas/InstanceExternalIpBody" } } }, @@ -815,21 +818,26 @@ } } }, - "/v2p": { + "/vmms/{propolis_id}/state": { "get": { - "summary": "List v2p mappings present on sled", - "operationId": "list_v2p", + "operationId": "vmm_get_state", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "title": "Array_of_VirtualNetworkInterfaceHost", - "type": "array", - "items": { - "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" - } + "$ref": "#/components/schemas/SledInstanceState" } } } @@ -843,46 +851,37 @@ } }, "put": { - "summary": "Create a mapping from a virtual NIC to a physical host", - "operationId": "set_v2p", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" - } + "operationId": "vmm_put_state", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" } - }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" } - } - }, - "delete": { - "summary": "Delete a mapping from a virtual NIC to a physical host", - 
"operationId": "del_v2p", + ], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" + "$ref": "#/components/schemas/InstancePutStateBody" } } }, "required": true }, "responses": { - "204": { - "description": "resource updated" + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InstancePutStateResponse" + } + } + } }, "4XX": { "$ref": "#/components/responses/Error" @@ -2837,6 +2836,14 @@ } ] }, + "instance_id": { + "description": "The ID of the instance for which this VMM is being created.", + "allOf": [ + { + "$ref": "#/components/schemas/TypedUuidForInstanceKind" + } + ] + }, "instance_runtime": { "description": "The instance runtime state for the instance being registered.", "allOf": [ @@ -2857,14 +2864,6 @@ "description": "The address at which this VMM should serve a Propolis server API.", "type": "string" }, - "propolis_id": { - "description": "The ID of the VMM being registered. This may not be the active VMM ID in the instance runtime state (e.g. if the new VMM is going to be a migration target).", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, "vmm_runtime": { "description": "The initial VMM runtime state for the VMM being registered.", "allOf": [ @@ -2876,10 +2875,10 @@ }, "required": [ "hardware", + "instance_id", "instance_runtime", "metadata", "propolis_addr", - "propolis_id", "vmm_runtime" ] }, @@ -2985,30 +2984,6 @@ "source_nat" ] }, - "InstanceIssueDiskSnapshotRequestBody": { - "type": "object", - "properties": { - "snapshot_id": { - "type": "string", - "format": "uuid" - } - }, - "required": [ - "snapshot_id" - ] - }, - "InstanceIssueDiskSnapshotRequestResponse": { - "type": "object", - "properties": { - "snapshot_id": { - "type": "string", - "format": "uuid" - } - }, - "required": [ - "snapshot_id" - ] - }, "InstanceMetadata": { "description": "Metadata used to track statistics about an instance.", "type": "object", @@ -4912,6 +4887,10 @@ "sync" ] }, + "TypedUuidForInstanceKind": { + "type": "string", + "format": "uuid" + }, "TypedUuidForPropolisKind": { "type": "string", "format": "uuid" @@ -4996,6 +4975,30 @@ "vni" ] }, + "VmmIssueDiskSnapshotRequestBody": { + "type": "object", + "properties": { + "snapshot_id": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "snapshot_id" + ] + }, + "VmmIssueDiskSnapshotRequestResponse": { + "type": "object", + "properties": { + "snapshot_id": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "snapshot_id" + ] + }, "VmmRuntimeState": { "description": "The dynamic runtime properties of an individual VMM process.", "type": "object", @@ -5408,10 +5411,6 @@ "A", "B" ] - }, - "TypedUuidForInstanceKind": { - "type": "string", - "format": "uuid" } }, "responses": { diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index c44b24d712..5787a0fd41 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -23,7 +23,7 @@ use omicron_common::{ }, disk::{DiskVariant, DisksManagementResult, OmicronPhysicalDisksConfig}, }; -use omicron_uuid_kinds::{InstanceUuid, ZpoolUuid}; +use omicron_uuid_kinds::{PropolisUuid, ZpoolUuid}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_agent_types::{ @@ -212,59 +212,59 @@ pub trait SledAgentApi { #[endpoint { method = PUT, - path = "/instances/{instance_id}", + path = "/vmms/{propolis_id}", }] - async fn instance_register( + async 
fn vmm_register( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result, HttpError>; #[endpoint { method = DELETE, - path = "/instances/{instance_id}", + path = "/vmms/{propolis_id}", }] - async fn instance_unregister( + async fn vmm_unregister( rqctx: RequestContext, - path_params: Path, + path_params: Path, ) -> Result, HttpError>; #[endpoint { method = PUT, - path = "/instances/{instance_id}/state", + path = "/vmms/{propolis_id}/state", }] - async fn instance_put_state( + async fn vmm_put_state( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result, HttpError>; #[endpoint { method = GET, - path = "/instances/{instance_id}/state", + path = "/vmms/{propolis_id}/state", }] - async fn instance_get_state( + async fn vmm_get_state( rqctx: RequestContext, - path_params: Path, + path_params: Path, ) -> Result, HttpError>; #[endpoint { method = PUT, - path = "/instances/{instance_id}/external-ip", + path = "/vmms/{propolis_id}/external-ip", }] - async fn instance_put_external_ip( + async fn vmm_put_external_ip( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result; #[endpoint { method = DELETE, - path = "/instances/{instance_id}/external-ip", + path = "/vmms/{propolis_id}/external-ip", }] - async fn instance_delete_external_ip( + async fn vmm_delete_external_ip( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result; @@ -290,14 +290,14 @@ pub trait SledAgentApi { /// Take a snapshot of a disk that is attached to an instance #[endpoint { method = POST, - path = "/instances/{instance_id}/disks/{disk_id}/snapshot", + path = "/vmms/{propolis_id}/disks/{disk_id}/snapshot", }] - async fn instance_issue_disk_snapshot_request( + async fn vmm_issue_disk_snapshot_request( rqctx: RequestContext, - path_params: Path, - body: TypedBody, + path_params: Path, + body: TypedBody, ) -> Result< - HttpResponseOk, + HttpResponseOk, HttpError, >; @@ -516,8 +516,8 @@ impl From for DiskType { /// Path parameters for Instance requests (sled agent API) #[derive(Deserialize, JsonSchema)] -pub struct InstancePathParam { - pub instance_id: InstanceUuid, +pub struct VmmPathParam { + pub propolis_id: PropolisUuid, } /// Path parameters for Disk requests (sled agent API) @@ -527,18 +527,18 @@ pub struct DiskPathParam { } #[derive(Deserialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestPathParam { - pub instance_id: Uuid, +pub struct VmmIssueDiskSnapshotRequestPathParam { + pub propolis_id: PropolisUuid, pub disk_id: Uuid, } #[derive(Deserialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestBody { +pub struct VmmIssueDiskSnapshotRequestBody { pub snapshot_id: Uuid, } #[derive(Serialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestResponse { +pub struct VmmIssueDiskSnapshotRequestResponse { pub snapshot_id: Uuid, } diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 2bf8067d1c..d5a85fa960 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -30,7 +30,6 @@ use omicron_common::api::internal::shared::{ use omicron_common::disk::{ DiskVariant, DisksManagementResult, M2Slot, OmicronPhysicalDisksConfig, }; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use sled_agent_api::*; use sled_agent_types::boot_disk::{ BootDiskOsWriteStatus, BootDiskPathParams, BootDiskUpdatePathParams, @@ -294,18 +293,18 @@ impl SledAgentApi for SledAgentImpl { 
Ok(HttpResponseUpdatedNoContent()) } - async fn instance_register( + async fn vmm_register( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let propolis_id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); Ok(HttpResponseOk( sa.instance_ensure_registered( - instance_id, - body_args.propolis_id, + body_args.instance_id, + propolis_id, body_args.hardware, body_args.instance_runtime, body_args.vmm_runtime, @@ -316,58 +315,56 @@ impl SledAgentApi for SledAgentImpl { )) } - async fn instance_unregister( + async fn vmm_unregister( rqctx: RequestContext, - path_params: Path, + path_params: Path, ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_ensure_unregistered(instance_id).await?)) + let id = path_params.into_inner().propolis_id; + Ok(HttpResponseOk(sa.instance_ensure_unregistered(id).await?)) } - async fn instance_put_state( + async fn vmm_put_state( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_ensure_state(instance_id, body_args.state).await?, - )) + Ok(HttpResponseOk(sa.instance_ensure_state(id, body_args.state).await?)) } - async fn instance_get_state( + async fn vmm_get_state( rqctx: RequestContext, - path_params: Path, + path_params: Path, ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) + let id = path_params.into_inner().propolis_id; + Ok(HttpResponseOk(sa.instance_get_state(id).await?)) } - async fn instance_put_external_ip( + async fn vmm_put_external_ip( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); - sa.instance_put_external_ip(instance_id, &body_args).await?; + sa.instance_put_external_ip(id, &body_args).await?; Ok(HttpResponseUpdatedNoContent()) } - async fn instance_delete_external_ip( + async fn vmm_delete_external_ip( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); - sa.instance_delete_external_ip(instance_id, &body_args).await?; + sa.instance_delete_external_ip(id, &body_args).await?; Ok(HttpResponseUpdatedNoContent()) } @@ -399,26 +396,24 @@ impl SledAgentApi for SledAgentImpl { Ok(HttpResponseUpdatedNoContent()) } - async fn instance_issue_disk_snapshot_request( + async fn vmm_issue_disk_snapshot_request( rqctx: RequestContext, - path_params: Path, - body: TypedBody, - ) -> Result< - HttpResponseOk, - HttpError, - > { + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> + { let sa = rqctx.context(); let path_params = path_params.into_inner(); let body = body.into_inner(); - sa.instance_issue_disk_snapshot_request( - InstanceUuid::from_untyped_uuid(path_params.instance_id), + 
sa.vmm_issue_disk_snapshot_request( + path_params.propolis_id, path_params.disk_id, body.snapshot_id, ) .await?; - Ok(HttpResponseOk(InstanceIssueDiskSnapshotRequestResponse { + Ok(HttpResponseOk(VmmIssueDiskSnapshotRequestResponse { snapshot_id: body.snapshot_id, })) } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 0bcbc97fd2..cb9cf7e5aa 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -104,11 +104,11 @@ pub enum Error { #[error("Error resolving DNS name: {0}")] ResolveError(#[from] internal_dns::resolver::ResolveError), - #[error("Instance {0} not running!")] - InstanceNotRunning(InstanceUuid), + #[error("Propolis job with ID {0} is registered but not running")] + VmNotRunning(PropolisUuid), - #[error("Instance already registered with Propolis ID {0}")] - InstanceAlreadyRegistered(PropolisUuid), + #[error("Propolis job with ID {0} already registered")] + PropolisAlreadyRegistered(PropolisUuid), #[error("No U.2 devices found")] U2NotFound, @@ -499,15 +499,10 @@ impl InstanceRunner { } /// Yields this instance's ID. - fn id(&self) -> InstanceUuid { + fn instance_id(&self) -> InstanceUuid { InstanceUuid::from_untyped_uuid(self.properties.id) } - /// Yields this instance's Propolis's ID. - fn propolis_id(&self) -> &PropolisUuid { - &self.propolis_id - } - async fn publish_state_to_nexus(&self) { // Retry until Nexus acknowledges that it has applied this state update. // Note that Nexus may receive this call but then fail while reacting @@ -518,13 +513,13 @@ impl InstanceRunner { || async { let state = self.state.sled_instance_state(); info!(self.log, "Publishing instance state update to Nexus"; - "instance_id" => %self.id(), + "instance_id" => %self.instance_id(), "state" => ?state, ); self.nexus_client .cpapi_instances_put( - &self.id().into_untyped_uuid(), + &self.instance_id().into_untyped_uuid(), &state.into(), ) .await @@ -576,7 +571,7 @@ impl InstanceRunner { warn!(self.log, "Failed to publish instance state to Nexus: {}", err.to_string(); - "instance_id" => %self.id(), + "instance_id" => %self.instance_id(), "retry_after" => ?delay); }, ) @@ -586,7 +581,7 @@ impl InstanceRunner { error!( self.log, "Failed to publish state to Nexus, will not retry: {:?}", e; - "instance_id" => %self.id() + "instance_id" => %self.instance_id() ); } } @@ -634,7 +629,7 @@ impl InstanceRunner { match action { Some(InstanceAction::Destroy) => { info!(self.log, "terminating VMM that has exited"; - "instance_id" => %self.id()); + "instance_id" => %self.instance_id()); let mark_failed = false; self.terminate(mark_failed).await; Reaction::Terminate @@ -780,7 +775,7 @@ impl InstanceRunner { /// This routine is safe to call even if the instance's zone was never /// started. It is also safe to call multiple times on a single instance. async fn terminate_inner(&mut self) { - let zname = propolis_zone_name(self.propolis_id()); + let zname = propolis_zone_name(&self.propolis_id); // First fetch the running state. // @@ -948,8 +943,10 @@ impl InstanceRunner { } } -/// A reference to a single instance running a running Propolis server. +/// Describes a single Propolis server that incarnates a specific instance. 
pub struct Instance { + id: InstanceUuid, + tx: mpsc::Sender, #[allow(dead_code)] @@ -1104,7 +1101,11 @@ impl Instance { let runner_handle = tokio::task::spawn(async move { runner.run().await }); - Ok(Instance { tx, runner_handle }) + Ok(Instance { id, tx, runner_handle }) + } + + pub fn id(&self) -> InstanceUuid { + self.id } /// Create bundle from an instance zone. @@ -1224,7 +1225,7 @@ impl InstanceRunner { async fn request_zone_bundle( &self, ) -> Result { - let name = propolis_zone_name(self.propolis_id()); + let name = propolis_zone_name(&self.propolis_id); match &self.running_state { None => Err(BundleError::Unavailable { name }), Some(RunningState { ref running_zone, .. }) => { @@ -1330,7 +1331,7 @@ impl InstanceRunner { } InstanceStateRequested::Reboot => { if self.running_state.is_none() { - return Err(Error::InstanceNotRunning(self.id())); + return Err(Error::VmNotRunning(self.propolis_id)); } ( Some(PropolisRequest::Reboot), @@ -1379,7 +1380,7 @@ impl InstanceRunner { // Create a zone for the propolis instance, using the previously // configured VNICs. - let zname = propolis_zone_name(self.propolis_id()); + let zname = propolis_zone_name(&self.propolis_id); let mut rng = rand::rngs::StdRng::from_entropy(); let latest_disks = self .storage @@ -1399,7 +1400,7 @@ impl InstanceRunner { .with_zone_root_path(root) .with_zone_image_paths(&["/opt/oxide".into()]) .with_zone_type("propolis-server") - .with_unique_name(self.propolis_id().into_untyped_uuid()) + .with_unique_name(self.propolis_id.into_untyped_uuid()) .with_datasets(&[]) .with_filesystems(&[]) .with_data_links(&[]) @@ -1508,9 +1509,7 @@ impl InstanceRunner { Ok(()) } else { - Err(Error::InstanceNotRunning(InstanceUuid::from_untyped_uuid( - self.properties.id, - ))) + Err(Error::VmNotRunning(self.propolis_id)) } } @@ -1760,7 +1759,7 @@ mod tests { let id = InstanceUuid::new_v4(); let propolis_id = PropolisUuid::from_untyped_uuid(PROPOLIS_ID); - let ticket = InstanceTicket::new_without_manager_for_test(id); + let ticket = InstanceTicket::new_without_manager_for_test(propolis_id); let initial_state = fake_instance_initial_state(propolis_addr); @@ -2256,7 +2255,7 @@ mod tests { .await .unwrap(); - mgr.ensure_state(instance_id, InstanceStateRequested::Running) + mgr.ensure_state(propolis_id, InstanceStateRequested::Running) .await .unwrap(); diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 63164ed290..19c4562910 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -44,8 +44,8 @@ pub enum Error { #[error("Instance error: {0}")] Instance(#[from] crate::instance::Error), - #[error("No such instance ID: {0}")] - NoSuchInstance(InstanceUuid), + #[error("Instance with ID {0} not found")] + NoSuchInstance(PropolisUuid), #[error("OPTE port management error: {0}")] Opte(#[from] illumos_utils::opte::Error), @@ -117,7 +117,7 @@ impl InstanceManager { terminate_tx, terminate_rx, nexus_client, - instances: BTreeMap::new(), + jobs: BTreeMap::new(), vnic_allocator: VnicAllocator::new("Instance", etherstub), port_manager, storage_generation: None, @@ -172,13 +172,13 @@ impl InstanceManager { pub async fn ensure_unregistered( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) -> Result { let (tx, rx) = oneshot::channel(); self.inner .tx .send(InstanceManagerRequest::EnsureUnregistered { - instance_id, + propolis_id, tx, }) .await @@ -188,14 +188,14 @@ impl InstanceManager { pub async fn ensure_state( &self, - instance_id: InstanceUuid, + propolis_id: 
PropolisUuid, target: InstanceStateRequested, ) -> Result { let (tx, rx) = oneshot::channel(); self.inner .tx .send(InstanceManagerRequest::EnsureState { - instance_id, + propolis_id, target, tx, }) @@ -220,17 +220,17 @@ impl InstanceManager { } } - pub async fn instance_issue_disk_snapshot_request( + pub async fn issue_disk_snapshot_request( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, disk_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx - .send(InstanceManagerRequest::InstanceIssueDiskSnapshot { - instance_id, + .send(InstanceManagerRequest::IssueDiskSnapshot { + propolis_id, disk_id, snapshot_id, tx, @@ -259,14 +259,14 @@ impl InstanceManager { pub async fn add_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ip: &InstanceExternalIpBody, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx - .send(InstanceManagerRequest::InstanceAddExternalIp { - instance_id, + .send(InstanceManagerRequest::AddExternalIp { + propolis_id, ip: *ip, tx, }) @@ -277,14 +277,14 @@ impl InstanceManager { pub async fn delete_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ip: &InstanceExternalIpBody, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx - .send(InstanceManagerRequest::InstanceDeleteExternalIp { - instance_id, + .send(InstanceManagerRequest::DeleteExternalIp { + propolis_id, ip: *ip, tx, }) @@ -300,12 +300,12 @@ impl InstanceManager { pub async fn get_instance_state( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) -> Result { let (tx, rx) = oneshot::channel(); self.inner .tx - .send(InstanceManagerRequest::GetState { instance_id, tx }) + .send(InstanceManagerRequest::GetState { propolis_id, tx }) .await .map_err(|_| Error::FailedSendInstanceManagerClosed)?; rx.await? @@ -354,17 +354,17 @@ enum InstanceManagerRequest { tx: oneshot::Sender>, }, EnsureUnregistered { - instance_id: InstanceUuid, + propolis_id: PropolisUuid, tx: oneshot::Sender>, }, EnsureState { - instance_id: InstanceUuid, + propolis_id: PropolisUuid, target: InstanceStateRequested, tx: oneshot::Sender>, }, - InstanceIssueDiskSnapshot { - instance_id: InstanceUuid, + IssueDiskSnapshot { + propolis_id: PropolisUuid, disk_id: Uuid, snapshot_id: Uuid, tx: oneshot::Sender>, @@ -373,18 +373,18 @@ enum InstanceManagerRequest { name: String, tx: oneshot::Sender>, }, - InstanceAddExternalIp { - instance_id: InstanceUuid, + AddExternalIp { + propolis_id: PropolisUuid, ip: InstanceExternalIpBody, tx: oneshot::Sender>, }, - InstanceDeleteExternalIp { - instance_id: InstanceUuid, + DeleteExternalIp { + propolis_id: PropolisUuid, ip: InstanceExternalIpBody, tx: oneshot::Sender>, }, GetState { - instance_id: InstanceUuid, + propolis_id: PropolisUuid, tx: oneshot::Sender>, }, OnlyUseDisks { @@ -396,7 +396,7 @@ enum InstanceManagerRequest { // Requests that the instance manager stop processing information about a // particular instance. struct InstanceDeregisterRequest { - id: InstanceUuid, + id: PropolisUuid, } struct InstanceManagerRunner { @@ -422,8 +422,8 @@ struct InstanceManagerRunner { // TODO: If we held an object representing an enum of "Created OR Running" // instance, we could avoid the methods within "instance.rs" that panic // if the Propolis client hasn't been initialized. - /// A mapping from a Sled Agent "Instance ID" to ("Propolis ID", [Instance]). 
- instances: BTreeMap, + /// A mapping from a Propolis ID to the [Instance] that Propolis incarnates. + jobs: BTreeMap, vnic_allocator: VnicAllocator, port_manager: PortManager, @@ -451,7 +451,7 @@ impl InstanceManagerRunner { request = self.terminate_rx.recv() => { match request { Some(request) => { - self.instances.remove(&request.id); + self.jobs.remove(&request.id); }, None => { warn!(self.log, "InstanceManager's 'instance terminate' channel closed; shutting down"); @@ -484,31 +484,31 @@ impl InstanceManagerRunner { metadata ).await).map_err(|_| Error::FailedSendClientClosed) }, - Some(EnsureUnregistered { instance_id, tx }) => { - self.ensure_unregistered(tx, instance_id).await + Some(EnsureUnregistered { propolis_id, tx }) => { + self.ensure_unregistered(tx, propolis_id).await }, - Some(EnsureState { instance_id, target, tx }) => { - self.ensure_state(tx, instance_id, target).await + Some(EnsureState { propolis_id, target, tx }) => { + self.ensure_state(tx, propolis_id, target).await }, - Some(InstanceIssueDiskSnapshot { instance_id, disk_id, snapshot_id, tx }) => { - self.instance_issue_disk_snapshot_request(tx, instance_id, disk_id, snapshot_id).await + Some(IssueDiskSnapshot { propolis_id, disk_id, snapshot_id, tx }) => { + self.issue_disk_snapshot_request(tx, propolis_id, disk_id, snapshot_id).await }, Some(CreateZoneBundle { name, tx }) => { self.create_zone_bundle(tx, &name).await.map_err(Error::from) }, - Some(InstanceAddExternalIp { instance_id, ip, tx }) => { - self.add_external_ip(tx, instance_id, &ip).await + Some(AddExternalIp { propolis_id, ip, tx }) => { + self.add_external_ip(tx, propolis_id, &ip).await }, - Some(InstanceDeleteExternalIp { instance_id, ip, tx }) => { - self.delete_external_ip(tx, instance_id, &ip).await + Some(DeleteExternalIp { propolis_id, ip, tx }) => { + self.delete_external_ip(tx, propolis_id, &ip).await }, - Some(GetState { instance_id, tx }) => { + Some(GetState { propolis_id, tx }) => { // TODO(eliza): it could potentially be nice to // refactor this to use `tokio::sync::watch`, rather // than having to force `GetState` requests to // serialize with the requests that actually update // the state... 
- self.get_instance_state(tx, instance_id).await + self.get_instance_state(tx, propolis_id).await }, Some(OnlyUseDisks { disks, tx } ) => { self.use_only_these_disks(disks).await; @@ -533,8 +533,8 @@ impl InstanceManagerRunner { } } - fn get_instance(&self, instance_id: InstanceUuid) -> Option<&Instance> { - self.instances.get(&instance_id).map(|(_id, v)| v) + fn get_propolis(&self, propolis_id: PropolisUuid) -> Option<&Instance> { + self.jobs.get(&propolis_id) } /// Ensures that the instance manager contains a registered instance with @@ -579,17 +579,16 @@ impl InstanceManagerRunner { ); let instance = { - if let Some((existing_propolis_id, existing_instance)) = - self.instances.get(&instance_id) - { - if propolis_id != *existing_propolis_id { + if let Some(existing_instance) = self.jobs.get(&propolis_id) { + if instance_id != existing_instance.id() { info!(&self.log, - "instance already registered with another Propolis ID"; - "instance_id" => %instance_id, - "existing_propolis_id" => %*existing_propolis_id); + "Propolis ID already used by another instance"; + "propolis_id" => %propolis_id, + "existing_instanceId" => %existing_instance.id()); + return Err(Error::Instance( - crate::instance::Error::InstanceAlreadyRegistered( - *existing_propolis_id, + crate::instance::Error::PropolisAlreadyRegistered( + propolis_id, ), )); } else { @@ -602,11 +601,14 @@ impl InstanceManagerRunner { } else { info!(&self.log, "registering new instance"; - "instance_id" => ?instance_id); - let instance_log = - self.log.new(o!("instance_id" => format!("{instance_id}"))); + "instance_id" => %instance_id, + "propolis_id" => %propolis_id); + let instance_log = self.log.new(o!( + "instance_id" => format!("{instance_id}"), + "propolis_id" => format!("{propolis_id}") + )); let ticket = - InstanceTicket::new(instance_id, self.terminate_tx.clone()); + InstanceTicket::new(propolis_id, self.terminate_tx.clone()); let services = InstanceManagerServices { nexus_client: self.nexus_client.clone(), @@ -635,26 +637,25 @@ impl InstanceManagerRunner { sled_identifiers, metadata, )?; - let _old = - self.instances.insert(instance_id, (propolis_id, instance)); + let _old = self.jobs.insert(propolis_id, instance); assert!(_old.is_none()); - &self.instances.get(&instance_id).unwrap().1 + &self.jobs.get(&propolis_id).unwrap() } }; Ok(instance.current_state().await?) } - /// Idempotently ensures the instance is not registered with this instance - /// manager. If the instance exists and has a running Propolis, that - /// Propolis is rudely terminated. + /// Idempotently ensures this VM is not registered with this instance + /// manager. If this Propolis job is registered and has a running zone, the + /// zone is rudely terminated. async fn ensure_unregistered( &mut self, tx: oneshot::Sender>, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) -> Result<(), Error> { // If the instance does not exist, we response immediately. - let Some(instance) = self.get_instance(instance_id) else { + let Some(instance) = self.get_propolis(propolis_id) else { tx.send(Ok(InstanceUnregisterResponse { updated_runtime: None })) .map_err(|_| Error::FailedSendClientClosed)?; return Ok(()); @@ -667,15 +668,15 @@ impl InstanceManagerRunner { Ok(()) } - /// Idempotently attempts to drive the supplied instance into the supplied + /// Idempotently attempts to drive the supplied Propolis into the supplied /// runtime state. 
async fn ensure_state( &mut self, tx: oneshot::Sender>, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, target: InstanceStateRequested, ) -> Result<(), Error> { - let Some(instance) = self.get_instance(instance_id) else { + let Some(instance) = self.get_propolis(propolis_id) else { match target { // If the instance isn't registered, then by definition it // isn't running here. Allow requests to stop or destroy the @@ -692,7 +693,7 @@ impl InstanceManagerRunner { .map_err(|_| Error::FailedSendClientClosed)?; } _ => { - tx.send(Err(Error::NoSuchInstance(instance_id))) + tx.send(Err(Error::NoSuchInstance(propolis_id))) .map_err(|_| Error::FailedSendClientClosed)?; } } @@ -702,20 +703,17 @@ impl InstanceManagerRunner { Ok(()) } - async fn instance_issue_disk_snapshot_request( + async fn issue_disk_snapshot_request( &self, tx: oneshot::Sender>, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, disk_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { - let instance = { - let (_, instance) = self - .instances - .get(&instance_id) - .ok_or(Error::NoSuchInstance(instance_id))?; - instance - }; + let instance = self + .jobs + .get(&propolis_id) + .ok_or(Error::NoSuchInstance(propolis_id))?; instance .issue_snapshot_request(tx, disk_id, snapshot_id) @@ -729,10 +727,11 @@ impl InstanceManagerRunner { tx: oneshot::Sender>, name: &str, ) -> Result<(), BundleError> { - let Some((_propolis_id, instance)) = - self.instances.values().find(|(propolis_id, _instance)| { - name == propolis_zone_name(propolis_id) - }) + let Some(instance) = self + .jobs + .iter() + .find(|(propolis_id, _)| name == propolis_zone_name(propolis_id)) + .map(|entry| entry.1) else { return Err(BundleError::NoSuchZone { name: name.to_string() }); }; @@ -742,11 +741,11 @@ impl InstanceManagerRunner { async fn add_external_ip( &self, tx: oneshot::Sender>, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ip: &InstanceExternalIpBody, ) -> Result<(), Error> { - let Some(instance) = self.get_instance(instance_id) else { - return Err(Error::NoSuchInstance(instance_id)); + let Some(instance) = self.get_propolis(propolis_id) else { + return Err(Error::NoSuchInstance(propolis_id)); }; instance.add_external_ip(tx, ip).await?; Ok(()) @@ -755,11 +754,11 @@ impl InstanceManagerRunner { async fn delete_external_ip( &self, tx: oneshot::Sender>, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ip: &InstanceExternalIpBody, ) -> Result<(), Error> { - let Some(instance) = self.get_instance(instance_id) else { - return Err(Error::NoSuchInstance(instance_id)); + let Some(instance) = self.get_propolis(propolis_id) else { + return Err(Error::NoSuchInstance(propolis_id)); }; instance.delete_external_ip(tx, ip).await?; @@ -769,11 +768,11 @@ impl InstanceManagerRunner { async fn get_instance_state( &self, tx: oneshot::Sender>, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) -> Result<(), Error> { - let Some(instance) = self.get_instance(instance_id) else { + let Some(instance) = self.get_propolis(propolis_id) else { return tx - .send(Err(Error::NoSuchInstance(instance_id))) + .send(Err(Error::NoSuchInstance(propolis_id))) .map_err(|_| Error::FailedSendClientClosed); }; @@ -801,7 +800,7 @@ impl InstanceManagerRunner { let u2_set: HashSet<_> = disks.all_u2_zpools().into_iter().collect(); let mut to_remove = vec![]; - for (id, (_, instance)) in self.instances.iter() { + for (id, instance) in self.jobs.iter() { // If we can read the filesystem pool, consider it. 
Otherwise, move // on, to prevent blocking the cleanup of other instances. let Ok(Some(filesystem_pool)) = @@ -817,7 +816,7 @@ impl InstanceManagerRunner { for id in to_remove { info!(self.log, "use_only_these_disks: Removing instance"; "instance_id" => ?id); - if let Some((_, instance)) = self.instances.remove(&id) { + if let Some(instance) = self.jobs.remove(&id) { let (tx, rx) = oneshot::channel(); let mark_failed = true; if let Err(e) = instance.terminate(tx, mark_failed).await { @@ -835,22 +834,22 @@ impl InstanceManagerRunner { /// Represents membership of an instance in the [`InstanceManager`]. pub struct InstanceTicket { - id: InstanceUuid, + id: PropolisUuid, terminate_tx: Option>, } impl InstanceTicket { - // Creates a new instance ticket for instance "id" to be removed - // from the manger on destruction. + // Creates a new instance ticket for the Propolis job with the supplied `id` + // to be removed from the manager on destruction. fn new( - id: InstanceUuid, + id: PropolisUuid, terminate_tx: mpsc::UnboundedSender, ) -> Self { InstanceTicket { id, terminate_tx: Some(terminate_tx) } } #[cfg(all(test, target_os = "illumos"))] - pub(crate) fn new_without_manager_for_test(id: InstanceUuid) -> Self { + pub(crate) fn new_without_manager_for_test(id: PropolisUuid) -> Self { Self { id, terminate_tx: None } } diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index 6057d03f70..aaa2092ceb 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -364,35 +364,6 @@ impl SimCollection { pub async fn contains_key(self: &Arc, id: &Uuid) -> bool { self.objects.lock().await.contains_key(id) } - - /// Iterates over all of the existing objects in the collection and, for any - /// that meet `condition`, asks to transition them into the supplied target - /// state. - /// - /// If any such transition fails, this routine short-circuits and does not - /// attempt to transition any other objects. - // - // TODO: It's likely more idiomatic to have an `iter_mut` routine that - // returns a struct that impls Iterator and yields &mut S references. The - // tricky bit is that the struct must hold the objects lock during the - // iteration. Figure out if there's a better way to arrange all this. 
- pub async fn sim_ensure_for_each_where( - self: &Arc, - condition: C, - target: &S::RequestedState, - ) -> Result<(), Error> - where - C: Fn(&S) -> bool, - { - let mut objects = self.objects.lock().await; - for o in objects.values_mut() { - if condition(&o.object) { - o.transition(target.clone())?; - } - } - - Ok(()) - } } impl SimCollection { diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index e93bebad98..8a2ea3a65e 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -32,7 +32,6 @@ use omicron_common::api::internal::shared::{ }; use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use sled_agent_api::*; use sled_agent_types::boot_disk::BootDiskOsWriteStatus; use sled_agent_types::boot_disk::BootDiskPathParams; @@ -83,18 +82,18 @@ enum SledAgentSimImpl {} impl SledAgentApi for SledAgentSimImpl { type Context = Arc; - async fn instance_register( + async fn vmm_register( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let propolis_id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); Ok(HttpResponseOk( sa.instance_register( - instance_id, - body_args.propolis_id, + body_args.instance_id, + propolis_id, body_args.hardware, body_args.instance_runtime, body_args.vmm_runtime, @@ -104,58 +103,56 @@ impl SledAgentApi for SledAgentSimImpl { )) } - async fn instance_unregister( + async fn vmm_unregister( rqctx: RequestContext, - path_params: Path, + path_params: Path, ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_unregister(instance_id).await?)) + let id = path_params.into_inner().propolis_id; + Ok(HttpResponseOk(sa.instance_unregister(id).await?)) } - async fn instance_put_state( + async fn vmm_put_state( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_ensure_state(instance_id, body_args.state).await?, - )) + Ok(HttpResponseOk(sa.instance_ensure_state(id, body_args.state).await?)) } - async fn instance_get_state( + async fn vmm_get_state( rqctx: RequestContext, - path_params: Path, + path_params: Path, ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) + let id = path_params.into_inner().propolis_id; + Ok(HttpResponseOk(sa.instance_get_state(id).await?)) } - async fn instance_put_external_ip( + async fn vmm_put_external_ip( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); - sa.instance_put_external_ip(instance_id, &body_args).await?; + sa.instance_put_external_ip(id, &body_args).await?; Ok(HttpResponseUpdatedNoContent()) } - async fn instance_delete_external_ip( + async fn vmm_delete_external_ip( rqctx: RequestContext, - path_params: Path, + 
path_params: Path, body: TypedBody, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); - sa.instance_delete_external_ip(instance_id, &body_args).await?; + sa.instance_delete_external_ip(id, &body_args).await?; Ok(HttpResponseUpdatedNoContent()) } @@ -192,27 +189,25 @@ impl SledAgentApi for SledAgentSimImpl { Ok(HttpResponseUpdatedNoContent()) } - async fn instance_issue_disk_snapshot_request( + async fn vmm_issue_disk_snapshot_request( rqctx: RequestContext, - path_params: Path, - body: TypedBody, - ) -> Result< - HttpResponseOk, - HttpError, - > { + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> + { let sa = rqctx.context(); let path_params = path_params.into_inner(); let body = body.into_inner(); sa.instance_issue_disk_snapshot_request( - InstanceUuid::from_untyped_uuid(path_params.instance_id), + path_params.propolis_id, path_params.disk_id, body.snapshot_id, ) .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; - Ok(HttpResponseOk(InstanceIssueDiskSnapshotRequestResponse { + Ok(HttpResponseOk(VmmIssueDiskSnapshotRequestResponse { snapshot_id: body.snapshot_id, })) } @@ -512,45 +507,44 @@ fn method_unimplemented() -> Result { #[endpoint { method = POST, - path = "/instances/{instance_id}/poke", + path = "/vmms/{propolis_id}/poke", }] async fn instance_poke_post( rqctx: RequestContext>, - path_params: Path, + path_params: Path, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - sa.instance_poke(instance_id, PokeMode::Drain).await; + let id = path_params.into_inner().propolis_id; + sa.vmm_poke(id, PokeMode::Drain).await; Ok(HttpResponseUpdatedNoContent()) } #[endpoint { method = POST, - path = "/instances/{instance_id}/poke-single-step", + path = "/vmms/{propolis_id}/poke-single-step", }] async fn instance_poke_single_step_post( rqctx: RequestContext>, - path_params: Path, + path_params: Path, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - sa.instance_poke(instance_id, PokeMode::SingleStep).await; + let id = path_params.into_inner().propolis_id; + sa.vmm_poke(id, PokeMode::SingleStep).await; Ok(HttpResponseUpdatedNoContent()) } #[endpoint { method = POST, - path = "/instances/{instance_id}/sim-migration-source", + path = "/vmms/{propolis_id}/sim-migration-source", }] async fn instance_post_sim_migration_source( rqctx: RequestContext>, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - sa.instance_simulate_migration_source(instance_id, body.into_inner()) - .await?; + let id = path_params.into_inner().propolis_id; + sa.instance_simulate_migration_source(id, body.into_inner()).await?; Ok(HttpResponseUpdatedNoContent()) } diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 10536c8c80..5b804d3da5 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -71,8 +71,8 @@ use uuid::Uuid; pub struct SledAgent { pub id: Uuid, pub ip: IpAddr, - /// collection of simulated instances, indexed by instance uuid - instances: Arc>, + /// collection of simulated VMMs, indexed by Propolis uuid + vmms: Arc>, /// collection of simulated disks, indexed by disk uuid disks: Arc>, storage: Mutex, @@ -170,7 +170,7 @@ impl SledAgent { Arc::new(SledAgent { id, ip: 
config.dropshot.bind_address.ip(), - instances: Arc::new(SimCollection::new( + vmms: Arc::new(SimCollection::new( Arc::clone(&nexus_client), instance_log, sim_mode, @@ -317,11 +317,7 @@ impl SledAgent { // point to the correct address. let mock_lock = self.mock_propolis.lock().await; if let Some((_srv, client)) = mock_lock.as_ref() { - if !self - .instances - .contains_key(&instance_id.into_untyped_uuid()) - .await - { + if !self.vmms.contains_key(&instance_id.into_untyped_uuid()).await { let metadata = propolis_client::types::InstanceMetadata { project_id: metadata.project_id, silo_id: metadata.silo_id, @@ -379,9 +375,9 @@ impl SledAgent { }); let instance_run_time_state = self - .instances + .vmms .sim_ensure( - &instance_id.into_untyped_uuid(), + &propolis_id.into_untyped_uuid(), SledInstanceState { vmm_state: vmm_runtime, propolis_id, @@ -417,11 +413,11 @@ impl SledAgent { /// not notified. pub async fn instance_unregister( self: &Arc, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) -> Result { let instance = match self - .instances - .sim_get_cloned_object(&instance_id.into_untyped_uuid()) + .vmms + .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) .await { Ok(instance) => instance, @@ -431,19 +427,18 @@ impl SledAgent { Err(e) => return Err(e), }; - self.detach_disks_from_instance(instance_id).await?; let response = InstanceUnregisterResponse { updated_runtime: Some(instance.terminate()), }; - self.instances.sim_force_remove(instance_id.into_untyped_uuid()).await; + self.vmms.sim_force_remove(propolis_id.into_untyped_uuid()).await; Ok(response) } /// Asks the supplied instance to transition to the requested state. pub async fn instance_ensure_state( self: &Arc, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, state: InstanceStateRequested, ) -> Result { if let Some(e) = self.instance_ensure_state_error.lock().await.as_ref() @@ -452,8 +447,8 @@ impl SledAgent { } let current = match self - .instances - .sim_get_cloned_object(&instance_id.into_untyped_uuid()) + .vmms + .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) .await { Ok(i) => i.current().clone(), @@ -465,8 +460,8 @@ impl SledAgent { } _ => { return Err(Error::invalid_request(&format!( - "instance {} not registered on sled", - instance_id, + "Propolis {} not registered on sled", + propolis_id, ))); } }, @@ -481,15 +476,15 @@ impl SledAgent { )); } InstanceStateRequested::Running => { - let instances = self.instances.clone(); + let vmms = self.vmms.clone(); let log = self.log.new( o!("component" => "SledAgent-insure_instance_state"), ); tokio::spawn(async move { tokio::time::sleep(Duration::from_secs(10)).await; - match instances + match vmms .sim_ensure( - &instance_id.into_untyped_uuid(), + &propolis_id.into_untyped_uuid(), current, Some(state), ) @@ -521,30 +516,24 @@ impl SledAgent { } let new_state = self - .instances - .sim_ensure(&instance_id.into_untyped_uuid(), current, Some(state)) + .vmms + .sim_ensure(&propolis_id.into_untyped_uuid(), current, Some(state)) .await?; - // If this request will shut down the simulated instance, look for any - // disks that are attached to it and drive them to the Detached state. 
- if matches!(state, InstanceStateRequested::Stopped) { - self.detach_disks_from_instance(instance_id).await?; - } - Ok(InstancePutStateResponse { updated_runtime: Some(new_state) }) } pub async fn instance_get_state( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) -> Result { let instance = self - .instances - .sim_get_cloned_object(&instance_id.into_untyped_uuid()) + .vmms + .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) .await .map_err(|_| { crate::sled_agent::Error::Instance( - crate::instance_manager::Error::NoSuchInstance(instance_id), + crate::instance_manager::Error::NoSuchInstance(propolis_id), ) })?; Ok(instance.current()) @@ -552,16 +541,16 @@ impl SledAgent { pub async fn instance_simulate_migration_source( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, migration: instance::SimulateMigrationSource, ) -> Result<(), HttpError> { let instance = self - .instances - .sim_get_cloned_object(&instance_id.into_untyped_uuid()) + .vmms + .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) .await .map_err(|_| { crate::sled_agent::Error::Instance( - crate::instance_manager::Error::NoSuchInstance(instance_id), + crate::instance_manager::Error::NoSuchInstance(propolis_id), ) })?; instance.set_simulated_migration_source(migration); @@ -572,25 +561,6 @@ impl SledAgent { *self.instance_ensure_state_error.lock().await = error; } - async fn detach_disks_from_instance( - &self, - instance_id: InstanceUuid, - ) -> Result<(), Error> { - self.disks - .sim_ensure_for_each_where( - |disk| match disk.current().disk_state { - DiskState::Attached(id) | DiskState::Attaching(id) => { - id == instance_id.into_untyped_uuid() - } - _ => false, - }, - &DiskStateRequested::Detached, - ) - .await?; - - Ok(()) - } - /// Idempotently ensures that the given API Disk (described by `api_disk`) /// is attached (or not) as specified. This simulates disk attach and /// detach, similar to instance boot and halt. @@ -607,16 +577,16 @@ impl SledAgent { &self.updates } - pub async fn instance_count(&self) -> usize { - self.instances.size().await + pub async fn vmm_count(&self) -> usize { + self.vmms.size().await } pub async fn disk_count(&self) -> usize { self.disks.size().await } - pub async fn instance_poke(&self, id: InstanceUuid, mode: PokeMode) { - self.instances.sim_poke(id.into_untyped_uuid(), mode).await; + pub async fn vmm_poke(&self, id: PropolisUuid, mode: PokeMode) { + self.vmms.sim_poke(id.into_untyped_uuid(), mode).await; } pub async fn disk_poke(&self, id: Uuid) { @@ -699,7 +669,7 @@ impl SledAgent { /// snapshot here. 
pub async fn instance_issue_disk_snapshot_request( &self, - _instance_id: InstanceUuid, + _propolis_id: PropolisUuid, disk_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { @@ -760,18 +730,17 @@ impl SledAgent { pub async fn instance_put_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, body_args: &InstanceExternalIpBody, ) -> Result<(), Error> { - if !self.instances.contains_key(&instance_id.into_untyped_uuid()).await - { + if !self.vmms.contains_key(&propolis_id.into_untyped_uuid()).await { return Err(Error::internal_error( - "can't alter IP state for nonexistent instance", + "can't alter IP state for VMM that's not registered", )); } let mut eips = self.external_ips.lock().await; - let my_eips = eips.entry(instance_id.into_untyped_uuid()).or_default(); + let my_eips = eips.entry(propolis_id.into_untyped_uuid()).or_default(); // High-level behaviour: this should always succeed UNLESS // trying to add a double ephemeral. @@ -794,18 +763,17 @@ impl SledAgent { pub async fn instance_delete_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, body_args: &InstanceExternalIpBody, ) -> Result<(), Error> { - if !self.instances.contains_key(&instance_id.into_untyped_uuid()).await - { + if !self.vmms.contains_key(&propolis_id.into_untyped_uuid()).await { return Err(Error::internal_error( - "can't alter IP state for nonexistent instance", + "can't alter IP state for VMM that's not registered", )); } let mut eips = self.external_ips.lock().await; - let my_eips = eips.entry(instance_id.into_untyped_uuid()).or_default(); + let my_eips = eips.entry(propolis_id.into_untyped_uuid()).or_default(); my_eips.remove(&body_args); diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 556388ce93..ac8f80069b 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -24,8 +24,8 @@ use omicron_common::disk::DiskVariant; use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; use omicron_uuid_kinds::GenericUuid; -use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::PropolisUuid; use omicron_uuid_kinds::ZpoolUuid; use propolis_client::types::VolumeConstructionRequest; use slog::Logger; @@ -869,7 +869,7 @@ impl Pantry { self.sled_agent .instance_issue_disk_snapshot_request( - InstanceUuid::new_v4(), // instance id, not used by function + PropolisUuid::new_v4(), // instance id, not used by function volume_id.parse().unwrap(), snapshot_id.parse().unwrap(), ) diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 50e5611027..9752e8d7e2 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -990,11 +990,11 @@ impl SledAgent { /// rudely terminates the instance. pub async fn instance_ensure_unregistered( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) -> Result { self.inner .instances - .ensure_unregistered(instance_id) + .ensure_unregistered(propolis_id) .await .map_err(|e| Error::Instance(e)) } @@ -1003,12 +1003,12 @@ impl SledAgent { /// state. pub async fn instance_ensure_state( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, target: InstanceStateRequested, ) -> Result { self.inner .instances - .ensure_state(instance_id, target) + .ensure_state(propolis_id, target) .await .map_err(|e| Error::Instance(e)) } @@ -1020,12 +1020,12 @@ impl SledAgent { /// does not match the current ephemeral IP. 
pub async fn instance_put_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, external_ip: &InstanceExternalIpBody, ) -> Result<(), Error> { self.inner .instances - .add_external_ip(instance_id, external_ip) + .add_external_ip(propolis_id, external_ip) .await .map_err(|e| Error::Instance(e)) } @@ -1034,12 +1034,12 @@ impl SledAgent { /// specified external IP address in either its ephemeral or floating IP set. pub async fn instance_delete_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, external_ip: &InstanceExternalIpBody, ) -> Result<(), Error> { self.inner .instances - .delete_external_ip(instance_id, external_ip) + .delete_external_ip(propolis_id, external_ip) .await .map_err(|e| Error::Instance(e)) } @@ -1047,11 +1047,11 @@ impl SledAgent { /// Returns the state of the instance with the provided ID. pub async fn instance_get_state( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) -> Result { self.inner .instances - .get_instance_state(instance_id) + .get_instance_state(propolis_id) .await .map_err(|e| Error::Instance(e)) } @@ -1082,19 +1082,15 @@ impl SledAgent { } /// Issue a snapshot request for a Crucible disk attached to an instance - pub async fn instance_issue_disk_snapshot_request( + pub async fn vmm_issue_disk_snapshot_request( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, disk_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { self.inner .instances - .instance_issue_disk_snapshot_request( - instance_id, - disk_id, - snapshot_id, - ) + .issue_disk_snapshot_request(propolis_id, disk_id, snapshot_id) .await .map_err(Error::from) } diff --git a/sled-agent/types/src/instance.rs b/sled-agent/types/src/instance.rs index 0753e273dc..12ceb98fb4 100644 --- a/sled-agent/types/src/instance.rs +++ b/sled-agent/types/src/instance.rs @@ -18,7 +18,7 @@ use omicron_common::api::internal::{ DhcpConfig, NetworkInterface, ResolvedVpcFirewallRule, SourceNatConfig, }, }; -use omicron_uuid_kinds::PropolisUuid; +use omicron_uuid_kinds::InstanceUuid; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use uuid::Uuid; @@ -37,10 +37,8 @@ pub struct InstanceEnsureBody { /// The initial VMM runtime state for the VMM being registered. pub vmm_runtime: VmmRuntimeState, - /// The ID of the VMM being registered. This may not be the active VMM ID in - /// the instance runtime state (e.g. if the new VMM is going to be a - /// migration target). - pub propolis_id: PropolisUuid, + /// The ID of the instance for which this VMM is being created. + pub instance_id: InstanceUuid, /// The address at which this VMM should serve a Propolis server API. 
pub propolis_addr: SocketAddr, From 79b68164ad2c9c0e305640a5c67bea45b1da01e0 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Thu, 22 Aug 2024 19:51:54 +0000 Subject: [PATCH 02/16] fix up Nexus to call new APIs --- .../app/background/tasks/instance_watcher.rs | 5 +- nexus/src/app/instance.rs | 50 +++++++++------- nexus/src/app/sagas/instance_common.rs | 60 ++++++++++--------- nexus/src/app/sagas/instance_ip_attach.rs | 29 ++++----- nexus/src/app/sagas/instance_ip_detach.rs | 31 +++------- nexus/src/app/sagas/instance_migrate.rs | 14 +---- nexus/src/app/sagas/instance_start.rs | 6 +- nexus/src/app/sagas/snapshot_create.rs | 25 ++++---- nexus/src/app/snapshot.rs | 5 +- 9 files changed, 105 insertions(+), 120 deletions(-) diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index f63c21105e..6195051db9 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -22,6 +22,7 @@ use omicron_common::api::external::InstanceState; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; +use omicron_uuid_kinds::PropolisUuid; use oximeter::types::ProducerRegistry; use sled_agent_client::Client as SledAgentClient; use std::borrow::Cow; @@ -83,9 +84,7 @@ impl InstanceWatcher { async move { slog::trace!(opctx.log, "checking on instance..."); let rsp = client - .instance_get_state(&InstanceUuid::from_untyped_uuid( - target.instance_id, - )) + .vmm_get_state(&PropolisUuid::from_untyped_uuid(target.vmm_id)) .await; let mut check = Check { target, diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 3106ab9f2a..7715c01cde 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -176,7 +176,7 @@ enum InstanceStateChangeRequestAction { /// Request the appropriate state change from the sled with the specified /// UUID. - SendToSled(SledUuid), + SendToSled { sled_id: SledUuid, propolis_id: PropolisUuid }, } /// What is the higher level operation that is calling @@ -664,21 +664,19 @@ impl super::Nexus { /// this sled, this operation rudely terminates it. pub(crate) async fn instance_ensure_unregistered( &self, - opctx: &OpContext, - authz_instance: &authz::Instance, + propolis_id: &PropolisUuid, sled_id: &SledUuid, ) -> Result, InstanceStateChangeError> { - opctx.authorize(authz::Action::Modify, authz_instance).await?; let sa = self.sled_client(&sled_id).await?; - sa.instance_unregister(&InstanceUuid::from_untyped_uuid( - authz_instance.id(), - )) - .await - .map(|res| res.into_inner().updated_runtime.map(Into::into)) - .map_err(|e| { - InstanceStateChangeError::SledAgent(SledAgentInstancePutError(e)) - }) + sa.vmm_unregister(propolis_id) + .await + .map(|res| res.into_inner().updated_runtime.map(Into::into)) + .map_err(|e| { + InstanceStateChangeError::SledAgent(SledAgentInstancePutError( + e, + )) + }) } /// Determines the action to take on an instance's active VMM given a @@ -712,8 +710,11 @@ impl super::Nexus { // Requests that operate on active instances have to be directed to the // instance's current sled agent. If there is none, the request needs to // be handled specially based on its type. 
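Stripped of the diff noise, the call shape this change converges on looks roughly like the following (a sketch using only names introduced later in this file's hunks; error handling elided):

    InstanceStateChangeRequestAction::SendToSled { sled_id, propolis_id } => {
        // The sled is still resolved first, but the state-change request is
        // now addressed to the specific Propolis job on that sled.
        let sa = self.sled_client(&sled_id).await?;
        sa.vmm_put_state(
            &propolis_id,
            &InstancePutStateBody { state: requested.into() },
        )
        .await?;
    }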
- let sled_id = if let Some(vmm) = vmm_state { - SledUuid::from_untyped_uuid(vmm.sled_id) + let (sled_id, propolis_id) = if let Some(vmm) = vmm_state { + ( + SledUuid::from_untyped_uuid(vmm.sled_id), + PropolisUuid::from_untyped_uuid(vmm.id), + ) } else { match effective_state { // If there's no active sled because the instance is stopped, @@ -814,7 +815,10 @@ impl super::Nexus { }; if allowed { - Ok(InstanceStateChangeRequestAction::SendToSled(sled_id)) + Ok(InstanceStateChangeRequestAction::SendToSled { + sled_id, + propolis_id, + }) } else { Err(Error::invalid_request(format!( "instance state cannot be changed from state \"{}\"", @@ -831,7 +835,6 @@ impl super::Nexus { prev_vmm_state: &Option, requested: InstanceStateChangeRequest, ) -> Result<(), InstanceStateChangeError> { - opctx.authorize(authz::Action::Modify, authz_instance).await?; let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); match self.select_runtime_change_action( @@ -840,11 +843,14 @@ impl super::Nexus { &requested, )? { InstanceStateChangeRequestAction::AlreadyDone => Ok(()), - InstanceStateChangeRequestAction::SendToSled(sled_id) => { + InstanceStateChangeRequestAction::SendToSled { + sled_id, + propolis_id, + } => { let sa = self.sled_client(&sled_id).await?; let instance_put_result = sa - .instance_put_state( - &instance_id, + .vmm_put_state( + &propolis_id, &InstancePutStateBody { state: requested.into() }, ) .await @@ -1120,13 +1126,13 @@ impl super::Nexus { .sled_client(&SledUuid::from_untyped_uuid(initial_vmm.sled_id)) .await?; let instance_register_result = sa - .instance_register( - &instance_id, + .vmm_register( + propolis_id, &sled_agent_client::types::InstanceEnsureBody { hardware: instance_hardware, instance_runtime: db_instance.runtime().clone().into(), vmm_runtime: initial_vmm.clone().into(), - propolis_id: *propolis_id, + instance_id, propolis_addr: SocketAddr::new( initial_vmm.propolis_ip.ip(), initial_vmm.propolis_port.into(), diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 6e431aaca7..b649c369ad 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -25,6 +25,12 @@ use super::NexusActionContext; /// The port propolis-server listens on inside the propolis zone. const DEFAULT_PROPOLIS_PORT: u16 = 12400; +#[derive(Clone, Debug, Serialize, Deserialize)] +pub(super) struct PropolisAndSledId { + pub(super) propolis_id: PropolisUuid, + pub(super) sled_id: SledUuid, +} + /// Reserves resources for a new VMM whose instance has `ncpus` guest logical /// processors and `guest_memory` bytes of guest RAM. The selected sled is /// random within the set of sleds allowed by the supplied `constraints`. @@ -213,12 +219,12 @@ pub async fn instance_ip_move_state( /// the Attaching or Detaching state so that concurrent attempts to start the /// instance will notice that the IP state is in flux and ask the caller to /// retry. -pub async fn instance_ip_get_instance_state( +pub(super) async fn instance_ip_get_instance_state( sagactx: &NexusActionContext, serialized_authn: &authn::saga::Serialized, authz_instance: &authz::Instance, verb: &str, -) -> Result, ActionError> { +) -> Result, ActionError> { // XXX: we can get instance state (but not sled ID) in same transaction // as attach (but not detach) wth current design. We need to re-query // for sled ID anyhow, so keep consistent between attach/detach. 
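A minimal sketch of how a saga node consumes the richer lookup result under this change (the concrete versions are in the instance_ip_attach/instance_ip_detach hunks further down; `target_ip` comes from the usual saga lookup):

    let ids = sagactx.lookup::<Option<PropolisAndSledId>>("instance_state")?;
    // NAT programming still only needs the sled ID...
    let sled_id = ids.as_ref().map(|ids| ids.sled_id);
    // ...while the OPTE update is now addressed to the specific Propolis.
    instance_ip_add_opte(&sagactx, ids, target_ip).await?;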
@@ -236,7 +242,11 @@ pub async fn instance_ip_get_instance_state( inst_and_vmm.vmm().as_ref().map(|vmm| vmm.runtime.state); let found_instance_state = inst_and_vmm.instance().runtime_state.nexus_state; - let mut sled_id = inst_and_vmm.sled_id(); + let mut propolis_and_sled_id = + inst_and_vmm.vmm().as_ref().map(|vmm| PropolisAndSledId { + propolis_id: PropolisUuid::from_untyped_uuid(vmm.id), + sled_id: SledUuid::from_untyped_uuid(vmm.sled_id), + }); slog::debug!( osagactx.log(), "evaluating instance state for IP attach/detach"; @@ -257,7 +267,7 @@ pub async fn instance_ip_get_instance_state( match (found_instance_state, found_vmm_state) { // If there's no VMM, the instance is definitely not on any sled. (InstanceState::NoVmm, _) | (_, Some(VmmState::SagaUnwound)) => { - sled_id = None; + propolis_and_sled_id = None; } // If the instance is running normally or rebooting, it's resident on @@ -340,7 +350,7 @@ pub async fn instance_ip_get_instance_state( } } - Ok(sled_id) + Ok(propolis_and_sled_id) } /// Adds a NAT entry to DPD, routing packets bound for `target_ip` to a @@ -441,18 +451,19 @@ pub async fn instance_ip_remove_nat( /// Inform the OPTE port for a running instance that it should start /// sending/receiving traffic on a given IP address. /// -/// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly -/// set to be inactive in event of double attach/detach (`!target_ip.do_saga`). -pub async fn instance_ip_add_opte( +/// This call is a no-op if the instance is not active (`propolis_and_sled` is +/// `None`) or the calling saga is explicitly set to be inactive in the event of +/// a double attach/detach (`!target_ip.do_saga`). +pub(super) async fn instance_ip_add_opte( sagactx: &NexusActionContext, - authz_instance: &authz::Instance, - sled_uuid: Option, + propolis_and_sled: Option, target_ip: ModifyStateForExternalIp, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); // No physical sled? Don't inform OPTE. - let Some(sled_uuid) = sled_uuid else { + let Some(PropolisAndSledId { propolis_id, sled_id }) = propolis_and_sled + else { return Ok(()); }; @@ -470,17 +481,14 @@ pub async fn instance_ip_add_opte( osagactx .nexus() - .sled_client(&sled_uuid) + .sled_client(&sled_id) .await .map_err(|_| { ActionError::action_failed(Error::unavail( "sled agent client went away mid-attach/detach", )) })? - .instance_put_external_ip( - &InstanceUuid::from_untyped_uuid(authz_instance.id()), - &sled_agent_body, - ) + .vmm_put_external_ip(&propolis_id, &sled_agent_body) .await .map_err(|e| { ActionError::action_failed(match e { @@ -499,18 +507,19 @@ pub async fn instance_ip_add_opte( /// Inform the OPTE port for a running instance that it should cease /// sending/receiving traffic on a given IP address. /// -/// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly -/// set to be inactive in event of double attach/detach (`!target_ip.do_saga`). -pub async fn instance_ip_remove_opte( +/// This call is a no-op if the instance is not active (`propolis_and_sled` is +/// `None`) or the calling saga is explicitly set to be inactive in the event of +/// a double attach/detach (`!target_ip.do_saga`). +pub(super) async fn instance_ip_remove_opte( sagactx: &NexusActionContext, - authz_instance: &authz::Instance, - sled_uuid: Option, + propolis_and_sled: Option, target_ip: ModifyStateForExternalIp, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); // No physical sled? Don't inform OPTE. 
- let Some(sled_uuid) = sled_uuid else { + let Some(PropolisAndSledId { propolis_id, sled_id }) = propolis_and_sled + else { return Ok(()); }; @@ -528,17 +537,14 @@ pub async fn instance_ip_remove_opte( osagactx .nexus() - .sled_client(&sled_uuid) + .sled_client(&sled_id) .await .map_err(|_| { ActionError::action_failed(Error::unavail( "sled agent client went away mid-attach/detach", )) })? - .instance_delete_external_ip( - &InstanceUuid::from_untyped_uuid(authz_instance.id()), - &sled_agent_body, - ) + .vmm_delete_external_ip(&propolis_id, &sled_agent_body) .await .map_err(|e| { ActionError::action_failed(match e { diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index a14054cf66..b2a03e268e 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -5,7 +5,7 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_opte, ExternalIpAttach, - ModifyStateForExternalIp, + ModifyStateForExternalIp, PropolisAndSledId, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; @@ -13,7 +13,7 @@ use crate::app::{authn, authz}; use nexus_db_model::{IpAttachState, Ipv4NatEntry}; use nexus_types::external_api::views; use omicron_common::api::external::Error; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid, SledUuid}; +use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -161,7 +161,7 @@ async fn siia_begin_attach_ip_undo( async fn siia_get_instance_state( sagactx: NexusActionContext, -) -> Result, ActionError> { +) -> Result, ActionError> { let params = sagactx.saga_params::()?; instance_ip_get_instance_state( &sagactx, @@ -177,7 +177,10 @@ async fn siia_nat( sagactx: NexusActionContext, ) -> Result, ActionError> { let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let sled_id = sagactx + .lookup::>("instance_state")? 
+ .map(|ids| ids.sled_id); + let target_ip = sagactx.lookup::("target_ip")?; instance_ip_add_nat( &sagactx, @@ -245,28 +248,18 @@ async fn siia_nat_undo( async fn siia_update_opte( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; - instance_ip_add_opte(&sagactx, ¶ms.authz_instance, sled_id, target_ip) - .await + instance_ip_add_opte(&sagactx, ids, target_ip).await } async fn siia_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); - let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; - if let Err(e) = instance_ip_remove_opte( - &sagactx, - ¶ms.authz_instance, - sled_id, - target_ip, - ) - .await - { + if let Err(e) = instance_ip_remove_opte(&sagactx, ids, target_ip).await { error!(log, "siia_update_opte_undo: failed to notify sled-agent: {e}"); } Ok(()) diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index a5b51ce375..d5f0b55528 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -5,7 +5,7 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, - ModifyStateForExternalIp, + ModifyStateForExternalIp, PropolisAndSledId, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; @@ -155,7 +155,7 @@ async fn siid_begin_detach_ip_undo( async fn siid_get_instance_state( sagactx: NexusActionContext, -) -> Result, ActionError> { +) -> Result, ActionError> { let params = sagactx.saga_params::()?; instance_ip_get_instance_state( &sagactx, @@ -184,7 +184,9 @@ async fn siid_nat_undo( ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let sled_id = sagactx + .lookup::>("instance_state")? 
+ .map(|ids| ids.sled_id); let target_ip = sagactx.lookup::("target_ip")?; if let Err(e) = instance_ip_add_nat( &sagactx, @@ -204,33 +206,18 @@ async fn siid_nat_undo( async fn siid_update_opte( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; - instance_ip_remove_opte( - &sagactx, - ¶ms.authz_instance, - sled_id, - target_ip, - ) - .await + instance_ip_remove_opte(&sagactx, ids, target_ip).await } async fn siid_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); - let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; - if let Err(e) = instance_ip_add_opte( - &sagactx, - ¶ms.authz_instance, - sled_id, - target_ip, - ) - .await - { + if let Err(e) = instance_ip_add_opte(&sagactx, ids, target_ip).await { error!(log, "siid_update_opte_undo: failed to notify sled-agent: {e}"); } Ok(()) diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 19bef2f046..e1c198f25e 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -437,20 +437,10 @@ async fn sim_ensure_destination_propolis_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); - let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - ¶ms.serialized_authn, - ); - + let dst_propolis_id = sagactx.lookup::("dst_propolis_id")?; let dst_sled_id = sagactx.lookup::("dst_sled_id")?; let db_instance = sagactx.lookup::("set_migration_ids")?; - let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) - .instance_id(db_instance.id()) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; info!(osagactx.log(), "unregistering destination vmm for migration unwind"; "instance_id" => %db_instance.id(), @@ -465,7 +455,7 @@ async fn sim_ensure_destination_propolis_undo( // needed. match osagactx .nexus() - .instance_ensure_unregistered(&opctx, &authz_instance, &dst_sled_id) + .instance_ensure_unregistered(&dst_propolis_id, &dst_sled_id) .await { Ok(_) => Ok(()), diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index 55fc312ae7..bd948ff668 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -538,6 +538,7 @@ async fn sis_ensure_registered_undo( let params = sagactx.saga_params::()?; let datastore = osagactx.datastore(); let instance_id = InstanceUuid::from_untyped_uuid(params.db_instance.id()); + let propolis_id = sagactx.lookup::("propolis_id")?; let sled_id = sagactx.lookup::("sled_id")?; let opctx = crate::context::op_context_for_saga_action( &sagactx, @@ -546,11 +547,12 @@ async fn sis_ensure_registered_undo( info!(osagactx.log(), "start saga: unregistering instance from sled"; "instance_id" => %instance_id, + "propolis_id" => %propolis_id, "sled_id" => %sled_id); // Fetch the latest record so that this callee can drive the instance into // a Failed state if the unregister call fails. 
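Condensed, the unwind step now addresses the sled agent purely through IDs the saga already recorded, roughly (a sketch; the real code below keeps its explicit record fetch and error logging):

    let propolis_id = sagactx.lookup::<PropolisUuid>("propolis_id")?;
    let sled_id = sagactx.lookup::<SledUuid>("sled_id")?;
    osagactx
        .nexus()
        .instance_ensure_unregistered(&propolis_id, &sled_id)
        .await?;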
- let (.., authz_instance, db_instance) = LookupPath::new(&opctx, &datastore) + let (.., db_instance) = LookupPath::new(&opctx, &datastore) .instance_id(instance_id.into_untyped_uuid()) .fetch() .await @@ -563,7 +565,7 @@ async fn sis_ensure_registered_undo( // returned. if let Err(e) = osagactx .nexus() - .instance_ensure_unregistered(&opctx, &authz_instance, &sled_id) + .instance_ensure_unregistered(&propolis_id, &sled_id) .await { error!(osagactx.log(), diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index eeb14091b2..f74dec6711 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -106,11 +106,12 @@ use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external; use omicron_common::api::external::Error; use omicron_common::retry_until_known_result; +use omicron_uuid_kinds::{GenericUuid, PropolisUuid, SledUuid}; use rand::{rngs::StdRng, RngCore, SeedableRng}; use serde::Deserialize; use serde::Serialize; use sled_agent_client::types::CrucibleOpts; -use sled_agent_client::types::InstanceIssueDiskSnapshotRequestBody; +use sled_agent_client::types::VmmIssueDiskSnapshotRequestBody; use sled_agent_client::types::VolumeConstructionRequest; use slog::info; use std::collections::BTreeMap; @@ -826,39 +827,43 @@ async fn ssc_send_snapshot_request_to_sled_agent( .await .map_err(ActionError::action_failed)?; - let sled_id = osagactx + let instance_and_vmm = osagactx .datastore() .instance_fetch_with_vmm(&opctx, &authz_instance) .await - .map_err(ActionError::action_failed)? - .sled_id(); + .map_err(ActionError::action_failed)?; + + let vmm = instance_and_vmm.vmm(); // If this instance does not currently have a sled, we can't continue this // saga - the user will have to reissue the snapshot request and it will get // run on a Pantry. - let Some(sled_id) = sled_id else { + let Some((propolis_id, sled_id)) = + vmm.as_ref().map(|vmm| (vmm.id, vmm.sled_id)) + else { return Err(ActionError::action_failed(Error::unavail( - "sled id is None!", + "instance no longer has an active VMM!", ))); }; info!(log, "asking for disk snapshot from Propolis via sled agent"; "disk_id" => %params.disk_id, "instance_id" => %attach_instance_id, + "propolis_id" => %propolis_id, "sled_id" => %sled_id); let sled_agent_client = osagactx .nexus() - .sled_client(&sled_id) + .sled_client(&SledUuid::from_untyped_uuid(sled_id)) .await .map_err(ActionError::action_failed)?; retry_until_known_result(log, || async { sled_agent_client - .instance_issue_disk_snapshot_request( + .vmm_issue_disk_snapshot_request( + &PropolisUuid::from_untyped_uuid(propolis_id), &attach_instance_id, - ¶ms.disk_id, - &InstanceIssueDiskSnapshotRequestBody { snapshot_id }, + &VmmIssueDiskSnapshotRequestBody { snapshot_id }, ) .await }) diff --git a/nexus/src/app/snapshot.rs b/nexus/src/app/snapshot.rs index 040c9fc082..c1a9825f07 100644 --- a/nexus/src/app/snapshot.rs +++ b/nexus/src/app/snapshot.rs @@ -107,11 +107,8 @@ impl super::Nexus { .instance_fetch_with_vmm(&opctx, &authz_instance) .await?; - // If a Propolis _may_ exist, send the snapshot request there, - // otherwise use the pantry. - !instance_state.vmm().is_some() + instance_state.vmm().is_none() } else { - // This disk is not attached to an instance, use the pantry. 
true }; From 36fe7c3a13180502f391ee52bbc2f06174043219 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Thu, 22 Aug 2024 23:23:24 +0000 Subject: [PATCH 03/16] fix simulated sled agent client paths --- clients/sled-agent-client/src/lib.rs | 23 ++-- nexus/src/app/sagas/snapshot_create.rs | 11 +- nexus/src/app/sagas/test_helpers.rs | 43 ++++-- nexus/src/app/test_interfaces.rs | 148 +++++++++++---------- nexus/tests/integration_tests/disks.rs | 6 +- nexus/tests/integration_tests/instances.rs | 132 +++++++++++------- nexus/tests/integration_tests/ip_pools.rs | 6 +- nexus/tests/integration_tests/pantry.rs | 6 +- 8 files changed, 216 insertions(+), 159 deletions(-) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index ed96d762dc..cf2e69e82d 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -5,6 +5,7 @@ //! Interface for making API requests to a Sled Agent use async_trait::async_trait; +use omicron_uuid_kinds::PropolisUuid; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; @@ -448,11 +449,11 @@ impl From /// are bonus endpoints, not generated in the real client. #[async_trait] pub trait TestInterfaces { - async fn instance_single_step(&self, id: Uuid); - async fn instance_finish_transition(&self, id: Uuid); - async fn instance_simulate_migration_source( + async fn vmm_single_step(&self, id: PropolisUuid); + async fn vmm_finish_transition(&self, id: PropolisUuid); + async fn vmm_simulate_migration_source( &self, - id: Uuid, + id: PropolisUuid, params: SimulateMigrationSource, ); async fn disk_finish_transition(&self, id: Uuid); @@ -460,10 +461,10 @@ pub trait TestInterfaces { #[async_trait] impl TestInterfaces for Client { - async fn instance_single_step(&self, id: Uuid) { + async fn vmm_single_step(&self, id: PropolisUuid) { let baseurl = self.baseurl(); let client = self.client(); - let url = format!("{}/instances/{}/poke-single-step", baseurl, id); + let url = format!("{}/vmms/{}/poke-single-step", baseurl, id); client .post(url) .send() @@ -471,10 +472,10 @@ impl TestInterfaces for Client { .expect("instance_single_step() failed unexpectedly"); } - async fn instance_finish_transition(&self, id: Uuid) { + async fn vmm_finish_transition(&self, id: PropolisUuid) { let baseurl = self.baseurl(); let client = self.client(); - let url = format!("{}/instances/{}/poke", baseurl, id); + let url = format!("{}/vmms/{}/poke", baseurl, id); client .post(url) .send() @@ -493,14 +494,14 @@ impl TestInterfaces for Client { .expect("disk_finish_transition() failed unexpectedly"); } - async fn instance_simulate_migration_source( + async fn vmm_simulate_migration_source( &self, - id: Uuid, + id: PropolisUuid, params: SimulateMigrationSource, ) { let baseurl = self.baseurl(); let client = self.client(); - let url = format!("{baseurl}/instances/{id}/sim-migration-source"); + let url = format!("{baseurl}/vmms/{id}/sim-migration-source"); client .post(url) .json(¶ms) diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index f74dec6711..5c9ff7f2ef 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -2156,12 +2156,15 @@ mod test { .await .unwrap(); - let sled_id = instance_state - .sled_id() - .expect("starting instance should have a sled"); + let vmm_state = instance_state + .vmm() + .as_ref() + .expect("starting instance should have a vmm"); + let propolis_id = PropolisUuid::from_untyped_uuid(vmm_state.id); + let sled_id = 
SledUuid::from_untyped_uuid(vmm_state.sled_id); let sa = nexus.sled_client(&sled_id).await.unwrap(); + sa.vmm_finish_transition(propolis_id).await; - sa.instance_finish_transition(instance.identity.id).await; let instance_state = nexus .datastore() .instance_fetch_with_vmm(&opctx, &authz_instance) diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index b9388a1116..63bb3d2e66 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -6,10 +6,7 @@ //! saga tests. use super::NexusSaga; -use crate::{ - app::{saga::create_saga_dag, test_interfaces::TestInterfaces as _}, - Nexus, -}; +use crate::{app::saga::create_saga_dag, Nexus}; use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use camino::Utf8Path; use diesel::{ @@ -137,13 +134,14 @@ pub(crate) async fn instance_simulate( info!(&cptestctx.logctx.log, "Poking simulated instance"; "instance_id" => %instance_id); let nexus = &cptestctx.server.server_context().nexus; + let (propolis_id, sled_id) = + instance_fetch_vmm_and_sled_ids(cptestctx, instance_id).await; let sa = nexus - .instance_sled_by_id(instance_id) + .sled_client(&sled_id) .await - .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(instance_id.into_untyped_uuid()).await; + sa.vmm_finish_transition(propolis_id).await; } pub(crate) async fn instance_single_step_on_sled( @@ -158,12 +156,14 @@ pub(crate) async fn instance_single_step_on_sled( "sled_id" => %sled_id, ); let nexus = &cptestctx.server.server_context().nexus; + let (propolis_id, sled_id) = + instance_fetch_vmm_and_sled_ids(cptestctx, instance_id).await; let sa = nexus - .sled_client(sled_id) + .sled_client(&sled_id) .await - .expect("sled must exist to simulate a state change"); + .expect("instance must be on a sled to simulate a state change"); - sa.instance_single_step(instance_id.into_untyped_uuid()).await; + sa.vmm_single_step(propolis_id).await; } pub(crate) async fn instance_simulate_by_name( @@ -186,12 +186,14 @@ pub(crate) async fn instance_simulate_by_name( let instance_lookup = nexus.instance_lookup(&opctx, instance_selector).unwrap(); let (.., instance) = instance_lookup.fetch().await.unwrap(); + let instance_id = InstanceUuid::from_untyped_uuid(instance.id()); + let (propolis_id, sled_id) = + instance_fetch_vmm_and_sled_ids(cptestctx, &instance_id).await; let sa = nexus - .instance_sled_by_id(&InstanceUuid::from_untyped_uuid(instance.id())) + .sled_client(&sled_id) .await - .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(instance.id()).await; + sa.vmm_finish_transition(propolis_id).await; } pub async fn instance_fetch( @@ -218,6 +220,21 @@ pub async fn instance_fetch( db_state } +async fn instance_fetch_vmm_and_sled_ids( + cptestctx: &ControlPlaneTestContext, + instance_id: &InstanceUuid, +) -> (PropolisUuid, SledUuid) { + let instance_and_vmm = instance_fetch(cptestctx, *instance_id).await; + let vmm = instance_and_vmm + .vmm() + .as_ref() + .expect("simulating an instance requires an active vmm"); + + let propolis_id = PropolisUuid::from_untyped_uuid(vmm.id); + let sled_id = SledUuid::from_untyped_uuid(vmm.sled_id); + (propolis_id, sled_id) +} + pub async fn instance_fetch_all( cptestctx: &ControlPlaneTestContext, instance_id: InstanceUuid, diff --git a/nexus/src/app/test_interfaces.rs b/nexus/src/app/test_interfaces.rs index adfafa523d..9852225e8c 100644 --- a/nexus/src/app/test_interfaces.rs +++ 
b/nexus/src/app/test_interfaces.rs @@ -6,8 +6,7 @@ use async_trait::async_trait; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external::Error; -use omicron_uuid_kinds::GenericUuid; -use omicron_uuid_kinds::{InstanceUuid, SledUuid}; +use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, SledUuid}; use sled_agent_client::Client as SledAgentClient; use std::sync::Arc; use uuid::Uuid; @@ -19,25 +18,47 @@ pub use super::update::SpUpdater; pub use super::update::UpdateProgress; pub use gateway_client::types::SpType; +/// The information needed to talk to a sled agent about an instance that is +/// active on that sled. +pub struct InstanceSledAgentInfo { + /// The ID of the Propolis job to send to sled agent. + pub propolis_id: PropolisUuid, + + /// The ID of the sled where the Propolis job is running. + pub sled_id: SledUuid, + + /// A client for talking to the Propolis's host sled. + pub sled_client: Arc, + + /// The ID of the instance's migration target Propolis, if it has one. + pub dst_propolis_id: Option, +} + /// Exposes additional [`super::Nexus`] interfaces for use by the test suite #[async_trait] pub trait TestInterfaces { /// Access the Rack ID of the currently executing Nexus. fn rack_id(&self) -> Uuid; - /// Returns the SledAgentClient for an Instance from its id. We may also - /// want to split this up into instance_lookup_by_id() and instance_sled(), - /// but after all it's a test suite special to begin with. - async fn instance_sled_by_id( + /// Attempts to obtain the Propolis ID and sled agent information for an + /// instance. + /// + /// # Arguments + /// + /// - `id`: The ID of the instance of interest. + /// - `opctx`: An optional operation context to use for authorization + /// checks. If `None`, this routine supplies the default test opctx. + /// + /// # Return value + /// + /// - `Ok(Some(info))` if the instance has an active Propolis. + /// - `Ok(None)` if the instance has no active Propolis. + /// - `Err` if an error occurred. + async fn active_instance_info( &self, id: &InstanceUuid, - ) -> Result>, Error>; - - async fn instance_sled_by_id_with_opctx( - &self, - id: &InstanceUuid, - opctx: &OpContext, - ) -> Result>, Error>; + opctx: Option<&OpContext>, + ) -> Result, Error>; /// Returns the SledAgentClient for the sled running an instance to which a /// disk is attached. @@ -46,18 +67,6 @@ pub trait TestInterfaces { id: &Uuid, ) -> Result>, Error>; - /// Returns the supplied instance's current active sled ID. 
- async fn instance_sled_id( - &self, - instance_id: &InstanceUuid, - ) -> Result, Error>; - - async fn instance_sled_id_with_opctx( - &self, - instance_id: &InstanceUuid, - opctx: &OpContext, - ) -> Result, Error>; - async fn set_disk_as_faulted(&self, disk_id: &Uuid) -> Result; fn set_samael_max_issue_delay(&self, max_issue_delay: chrono::Duration); @@ -69,30 +78,49 @@ impl TestInterfaces for super::Nexus { self.rack_id } - async fn instance_sled_by_id( + async fn active_instance_info( &self, id: &InstanceUuid, - ) -> Result>, Error> { - let opctx = OpContext::for_tests( - self.log.new(o!()), - Arc::clone(&self.db_datastore) - as Arc, - ); + opctx: Option<&OpContext>, + ) -> Result, Error> { + let local_opctx; + let opctx = match opctx { + Some(o) => o, + None => { + local_opctx = OpContext::for_tests( + self.log.new(o!()), + Arc::clone(&self.db_datastore) + as Arc, + ); + &local_opctx + } + }; - self.instance_sled_by_id_with_opctx(id, &opctx).await - } + let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore) + .instance_id(id.into_untyped_uuid()) + .lookup_for(nexus_db_queries::authz::Action::Read) + .await?; - async fn instance_sled_by_id_with_opctx( - &self, - id: &InstanceUuid, - opctx: &OpContext, - ) -> Result>, Error> { - let sled_id = self.instance_sled_id_with_opctx(id, opctx).await?; - if let Some(sled_id) = sled_id { - Ok(Some(self.sled_client(&sled_id).await?)) - } else { - Ok(None) - } + let state = self + .datastore() + .instance_fetch_with_vmm(opctx, &authz_instance) + .await?; + + let Some(vmm) = state.vmm() else { + return Ok(None); + }; + + let sled_id = SledUuid::from_untyped_uuid(vmm.sled_id); + Ok(Some(InstanceSledAgentInfo { + propolis_id: PropolisUuid::from_untyped_uuid(vmm.id), + sled_id, + sled_client: self.sled_client(&sled_id).await?, + dst_propolis_id: state + .instance() + .runtime() + .dst_propolis_id + .map(PropolisUuid::from_untyped_uuid), + })) } async fn disk_sled_by_id( @@ -112,37 +140,11 @@ impl TestInterfaces for super::Nexus { let instance_id = InstanceUuid::from_untyped_uuid( db_disk.runtime().attach_instance_id.unwrap(), ); - self.instance_sled_by_id(&instance_id).await - } - - async fn instance_sled_id( - &self, - id: &InstanceUuid, - ) -> Result, Error> { - let opctx = OpContext::for_tests( - self.log.new(o!()), - Arc::clone(&self.db_datastore) - as Arc, - ); - - self.instance_sled_id_with_opctx(id, &opctx).await - } - - async fn instance_sled_id_with_opctx( - &self, - id: &InstanceUuid, - opctx: &OpContext, - ) -> Result, Error> { - let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore) - .instance_id(id.into_untyped_uuid()) - .lookup_for(nexus_db_queries::authz::Action::Read) - .await?; Ok(self - .datastore() - .instance_fetch_with_vmm(opctx, &authz_instance) + .active_instance_info(&instance_id, Some(&opctx)) .await? 
- .sled_id()) + .map(|info| info.sled_client)) } async fn set_disk_as_faulted(&self, disk_id: &Uuid) -> Result { diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index 234ab5f382..900a849c94 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -188,12 +188,12 @@ async fn set_instance_state( } async fn instance_simulate(nexus: &Arc, id: &InstanceUuid) { - let sa = nexus - .instance_sled_by_id(id) + let info = nexus.active_instance_info(id, None) .await .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(id.into_untyped_uuid()).await; + + info.sled_client.vmm_finish_transition(info.propolis_id).await; } #[nexus_test] diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index eb3c88eb38..bd2afc97aa 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -780,12 +780,13 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { let instance_next = instance_get(&client, &instance_url).await; assert_eq!(instance_next.runtime.run_state, InstanceState::Running); - let original_sled = nexus - .instance_sled_id(&instance_id) + let sled_info = nexus + .active_instance_info(&instance_id, None) .await .unwrap() .expect("running instance should have a sled"); + let original_sled = sled_info.sled_id; let dst_sled_id = if original_sled == default_sled_id { other_sled_id } else { @@ -808,12 +809,13 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { .parsed_body::() .unwrap(); - let current_sled = nexus - .instance_sled_id(&instance_id) + let new_sled_info = nexus + .active_instance_info(&instance_id, None) .await .unwrap() .expect("running instance should have a sled"); + let current_sled = new_sled_info.sled_id; assert_eq!(current_sled, original_sled); // Ensure that both sled agents report that the migration is in progress. @@ -840,6 +842,15 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(migration.target_state, MigrationState::Pending.into()); assert_eq!(migration.source_state, MigrationState::Pending.into()); + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("instance should be on a sled"); + let src_propolis_id = info.propolis_id; + let dst_propolis_id = + info.dst_propolis_id.expect("instance should have a migration target"); + // Simulate the migration. We will use `instance_single_step_on_sled` to // single-step both sled-agents through the migration state machine and // ensure that the migration state looks nice at each step. @@ -847,15 +858,15 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { cptestctx, nexus, original_sled, - instance_id, + src_propolis_id, migration_id, ) .await; // Move source to "migrating". - instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) + vmm_single_step_on_sled(cptestctx, nexus, original_sled, src_propolis_id) .await; - instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) + vmm_single_step_on_sled(cptestctx, nexus, original_sled, src_propolis_id) .await; let migration = dbg!(migration_fetch(cptestctx, migration_id).await); @@ -865,9 +876,9 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(instance.runtime.run_state, InstanceState::Migrating); // Move target to "migrating". 
- instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) + vmm_single_step_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id) .await; - instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) + vmm_single_step_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id) .await; let migration = dbg!(migration_fetch(cptestctx, migration_id).await); @@ -877,7 +888,7 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(instance.runtime.run_state, InstanceState::Migrating); // Move the source to "completed" - instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) + vmm_simulate_on_sled(cptestctx, nexus, original_sled, src_propolis_id) .await; let migration = dbg!(migration_fetch(cptestctx, migration_id).await); @@ -887,15 +898,16 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(instance.runtime.run_state, InstanceState::Migrating); // Move the target to "completed". - instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + vmm_simulate_on_sled(cptestctx, nexus, dst_sled_id, src_propolis_id).await; instance_wait_for_state(&client, instance_id, InstanceState::Running).await; let current_sled = nexus - .instance_sled_id(&instance_id) + .active_instance_info(&instance_id, None) .await .unwrap() - .expect("migrated instance should still have a sled"); + .expect("migrated instance should still have a sled") + .sled_id; assert_eq!(current_sled, dst_sled_id); @@ -978,11 +990,12 @@ async fn test_instance_migrate_v2p_and_routes( .derive_guest_network_interface_info(&opctx, &authz_instance) .await .unwrap(); + let original_sled_id = nexus - .instance_sled_id(&instance_id) + .active_instance_info(&instance_id, None) .await .unwrap() - .expect("running instance should have a sled"); + .expect("running instance should have a sled").sled_id; let mut sled_agents = vec![cptestctx.sled_agent.sled_agent.clone()]; sled_agents.extend(other_sleds.iter().map(|tup| tup.1.sled_agent.clone())); @@ -1035,25 +1048,35 @@ async fn test_instance_migrate_v2p_and_routes( .expect("since we've started a migration, the instance record must have a migration id!") }; + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("instance should be on a sled"); + let src_propolis_id = info.propolis_id; + let dst_propolis_id = + info.dst_propolis_id.expect("instance should have a migration target"); + // Tell both sled-agents to pretend to do the migration. instance_simulate_migration_source( cptestctx, nexus, original_sled_id, - instance_id, + src_propolis_id, migration_id, ) .await; - instance_simulate_on_sled(cptestctx, nexus, original_sled_id, instance_id) + vmm_simulate_on_sled(cptestctx, nexus, original_sled_id, src_propolis_id) .await; - instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + vmm_simulate_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id).await; instance_wait_for_state(&client, instance_id, InstanceState::Running).await; let current_sled = nexus - .instance_sled_id(&instance_id) + .active_instance_info(&instance_id, None) .await .unwrap() - .expect("migrated instance should have a sled"); + .expect("migrated instance should have a sled") + .sled_id; assert_eq!(current_sled, dst_sled_id); for sled_agent in &sled_agents { @@ -1373,10 +1396,11 @@ async fn test_instance_metrics_with_migration( // Request migration to the other sled. 
This reserves resources on the // target sled, but shouldn't change the virtual provisioning counters. let original_sled = nexus - .instance_sled_id(&instance_id) + .active_instance_info(&instance_id, None) .await .unwrap() - .expect("running instance should have a sled"); + .expect("running instance should have a sled") + .sled_id; let dst_sled_id = if original_sled == default_sled_id { other_sled_id @@ -1420,6 +1444,15 @@ async fn test_instance_metrics_with_migration( .expect("since we've started a migration, the instance record must have a migration id!") }; + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("instance should be on a sled"); + let src_propolis_id = info.propolis_id; + let dst_propolis_id = + info.dst_propolis_id.expect("instance should have a migration target"); + // Wait for the instance to be in the `Migrating` state. Otherwise, the // subsequent `instance_wait_for_state(..., Running)` may see the `Running` // state from the *old* VMM, rather than waiting for the migration to @@ -1428,14 +1461,12 @@ async fn test_instance_metrics_with_migration( cptestctx, nexus, original_sled, - instance_id, + src_propolis_id, migration_id, ) .await; - instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) - .await; - instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) - .await; + vmm_single_step_on_sled(cptestctx, nexus, original_sled, src_propolis_id).await; + vmm_single_step_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id).await; instance_wait_for_state(&client, instance_id, InstanceState::Migrating) .await; @@ -1444,9 +1475,9 @@ async fn test_instance_metrics_with_migration( // Complete migration on the target. Simulated migrations always succeed. // After this the instance should be running and should continue to appear // to be provisioned. - instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) + vmm_simulate_on_sled(cptestctx, nexus, original_sled, src_propolis_id) .await; - instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + vmm_simulate_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id).await; instance_wait_for_state(&client, instance_id, InstanceState::Running).await; check_provisioning_state(4, 1).await; @@ -3337,10 +3368,11 @@ async fn test_disks_detached_when_instance_destroyed( let apictx = &cptestctx.server.server_context(); let nexus = &apictx.nexus; let sa = nexus - .instance_sled_by_id(&instance_id) + .active_instance_info(&instance_id, None) .await .unwrap() - .expect("instance should be on a sled while it's running"); + .expect("instance should be on a sled while it's running") + .sled_client; // Stop and delete instance instance_post(&client, instance_name, InstanceOp::Stop).await; @@ -5080,28 +5112,29 @@ pub async fn assert_sled_vpc_routes( /// instance, and then tell it to finish simulating whatever async transition is /// going on. pub async fn instance_simulate(nexus: &Arc, id: &InstanceUuid) { - let sa = nexus - .instance_sled_by_id(id) + let sled_info = nexus + .active_instance_info(id, None) .await .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(id.into_untyped_uuid()).await; + + sled_info.sled_client.vmm_finish_transition(sled_info.propolis_id).await; } /// Simulate one step of an ongoing instance state transition. 
To do this, we /// have to look up the instance, then get the sled agent associated with that /// instance, and then tell it to finish simulating whatever async transition is /// going on. -async fn instance_single_step_on_sled( +async fn vmm_single_step_on_sled( cptestctx: &ControlPlaneTestContext, nexus: &Arc, sled_id: SledUuid, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) { info!(&cptestctx.logctx.log, "Single-stepping simulated instance on sled"; - "instance_id" => %instance_id, "sled_id" => %sled_id); + "propolis_id" => %propolis_id, "sled_id" => %sled_id); let sa = nexus.sled_client(&sled_id).await.unwrap(); - sa.instance_single_step(instance_id.into_untyped_uuid()).await; + sa.vmm_single_step(propolis_id).await; } pub async fn instance_simulate_with_opctx( @@ -5109,27 +5142,28 @@ pub async fn instance_simulate_with_opctx( id: &InstanceUuid, opctx: &OpContext, ) { - let sa = nexus - .instance_sled_by_id_with_opctx(id, opctx) + let sled_info = nexus + .active_instance_info(id, Some(opctx)) .await .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(id.into_untyped_uuid()).await; + + sled_info.sled_client.vmm_finish_transition(sled_info.propolis_id).await; } /// Simulates state transitions for the incarnation of the instance on the /// supplied sled (which may not be the sled ID currently stored in the /// instance's CRDB record). -async fn instance_simulate_on_sled( +async fn vmm_simulate_on_sled( cptestctx: &ControlPlaneTestContext, nexus: &Arc, sled_id: SledUuid, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) { info!(&cptestctx.logctx.log, "Poking simulated instance on sled"; - "instance_id" => %instance_id, "sled_id" => %sled_id); + "propolis_id" => %propolis_id, "sled_id" => %sled_id); let sa = nexus.sled_client(&sled_id).await.unwrap(); - sa.instance_finish_transition(instance_id.into_untyped_uuid()).await; + sa.vmm_finish_transition(propolis_id).await; } /// Simulates a migration source for the provided instance ID, sled ID, and @@ -5138,19 +5172,19 @@ async fn instance_simulate_migration_source( cptestctx: &ControlPlaneTestContext, nexus: &Arc, sled_id: SledUuid, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, migration_id: Uuid, ) { info!( &cptestctx.logctx.log, "Simulating migration source sled"; - "instance_id" => %instance_id, + "propolis_id" => %propolis_id, "sled_id" => %sled_id, "migration_id" => %migration_id, ); let sa = nexus.sled_client(&sled_id).await.unwrap(); - sa.instance_simulate_migration_source( - instance_id.into_untyped_uuid(), + sa.vmm_simulate_migration_source( + propolis_id, sled_agent_client::SimulateMigrationSource { migration_id, result: sled_agent_client::SimulatedMigrationResult::Success, diff --git a/nexus/tests/integration_tests/ip_pools.rs b/nexus/tests/integration_tests/ip_pools.rs index e872cc6fe3..f56755d85c 100644 --- a/nexus/tests/integration_tests/ip_pools.rs +++ b/nexus/tests/integration_tests/ip_pools.rs @@ -1344,12 +1344,12 @@ async fn test_ip_range_delete_with_allocated_external_ip_fails( .expect("Failed to stop instance"); // Simulate the transition, wait until it is in fact stopped. 
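The simulation helpers in these tests all converge on the same pattern, sketched here and applied in the hunk below: resolve the instance's active VMM, then poke that VMM by its Propolis ID.

    let info = nexus
        .active_instance_info(&instance_id, None)
        .await
        .unwrap()
        .expect("running instance should have an active VMM");
    info.sled_client.vmm_finish_transition(info.propolis_id).await;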
- let sa = nexus - .instance_sled_by_id(&instance_id) + let info = nexus + .active_instance_info(&instance_id, None) .await .unwrap() .expect("running instance should be on a sled"); - sa.instance_finish_transition(instance.identity.id).await; + info.sled_client.vmm_finish_transition(info.propolis_id).await; instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Delete the instance diff --git a/nexus/tests/integration_tests/pantry.rs b/nexus/tests/integration_tests/pantry.rs index d77ad49db6..22d35b01b5 100644 --- a/nexus/tests/integration_tests/pantry.rs +++ b/nexus/tests/integration_tests/pantry.rs @@ -88,12 +88,12 @@ async fn set_instance_state( } async fn instance_simulate(nexus: &Arc, id: &InstanceUuid) { - let sa = nexus - .instance_sled_by_id(id) + let info = nexus + .active_instance_info(id, None) .await .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(id.into_untyped_uuid()).await; + info.sled_client.vmm_finish_transition(info.propolis_id).await; } async fn disk_get(client: &ClientTestContext, disk_url: &str) -> Disk { From 6e574f5a52e30199f482e31c1ad786f35b91e4d9 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 23 Aug 2024 18:41:17 +0000 Subject: [PATCH 04/16] turn cpapi_instances_put on its head --- common/src/api/internal/nexus.rs | 1 + nexus/db-queries/src/db/datastore/vmm.rs | 5 - nexus/internal-api/src/lib.rs | 16 ++- .../app/background/tasks/instance_watcher.rs | 7 +- nexus/src/app/instance.rs | 121 +++++++++--------- nexus/src/app/sagas/instance_migrate.rs | 7 - nexus/src/app/sagas/instance_start.rs | 8 -- nexus/src/app/sagas/instance_update/mod.rs | 2 - nexus/src/internal_api/http_entrypoints.rs | 8 +- openapi/nexus-internal.json | 75 ++++++----- sled-agent/src/fakes/nexus.rs | 15 +-- sled-agent/src/instance.rs | 5 +- sled-agent/src/sim/instance.rs | 3 +- 13 files changed, 126 insertions(+), 147 deletions(-) diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 7f4eb358a4..143d7dd5b9 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -117,6 +117,7 @@ pub struct VmmRuntimeState { /// specific VMM and the instance it incarnates. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct SledInstanceState { + // TODO(gjc) this is probably redundant now /// The ID of the VMM whose state is being reported. pub propolis_id: PropolisUuid, diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 14c3405a70..f909b7e9d3 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -5,7 +5,6 @@ //! [`DataStore`] helpers for working with VMM records. use super::DataStore; -use crate::authz; use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; @@ -108,14 +107,10 @@ impl DataStore { pub async fn vmm_fetch( &self, opctx: &OpContext, - authz_instance: &authz::Instance, vmm_id: &PropolisUuid, ) -> LookupResult { - opctx.authorize(authz::Action::Read, authz_instance).await?; - let vmm = dsl::vmm .filter(dsl::id.eq(vmm_id.into_untyped_uuid())) - .filter(dsl::instance_id.eq(authz_instance.id())) .filter(dsl::time_deleted.is_null()) .select(Vmm::as_select()) .get_result_async(&*self.pool_connection_authorized(opctx).await?) 
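For reference, the end state of the endpoint rewritten in the next file is roughly the following trait method (a sketch; the request-context and response types are assumed to follow the trait's existing conventions):

    /// Report updated state for a VMM.
    #[endpoint {
        method = PUT,
        path = "/vmms/{propolis_id}",
    }]
    async fn cpapi_instances_put(
        rqctx: RequestContext<Self::Context>,
        path_params: Path<VmmPathParam>,
        new_runtime_state: TypedBody<SledInstanceState>,
    ) -> Result<HttpResponseUpdatedNoContent, HttpError>;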
diff --git a/nexus/internal-api/src/lib.rs b/nexus/internal-api/src/lib.rs index 7ac3e42f57..47ca4f250a 100644 --- a/nexus/internal-api/src/lib.rs +++ b/nexus/internal-api/src/lib.rs @@ -39,8 +39,8 @@ use omicron_common::{ update::ArtifactId, }; use omicron_uuid_kinds::{ - DemoSagaUuid, DownstairsKind, SledUuid, TypedUuid, UpstairsKind, - UpstairsRepairKind, + DemoSagaUuid, DownstairsKind, PropolisUuid, SledUuid, TypedUuid, + UpstairsKind, UpstairsRepairKind, }; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -108,14 +108,14 @@ pub trait NexusInternalApi { body: TypedBody, ) -> Result, HttpError>; - /// Report updated state for an instance. + /// Report updated state for a VMM. #[endpoint { method = PUT, - path = "/instances/{instance_id}", + path = "/vmms/{propolis_id}", }] async fn cpapi_instances_put( rqctx: RequestContext, - path_params: Path, + path_params: Path, new_runtime_state: TypedBody, ) -> Result; @@ -568,6 +568,12 @@ pub struct InstancePathParam { pub instance_id: Uuid, } +/// Path parameters for VMM requests (internal API) +#[derive(Deserialize, JsonSchema)] +pub struct VmmPathParam { + pub propolis_id: PropolisUuid, +} + #[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] pub struct CollectorIdPathParams { /// The ID of the oximeter collector. diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index 6195051db9..51dbb40831 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -21,7 +21,6 @@ use omicron_common::api::external::Error; use omicron_common::api::external::InstanceState; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_uuid_kinds::GenericUuid; -use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; use oximeter::types::ProducerRegistry; use sled_agent_client::Client as SledAgentClient; @@ -158,10 +157,10 @@ impl InstanceWatcher { "updating instance state"; "state" => ?new_runtime_state.vmm_state.state, ); - match crate::app::instance::notify_instance_updated( + match crate::app::instance::process_vmm_update( &datastore, &opctx, - InstanceUuid::from_untyped_uuid(target.instance_id), + PropolisUuid::from_untyped_uuid(target.vmm_id), &new_runtime_state, ) .await @@ -175,7 +174,7 @@ impl InstanceWatcher { _ => Err(Incomplete::UpdateFailed), }; } - Ok(Some(saga)) => { + Ok(Some((_, saga))) => { check.update_saga_queued = true; if let Err(e) = sagas.saga_start(saga).await { warn!(opctx.log, "update saga failed"; "error" => ?e); diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 7715c01cde..054c16a871 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -553,7 +553,6 @@ impl super::Nexus { if let Err(e) = self .instance_request_state( opctx, - &authz_instance, state.instance(), state.vmm(), InstanceStateChangeRequest::Reboot, @@ -632,7 +631,6 @@ impl super::Nexus { if let Err(e) = self .instance_request_state( opctx, - &authz_instance, state.instance(), state.vmm(), InstanceStateChangeRequest::Stop, @@ -830,13 +828,10 @@ impl super::Nexus { pub(crate) async fn instance_request_state( &self, opctx: &OpContext, - authz_instance: &authz::Instance, prev_instance_state: &db::model::Instance, prev_vmm_state: &Option, requested: InstanceStateChangeRequest, ) -> Result<(), InstanceStateChangeError> { - let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - match self.select_runtime_change_action( 
prev_instance_state, prev_vmm_state, @@ -868,7 +863,7 @@ impl super::Nexus { // Ok(None) here, in which case, there's nothing to write back. match instance_put_result { Ok(Some(ref state)) => self - .notify_instance_updated(opctx, instance_id, state) + .notify_vmm_updated(opctx, propolis_id, state) .await .map_err(Into::into), Ok(None) => Ok(()), @@ -1147,8 +1142,7 @@ impl super::Nexus { match instance_register_result { Ok(state) => { - self.notify_instance_updated(opctx, instance_id, &state) - .await?; + self.notify_vmm_updated(opctx, *propolis_id, &state).await?; } Err(e) => { if e.instance_unhealthy() { @@ -1327,19 +1321,22 @@ impl super::Nexus { /// Invoked by a sled agent to publish an updated runtime state for an /// Instance. - pub(crate) async fn notify_instance_updated( + pub(crate) async fn notify_vmm_updated( &self, opctx: &OpContext, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, new_runtime_state: &nexus::SledInstanceState, ) -> Result<(), Error> { - let saga = notify_instance_updated( + let Some((instance_id, saga)) = process_vmm_update( &self.db_datastore, opctx, - instance_id, + propolis_id, new_runtime_state, ) - .await?; + .await? + else { + return Ok(()); + }; // We don't need to wait for the instance update saga to run to // completion to return OK to the sled-agent --- all it needs to care @@ -1350,53 +1347,51 @@ impl super::Nexus { // one is eventually executed. // // Therefore, just spawn the update saga in a new task, and return. - if let Some(saga) = saga { - info!(opctx.log, "starting update saga for {instance_id}"; - "instance_id" => %instance_id, - "vmm_state" => ?new_runtime_state.vmm_state, - "migration_state" => ?new_runtime_state.migrations(), - ); - let sagas = self.sagas.clone(); - let task_instance_updater = - self.background_tasks.task_instance_updater.clone(); - let log = opctx.log.clone(); - tokio::spawn(async move { - // TODO(eliza): maybe we should use the lower level saga API so - // we can see if the saga failed due to the lock being held and - // retry it immediately? - let running_saga = async move { - let runnable_saga = sagas.saga_prepare(saga).await?; - runnable_saga.start().await - } - .await; - let result = match running_saga { - Err(error) => { - error!(&log, "failed to start update saga for {instance_id}"; - "instance_id" => %instance_id, - "error" => %error, - ); - // If we couldn't start the update saga for this - // instance, kick the instance-updater background task - // to try and start it again in a timely manner. - task_instance_updater.activate(); - return; - } - Ok(saga) => { - saga.wait_until_stopped().await.into_omicron_result() - } - }; - if let Err(error) = result { - error!(&log, "update saga for {instance_id} failed"; + info!(opctx.log, "starting update saga for {instance_id}"; + "instance_id" => %instance_id, + "vmm_state" => ?new_runtime_state.vmm_state, + "migration_state" => ?new_runtime_state.migrations(), + ); + let sagas = self.sagas.clone(); + let task_instance_updater = + self.background_tasks.task_instance_updater.clone(); + let log = opctx.log.clone(); + tokio::spawn(async move { + // TODO(eliza): maybe we should use the lower level saga API so + // we can see if the saga failed due to the lock being held and + // retry it immediately? 
+ let running_saga = async move { + let runnable_saga = sagas.saga_prepare(saga).await?; + runnable_saga.start().await + } + .await; + let result = match running_saga { + Err(error) => { + error!(&log, "failed to start update saga for {instance_id}"; "instance_id" => %instance_id, "error" => %error, ); - // If we couldn't complete the update saga for this + // If we couldn't start the update saga for this // instance, kick the instance-updater background task // to try and start it again in a timely manner. task_instance_updater.activate(); + return; } - }); - } + Ok(saga) => { + saga.wait_until_stopped().await.into_omicron_result() + } + }; + if let Err(error) = result { + error!(&log, "update saga for {instance_id} failed"; + "instance_id" => %instance_id, + "error" => %error, + ); + // If we couldn't complete the update saga for this + // instance, kick the instance-updater background task + // to try and start it again in a timely manner. + task_instance_updater.activate(); + } + }); Ok(()) } @@ -1839,18 +1834,16 @@ impl super::Nexus { /// Invoked by a sled agent to publish an updated runtime state for an /// Instance, returning an update saga for that instance (if one must be /// executed). -pub(crate) async fn notify_instance_updated( +pub(crate) async fn process_vmm_update( datastore: &DataStore, opctx: &OpContext, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, new_runtime_state: &nexus::SledInstanceState, -) -> Result, Error> { +) -> Result, Error> { use sagas::instance_update; let migrations = new_runtime_state.migrations(); - let propolis_id = new_runtime_state.propolis_id; info!(opctx.log, "received new VMM runtime state from sled agent"; - "instance_id" => %instance_id, "propolis_id" => %propolis_id, "vmm_state" => ?new_runtime_state.vmm_state, "migration_state" => ?migrations, @@ -1870,10 +1863,18 @@ pub(crate) async fn notify_instance_updated( // prepare and return it. 
if instance_update::update_saga_needed( &opctx.log, - instance_id, new_runtime_state, &result, ) { + let instance_id = InstanceUuid::from_untyped_uuid( + datastore.vmm_fetch(&opctx, &propolis_id).await?.instance_id, + ); + + info!(opctx.log, "scheduling update saga in response to vmm update"; + "instance_id" => %instance_id, + "propolis_id" => %propolis_id, + ); + let (.., authz_instance) = LookupPath::new(&opctx, datastore) .instance_id(instance_id.into_untyped_uuid()) .lookup_for(authz::Action::Modify) @@ -1884,7 +1885,7 @@ pub(crate) async fn notify_instance_updated( authz_instance, }, )?; - Ok(Some(saga)) + Ok(Some((instance_id, saga))) } else { Ok(None) } diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index e1c198f25e..24d11fcae2 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -490,12 +490,6 @@ async fn sim_instance_migrate( let src_propolis_id = db_instance.runtime().propolis_id.unwrap(); let dst_vmm = sagactx.lookup::("dst_vmm_record")?; - let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) - .instance_id(db_instance.id()) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; - info!(osagactx.log(), "initiating migration from destination sled"; "instance_id" => %db_instance.id(), "dst_vmm_record" => ?dst_vmm, @@ -519,7 +513,6 @@ async fn sim_instance_migrate( .nexus() .instance_request_state( &opctx, - &authz_instance, &db_instance, &Some(dst_vmm), InstanceStateChangeRequest::Migrate( diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index bd948ff668..b6b78bd43c 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -646,7 +646,6 @@ async fn sis_ensure_running( ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; - let datastore = osagactx.datastore(); let opctx = crate::context::op_context_for_saga_action( &sagactx, ¶ms.serialized_authn, @@ -661,17 +660,10 @@ async fn sis_ensure_running( "instance_id" => %instance_id, "sled_id" => %sled_id); - let (.., authz_instance) = LookupPath::new(&opctx, &datastore) - .instance_id(instance_id.into_untyped_uuid()) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; - match osagactx .nexus() .instance_request_state( &opctx, - &authz_instance, &db_instance, &Some(db_vmm), crate::app::instance::InstanceStateChangeRequest::Run, diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 5f226480b8..020e541bb1 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -407,7 +407,6 @@ mod destroyed; /// VMM/migration states. 
pub fn update_saga_needed( log: &slog::Logger, - instance_id: InstanceUuid, state: &SledInstanceState, result: &VmmStateUpdateResult, ) -> bool { @@ -443,7 +442,6 @@ pub fn update_saga_needed( debug!(log, "new VMM runtime state from sled agent requires an \ instance-update saga"; - "instance_id" => %instance_id, "propolis_id" => %state.propolis_id, "vmm_needs_update" => vmm_needs_update, "migration_in_needs_update" => migration_in_needs_update, diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 9965b6e21e..87bded2281 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -168,7 +168,7 @@ impl NexusInternalApi for NexusInternalApiImpl { async fn cpapi_instances_put( rqctx: RequestContext, - path_params: Path, + path_params: Path, new_runtime_state: TypedBody, ) -> Result { let apictx = &rqctx.context().context; @@ -178,11 +178,7 @@ impl NexusInternalApi for NexusInternalApiImpl { let opctx = crate::context::op_context_for_internal_api(&rqctx).await; let handler = async { nexus - .notify_instance_updated( - &opctx, - InstanceUuid::from_untyped_uuid(path.instance_id), - &new_state, - ) + .notify_vmm_updated(&opctx, path.propolis_id, &new_state) .await?; Ok(HttpResponseUpdatedNoContent()) }; diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 54b4822e51..5a5643a678 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -746,44 +746,6 @@ } } }, - "/instances/{instance_id}": { - "put": { - "summary": "Report updated state for an instance.", - "operationId": "cpapi_instances_put", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "type": "string", - "format": "uuid" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SledInstanceState" - } - } - }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/instances/{instance_id}/migrate": { "post": { "operationId": "instance_migrate", @@ -1470,6 +1432,43 @@ } } }, + "/vmms/{propolis_id}": { + "put": { + "summary": "Report updated state for a VMM.", + "operationId": "cpapi_instances_put", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledInstanceState" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/volume/{volume_id}/remove-read-only-parent": { "post": { "summary": "Request removal of a read_only_parent from a volume.", diff --git a/sled-agent/src/fakes/nexus.rs b/sled-agent/src/fakes/nexus.rs index 246ef07b60..9221a4b267 100644 --- a/sled-agent/src/fakes/nexus.rs +++ b/sled-agent/src/fakes/nexus.rs @@ -18,9 +18,10 @@ use omicron_common::api::external::Error; use omicron_common::api::internal::nexus::{ SledInstanceState, UpdateArtifactId, }; -use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::{OmicronZoneUuid, PropolisUuid}; use schemars::JsonSchema; use serde::Deserialize; +use 
sled_agent_api::VmmPathParam; use uuid::Uuid; /// Implements a fake Nexus. @@ -50,7 +51,7 @@ pub trait FakeNexusServer: Send + Sync { fn cpapi_instances_put( &self, - _instance_id: Uuid, + _propolis_id: PropolisUuid, _new_runtime_state: SledInstanceState, ) -> Result<(), Error> { Err(Error::internal_error("Not implemented")) @@ -118,22 +119,18 @@ async fn sled_agent_put( Ok(HttpResponseUpdatedNoContent()) } -#[derive(Deserialize, JsonSchema)] -struct InstancePathParam { - instance_id: Uuid, -} #[endpoint { method = PUT, - path = "/instances/{instance_id}", + path = "/vmms/{propolis_id}", }] async fn cpapi_instances_put( request_context: RequestContext, - path_params: Path, + path_params: Path, new_runtime_state: TypedBody, ) -> Result { let context = request_context.context(); context.cpapi_instances_put( - path_params.into_inner().instance_id, + path_params.into_inner().propolis_id, new_runtime_state.into_inner(), )?; Ok(HttpResponseUpdatedNoContent()) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index cb9cf7e5aa..4342572c4c 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -514,12 +514,13 @@ impl InstanceRunner { let state = self.state.sled_instance_state(); info!(self.log, "Publishing instance state update to Nexus"; "instance_id" => %self.instance_id(), + "propolis_id" => %self.state.propolis_id(), "state" => ?state, ); self.nexus_client .cpapi_instances_put( - &self.instance_id().into_untyped_uuid(), + &self.state.propolis_id(), &state.into(), ) .await @@ -1613,7 +1614,7 @@ mod tests { impl FakeNexusServer for NexusServer { fn cpapi_instances_put( &self, - _instance_id: Uuid, + _propolis_id: PropolisUuid, new_runtime_state: SledInstanceState, ) -> Result<(), omicron_common::api::external::Error> { self.observed_runtime_state diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 33bc1c40c1..75b0f4e195 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -15,6 +15,7 @@ use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::external::ResourceType; use omicron_common::api::internal::nexus::{SledInstanceState, VmmState}; +use omicron_uuid_kinds::{GenericUuid, PropolisUuid}; use propolis_client::types::{ InstanceMigrateStatusResponse as PropolisMigrateResponse, InstanceMigrationStatus as PropolisMigrationStatus, @@ -512,7 +513,7 @@ impl Simulatable for SimInstance { ) -> Result<(), Error> { nexus_client .cpapi_instances_put( - id, + &PropolisUuid::from_untyped_uuid(*id), &nexus_client::types::SledInstanceState::from(current), ) .await From d56af4f829baca876aebd453c77964ff66f2c8c0 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 23 Aug 2024 19:46:51 +0000 Subject: [PATCH 05/16] fix instance ip tests --- nexus/src/app/sagas/instance_ip_attach.rs | 30 +++++++++++++++++------ nexus/src/app/sagas/instance_ip_detach.rs | 14 ++++++++--- nexus/src/app/sagas/test_helpers.rs | 2 +- sled-agent/src/sim/sled_agent.rs | 7 +++--- 4 files changed, 38 insertions(+), 15 deletions(-) diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index b2a03e268e..6a456e7c44 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -429,8 +429,14 @@ pub(crate) mod test { } // Sled agent has a record of the new external IPs. 
+ let (propolis_id, _) = + crate::app::sagas::test_helpers::instance_fetch_vmm_and_sled_ids( + cptestctx, + &instance_id, + ) + .await; let mut eips = sled_agent.external_ips.lock().await; - let my_eips = eips.entry(instance_id.into_untyped_uuid()).or_default(); + let my_eips = eips.entry(propolis_id).or_default(); assert!(my_eips .iter() .any(|v| matches!(v, InstanceExternalIpBody::Floating(_)))); @@ -451,7 +457,7 @@ pub(crate) mod test { pub(crate) async fn verify_clean_slate( cptestctx: &ControlPlaneTestContext, - instance_id: Uuid, + instance_id: InstanceUuid, ) { use nexus_db_queries::db::schema::external_ip::dsl; @@ -464,7 +470,7 @@ pub(crate) mod test { assert!(dsl::external_ip .filter(dsl::kind.eq(IpKind::Floating)) .filter(dsl::time_deleted.is_null()) - .filter(dsl::parent_id.eq(instance_id)) + .filter(dsl::parent_id.eq(instance_id.into_untyped_uuid())) .filter(dsl::state.ne(IpAttachState::Detached)) .select(ExternalIp::as_select()) .first_async::(&*conn) @@ -485,8 +491,14 @@ pub(crate) mod test { .is_none()); // No IP bindings remain on sled-agent. + let (propolis_id, _) = + crate::app::sagas::test_helpers::instance_fetch_vmm_and_sled_ids( + cptestctx, + &instance_id, + ) + .await; let mut eips = sled_agent.external_ips.lock().await; - let my_eips = eips.entry(instance_id).or_default(); + let my_eips = eips.entry(propolis_id).or_default(); assert!(my_eips.is_empty()); } @@ -505,9 +517,10 @@ pub(crate) mod test { let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); crate::app::sagas::test_helpers::instance_simulate( cptestctx, - &InstanceUuid::from_untyped_uuid(instance.identity.id), + &instance_id, ) .await; @@ -515,7 +528,7 @@ pub(crate) mod test { test_helpers::action_failure_can_unwind::( nexus, || Box::pin(new_test_params(&opctx, datastore, use_float) ), - || Box::pin(verify_clean_slate(&cptestctx, instance.id())), + || Box::pin(verify_clean_slate(&cptestctx, instance_id)), log, ) .await; @@ -537,9 +550,10 @@ pub(crate) mod test { let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); crate::app::sagas::test_helpers::instance_simulate( cptestctx, - &InstanceUuid::from_untyped_uuid(instance.identity.id), + &instance_id, ) .await; @@ -551,7 +565,7 @@ pub(crate) mod test { >( nexus, || Box::pin(new_test_params(&opctx, datastore, use_float)), - || Box::pin(verify_clean_slate(&cptestctx, instance.id())), + || Box::pin(verify_clean_slate(&cptestctx, instance_id)), log, ) .await; diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index d5f0b55528..4a4aa0c2d2 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -15,7 +15,7 @@ use nexus_db_model::IpAttachState; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; use omicron_common::api::external::NameOrId; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid, SledUuid}; +use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use ref_cast::RefCast; use serde::Deserialize; use serde::Serialize; @@ -168,7 +168,9 @@ async fn siid_get_instance_state( async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let sled_id = sagactx + .lookup::>("instance_state")? 
+ .map(|ids| ids.sled_id); let target_ip = sagactx.lookup::("target_ip")?; instance_ip_remove_nat( &sagactx, @@ -397,8 +399,14 @@ pub(crate) mod test { } // Sled agent has removed its records of the external IPs. + let (propolis_id, _) = + crate::app::sagas::test_helpers::instance_fetch_vmm_and_sled_ids( + cptestctx, + &instance_id, + ) + .await; let mut eips = sled_agent.external_ips.lock().await; - let my_eips = eips.entry(instance_id.into_untyped_uuid()).or_default(); + let my_eips = eips.entry(propolis_id).or_default(); assert!(my_eips.is_empty()); // DB only has record for SNAT. diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index 63bb3d2e66..3de1f3c8d2 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -220,7 +220,7 @@ pub async fn instance_fetch( db_state } -async fn instance_fetch_vmm_and_sled_ids( +pub(super) async fn instance_fetch_vmm_and_sled_ids( cptestctx: &ControlPlaneTestContext, instance_id: &InstanceUuid, ) -> (PropolisUuid, SledUuid) { diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 5b804d3da5..2cb26e1bb0 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -84,7 +84,8 @@ pub struct SledAgent { mock_propolis: Mutex>, PropolisClient)>>, /// lists of external IPs assigned to instances - pub external_ips: Mutex>>, + pub external_ips: + Mutex>>, pub vpc_routes: Mutex>, config: Config, fake_zones: Mutex, @@ -740,7 +741,7 @@ impl SledAgent { } let mut eips = self.external_ips.lock().await; - let my_eips = eips.entry(propolis_id.into_untyped_uuid()).or_default(); + let my_eips = eips.entry(propolis_id).or_default(); // High-level behaviour: this should always succeed UNLESS // trying to add a double ephemeral. @@ -773,7 +774,7 @@ impl SledAgent { } let mut eips = self.external_ips.lock().await; - let my_eips = eips.entry(propolis_id.into_untyped_uuid()).or_default(); + let my_eips = eips.entry(propolis_id).or_default(); my_eips.remove(&body_args); From 6e4ac97370a0e0f899704df6c5235f5c26c42e0e Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 23 Aug 2024 20:11:47 +0000 Subject: [PATCH 06/16] fix assorted uuid misuses --- nexus/src/app/sagas/snapshot_create.rs | 2 +- nexus/tests/integration_tests/instances.rs | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index 5c9ff7f2ef..540ab90e28 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -862,7 +862,7 @@ async fn ssc_send_snapshot_request_to_sled_agent( sled_agent_client .vmm_issue_disk_snapshot_request( &PropolisUuid::from_untyped_uuid(propolis_id), - &attach_instance_id, + ¶ms.disk_id, &VmmIssueDiskSnapshotRequestBody { snapshot_id }, ) .await diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index bd2afc97aa..a7228e0841 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -898,7 +898,7 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(instance.runtime.run_state, InstanceState::Migrating); // Move the target to "completed". 
- vmm_simulate_on_sled(cptestctx, nexus, dst_sled_id, src_propolis_id).await; + vmm_simulate_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id).await; instance_wait_for_state(&client, instance_id, InstanceState::Running).await; @@ -995,7 +995,8 @@ async fn test_instance_migrate_v2p_and_routes( .active_instance_info(&instance_id, None) .await .unwrap() - .expect("running instance should have a sled").sled_id; + .expect("running instance should have a sled") + .sled_id; let mut sled_agents = vec![cptestctx.sled_agent.sled_agent.clone()]; sled_agents.extend(other_sleds.iter().map(|tup| tup.1.sled_agent.clone())); @@ -1465,8 +1466,10 @@ async fn test_instance_metrics_with_migration( migration_id, ) .await; - vmm_single_step_on_sled(cptestctx, nexus, original_sled, src_propolis_id).await; - vmm_single_step_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id).await; + vmm_single_step_on_sled(cptestctx, nexus, original_sled, src_propolis_id) + .await; + vmm_single_step_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id) + .await; instance_wait_for_state(&client, instance_id, InstanceState::Migrating) .await; From 63d09901e9a2a8bea39b5dcff98c95d3acee6718 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 23 Aug 2024 21:47:44 +0000 Subject: [PATCH 07/16] remove unnecessary propolis_id --- clients/nexus-client/src/lib.rs | 1 - clients/sled-agent-client/src/lib.rs | 1 - common/src/api/internal/nexus.rs | 4 ---- nexus/src/app/instance.rs | 1 + nexus/src/app/sagas/instance_update/mod.rs | 3 ++- openapi/nexus-internal.json | 17 ++++------------- openapi/sled-agent.json | 9 --------- sled-agent/src/common/instance.rs | 21 ++++----------------- sled-agent/src/instance.rs | 11 ++++------- sled-agent/src/sim/collection.rs | 3 --- sled-agent/src/sim/instance.rs | 1 - sled-agent/src/sim/sled_agent.rs | 1 - 12 files changed, 15 insertions(+), 58 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 62366c45e1..d65de072f7 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -138,7 +138,6 @@ impl From s: omicron_common::api::internal::nexus::SledInstanceState, ) -> Self { Self { - propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), migration_in: s.migration_in.map(Into::into), migration_out: s.migration_out.map(Into::into), diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index cf2e69e82d..7a74145be2 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -167,7 +167,6 @@ impl From { fn from(s: types::SledInstanceState) -> Self { Self { - propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), migration_in: s.migration_in.map(Into::into), migration_out: s.migration_out.map(Into::into), diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 143d7dd5b9..60b820cb24 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -117,10 +117,6 @@ pub struct VmmRuntimeState { /// specific VMM and the instance it incarnates. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct SledInstanceState { - // TODO(gjc) this is probably redundant now - /// The ID of the VMM whose state is being reported. - pub propolis_id: PropolisUuid, - /// The most recent state of the sled's VMM process. 
pub vmm_state: VmmRuntimeState, diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 054c16a871..53c5259c99 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1863,6 +1863,7 @@ pub(crate) async fn process_vmm_update( // prepare and return it. if instance_update::update_saga_needed( &opctx.log, + propolis_id, new_runtime_state, &result, ) { diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 020e541bb1..be7a5e3b50 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -407,6 +407,7 @@ mod destroyed; /// VMM/migration states. pub fn update_saga_needed( log: &slog::Logger, + propolis_id: PropolisUuid, state: &SledInstanceState, result: &VmmStateUpdateResult, ) -> bool { @@ -442,7 +443,7 @@ pub fn update_saga_needed( debug!(log, "new VMM runtime state from sled agent requires an \ instance-update saga"; - "propolis_id" => %state.propolis_id, + "propolis_id" => %propolis_id, "vmm_needs_update" => vmm_needs_update, "migration_in_needs_update" => migration_in_needs_update, "migration_out_needs_update" => migration_out_needs_update, diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 5a5643a678..970c26c96b 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -5076,14 +5076,6 @@ } ] }, - "propolis_id": { - "description": "The ID of the VMM whose state is being reported.", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, "vmm_state": { "description": "The most recent state of the sled's VMM process.", "allOf": [ @@ -5094,7 +5086,6 @@ } }, "required": [ - "propolis_id", "vmm_state" ] }, @@ -5324,10 +5315,6 @@ "type": "string", "format": "uuid" }, - "TypedUuidForPropolisKind": { - "type": "string", - "format": "uuid" - }, "TypedUuidForSledKind": { "type": "string", "format": "uuid" @@ -5589,6 +5576,10 @@ ] } ] + }, + "TypedUuidForPropolisKind": { + "type": "string", + "format": "uuid" } }, "responses": { diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 1ea29e35dc..12e324ede1 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -4664,14 +4664,6 @@ } ] }, - "propolis_id": { - "description": "The ID of the VMM whose state is being reported.", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, "vmm_state": { "description": "The most recent state of the sled's VMM process.", "allOf": [ @@ -4682,7 +4674,6 @@ } }, "required": [ - "propolis_id", "vmm_state" ] }, diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index adbeb9158f..575f52066d 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -10,7 +10,6 @@ use omicron_common::api::internal::nexus::{ MigrationRuntimeState, MigrationState, SledInstanceState, VmmRuntimeState, VmmState, }; -use omicron_uuid_kinds::PropolisUuid; use propolis_client::types::{ InstanceMigrationStatus, InstanceState as PropolisApiState, InstanceStateMonitorResponse, MigrationState as PropolisMigrationState, @@ -21,7 +20,6 @@ use uuid::Uuid; #[derive(Clone, Debug)] pub struct InstanceStates { vmm: VmmRuntimeState, - propolis_id: PropolisUuid, migration_in: Option, migration_out: Option, } @@ -173,11 +171,7 @@ pub enum Action { } impl InstanceStates { - pub fn new( - vmm: VmmRuntimeState, - propolis_id: PropolisUuid, - migration_id: Option, - ) -> Self { + pub fn new(vmm: VmmRuntimeState, migration_id: 
Option) -> Self { // If this instance is created with a migration ID, we are the intended // target of a migration in. Set that up now. let migration_in = @@ -187,17 +181,13 @@ impl InstanceStates { gen: Generation::new(), time_updated: Utc::now(), }); - InstanceStates { vmm, propolis_id, migration_in, migration_out: None } + InstanceStates { vmm, migration_in, migration_out: None } } pub fn vmm(&self) -> &VmmRuntimeState { &self.vmm } - pub fn propolis_id(&self) -> PropolisUuid { - self.propolis_id - } - pub fn migration_in(&self) -> Option<&MigrationRuntimeState> { self.migration_in.as_ref() } @@ -212,7 +202,6 @@ impl InstanceStates { pub fn sled_instance_state(&self) -> SledInstanceState { SledInstanceState { vmm_state: self.vmm.clone(), - propolis_id: self.propolis_id, migration_in: self.migration_in.clone(), migration_out: self.migration_out.clone(), } @@ -377,7 +366,6 @@ mod test { use uuid::Uuid; fn make_instance() -> InstanceStates { - let propolis_id = PropolisUuid::new_v4(); let now = Utc::now(); let vmm = VmmRuntimeState { @@ -386,7 +374,7 @@ mod test { time_updated: now, }; - InstanceStates::new(vmm, propolis_id, None) + InstanceStates::new(vmm, None) } fn make_migration_source_instance() -> InstanceStates { @@ -406,7 +394,6 @@ mod test { } fn make_migration_target_instance() -> InstanceStates { - let propolis_id = PropolisUuid::new_v4(); let now = Utc::now(); let vmm = VmmRuntimeState { @@ -415,7 +402,7 @@ mod test { time_updated: now, }; - InstanceStates::new(vmm, propolis_id, Some(Uuid::new_v4())) + InstanceStates::new(vmm, Some(Uuid::new_v4())) } fn make_observed_state( diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 4342572c4c..e56f5a1822 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -514,15 +514,12 @@ impl InstanceRunner { let state = self.state.sled_instance_state(); info!(self.log, "Publishing instance state update to Nexus"; "instance_id" => %self.instance_id(), - "propolis_id" => %self.state.propolis_id(), + "propolis_id" => %self.propolis_id, "state" => ?state, ); self.nexus_client - .cpapi_instances_put( - &self.state.propolis_id(), - &state.into(), - ) + .cpapi_instances_put(&self.propolis_id, &state.into()) .await .map_err(|err| -> backoff::BackoffError { match &err { @@ -618,7 +615,7 @@ impl InstanceRunner { info!( self.log, "updated state after observing Propolis state change"; - "propolis_id" => %self.state.propolis_id(), + "propolis_id" => %self.propolis_id, "new_vmm_state" => ?self.state.vmm() ); @@ -1089,7 +1086,7 @@ impl Instance { dhcp_config, requested_disks: hardware.disks, cloud_init_bytes: hardware.cloud_init_bytes, - state: InstanceStates::new(vmm_runtime, propolis_id, migration_id), + state: InstanceStates::new(vmm_runtime, migration_id), running_state: None, nexus_client, storage, diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index aaa2092ceb..8421350887 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -396,14 +396,12 @@ mod test { use omicron_common::api::internal::nexus::VmmRuntimeState; use omicron_common::api::internal::nexus::VmmState; use omicron_test_utils::dev::test_setup_log; - use omicron_uuid_kinds::PropolisUuid; use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::instance::InstanceStateRequested; fn make_instance( logctx: &LogContext, ) -> (SimObject, Receiver<()>) { - let propolis_id = PropolisUuid::new_v4(); let vmm_state = VmmRuntimeState { state: VmmState::Starting, gen: 
Generation::new(), @@ -412,7 +410,6 @@ mod test { let state = SledInstanceState { vmm_state, - propolis_id, migration_in: None, migration_out: None, }; diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 75b0f4e195..8c13c7e9b9 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -454,7 +454,6 @@ impl Simulatable for SimInstance { inner: Arc::new(Mutex::new(SimInstanceInner { state: InstanceStates::new( current.vmm_state, - current.propolis_id, current.migration_in.map(|m| m.migration_id), ), last_response: InstanceStateMonitorResponse { diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 2cb26e1bb0..97c75c6b55 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -381,7 +381,6 @@ impl SledAgent { &propolis_id.into_untyped_uuid(), SledInstanceState { vmm_state: vmm_runtime, - propolis_id, migration_in, migration_out: None, }, From fadceea4b395a66dc78c90e7cda1ad0a655bdd52 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 23 Aug 2024 23:14:10 +0000 Subject: [PATCH 08/16] cleanup --- nexus/src/app/snapshot.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nexus/src/app/snapshot.rs b/nexus/src/app/snapshot.rs index c1a9825f07..57b8edd1f0 100644 --- a/nexus/src/app/snapshot.rs +++ b/nexus/src/app/snapshot.rs @@ -107,8 +107,11 @@ impl super::Nexus { .instance_fetch_with_vmm(&opctx, &authz_instance) .await?; + // If a Propolis _may_ exist, send the snapshot request there, + // otherwise use the pantry. instance_state.vmm().is_none() } else { + // This disk is not attached to an instance, use the pantry. true }; From f20099a6d775a123c20bb7a9d2b425cd267600f6 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Fri, 23 Aug 2024 23:23:19 +0000 Subject: [PATCH 09/16] explain why vmm record refetch is safe --- nexus/src/app/instance.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 53c5259c99..0012042d9e 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1861,6 +1861,13 @@ pub(crate) async fn process_vmm_update( // If an instance-update saga must be executed as a result of this update, // prepare and return it. + // + // It's safe to refetch the VMM record to get the instance ID at this point + // even though the VMM may now be Destroyed, because the record itself won't + // be deleted until the instance update saga retires it. If this update + // marked the VMM as Destroyed and then the fetch failed, then another + // update saga has already observed what this routine published, so there's + // no need to schedule another saga. if instance_update::update_saga_needed( &opctx.log, propolis_id, From 1f9aa265e210c217556e0f7ee0c112810730a2d7 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Sat, 24 Aug 2024 00:19:33 +0000 Subject: [PATCH 10/16] fmt, fix doc links --- nexus/src/app/background/tasks/abandoned_vmm_reaper.rs | 2 +- nexus/src/app/sagas/instance_update/mod.rs | 6 +++--- nexus/tests/integration_tests/disks.rs | 3 ++- sled-agent/api/src/lib.rs | 5 +---- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs index a81080ec75..541ede6e86 100644 --- a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs +++ b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs @@ -28,7 +28,7 @@ //! remains alive and continues to own its virtual provisioning resources. 
//! //! Cleanup of instance resources when an instance's *active* VMM is destroyed -//! is handled elsewhere, by `notify_instance_updated` and (eventually) the +//! is handled elsewhere, by `notify_vmm_updated` and (eventually) the //! `instance-update` saga. use crate::app::background::BackgroundTask; diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index be7a5e3b50..12bf95af41 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -236,7 +236,7 @@ //! updates is perhaps the simplest one: _avoiding unnecessary update sagas_. //! The `cpapi_instances_put` API endpoint and instance-watcher background tasks //! handle changes to VMM and migration states by calling the -//! [`notify_instance_updated`] method, which writes the new states to the +//! [`notify_vmm_updated`] method, which writes the new states to the //! database and (potentially) starts an update saga. Naively, this method would //! *always* start an update saga, but remember that --- as we discussed //! [above](#background) --- many VMM/migration state changes don't actually @@ -271,7 +271,7 @@ //! delayed. To improve the timeliness of update sagas, we will also explicitly //! activate the background task at any point where we know that an update saga //! *should* run but we were not able to run it. If an update saga cannot be -//! started, whether by [`notify_instance_updated`], a `start-instance-update` +//! started, whether by [`notify_vmm_updated`], a `start-instance-update` //! saga attempting to start its real saga, or an `instance-update` saga //! chaining into a new one as its last action, the `instance-watcher` //! background task is activated. Similarly, when a `start-instance-update` saga @@ -326,7 +326,7 @@ //! crate::app::db::datastore::DataStore::instance_updater_inherit_lock //! [instance_updater_unlock]: //! crate::app::db::datastore::DataStore::instance_updater_unlock -//! [`notify_instance_updated`]: crate::app::Nexus::notify_instance_updated +//! [`notify_vmm_updated`]: crate::app::Nexus::notify_vmm_updated //! //! [dist-locking]: //! https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index 900a849c94..fe6aab2770 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -188,7 +188,8 @@ async fn set_instance_state( } async fn instance_simulate(nexus: &Arc, id: &InstanceUuid) { - let info = nexus.active_instance_info(id, None) + let info = nexus + .active_instance_info(id, None) .await .unwrap() .expect("instance must be on a sled to simulate a state change"); diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 5787a0fd41..8c7a9d841d 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -296,10 +296,7 @@ pub trait SledAgentApi { rqctx: RequestContext, path_params: Path, body: TypedBody, - ) -> Result< - HttpResponseOk, - HttpError, - >; + ) -> Result, HttpError>; #[endpoint { method = PUT, From 6d3d8436ce3ea2f209aea21e73c417c51560a613 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Mon, 26 Aug 2024 17:07:50 +0000 Subject: [PATCH 11/16] PR feedback This is mostly renaming and cleanup; the main functional change is to the instance zone bundle collection routine, which now parses the input zone name to get a Propolis ID instead of converting all registered Propolis IDs to zone names to look for a match. 
--- clients/nexus-client/src/lib.rs | 8 +- clients/sled-agent-client/src/lib.rs | 6 +- common/src/api/internal/nexus.rs | 7 +- nexus/internal-api/src/lib.rs | 4 +- .../background/tasks/abandoned_vmm_reaper.rs | 4 +- .../app/background/tasks/instance_watcher.rs | 14 +- nexus/src/app/instance.rs | 50 +-- nexus/src/app/sagas/instance_common.rs | 19 +- nexus/src/app/sagas/instance_ip_attach.rs | 18 +- nexus/src/app/sagas/instance_ip_detach.rs | 16 +- nexus/src/app/sagas/instance_update/mod.rs | 12 +- nexus/src/app/sagas/test_helpers.rs | 22 +- nexus/src/internal_api/http_entrypoints.rs | 4 +- openapi/nexus-internal.json | 72 ++--- openapi/sled-agent.json | 284 +++++++++--------- sled-agent/api/src/lib.rs | 16 +- sled-agent/src/common/instance.rs | 6 +- sled-agent/src/fakes/nexus.rs | 8 +- sled-agent/src/http_entrypoints.rs | 16 +- sled-agent/src/instance.rs | 63 ++-- sled-agent/src/instance_manager.rs | 100 +++--- sled-agent/src/sim/collection.rs | 27 +- sled-agent/src/sim/http_entrypoints.rs | 18 +- sled-agent/src/sim/instance.rs | 38 ++- sled-agent/src/sim/sled_agent.rs | 45 ++- sled-agent/src/sled_agent.rs | 19 +- sled-agent/types/src/instance.rs | 37 ++- 27 files changed, 469 insertions(+), 464 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index d65de072f7..bbeb79f67f 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -131,12 +131,10 @@ impl From } } -impl From - for types::SledInstanceState +impl From + for types::SledVmmState { - fn from( - s: omicron_common::api::internal::nexus::SledInstanceState, - ) -> Self { + fn from(s: omicron_common::api::internal::nexus::SledVmmState) -> Self { Self { vmm_state: s.vmm_state.into(), migration_in: s.migration_in.map(Into::into), diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 7a74145be2..b14cf5a96f 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -162,10 +162,10 @@ impl From } } -impl From - for omicron_common::api::internal::nexus::SledInstanceState +impl From + for omicron_common::api::internal::nexus::SledVmmState { - fn from(s: types::SledInstanceState) -> Self { + fn from(s: types::SledVmmState) -> Self { Self { vmm_state: s.vmm_state.into(), migration_in: s.migration_in.map(Into::into), diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 60b820cb24..43e6f0f97d 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -113,10 +113,9 @@ pub struct VmmRuntimeState { pub time_updated: DateTime, } -/// A wrapper type containing a sled's total knowledge of the state of a -/// specific VMM and the instance it incarnates. +/// A wrapper type containing a sled's total knowledge of the state of a VMM. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] -pub struct SledInstanceState { +pub struct SledVmmState { /// The most recent state of the sled's VMM process. 
pub vmm_state: VmmRuntimeState, @@ -139,7 +138,7 @@ impl Migrations<'_> { } } -impl SledInstanceState { +impl SledVmmState { pub fn migrations(&self) -> Migrations<'_> { Migrations { migration_in: self.migration_in.as_ref(), diff --git a/nexus/internal-api/src/lib.rs b/nexus/internal-api/src/lib.rs index 47ca4f250a..12e99ba23b 100644 --- a/nexus/internal-api/src/lib.rs +++ b/nexus/internal-api/src/lib.rs @@ -33,7 +33,7 @@ use omicron_common::{ DiskRuntimeState, DownstairsClientStopRequest, DownstairsClientStopped, ProducerEndpoint, ProducerRegistrationResponse, RepairFinishInfo, RepairProgress, - RepairStartInfo, SledInstanceState, + RepairStartInfo, SledVmmState, }, }, update::ArtifactId, @@ -116,7 +116,7 @@ pub trait NexusInternalApi { async fn cpapi_instances_put( rqctx: RequestContext, path_params: Path, - new_runtime_state: TypedBody, + new_runtime_state: TypedBody, ) -> Result; #[endpoint { diff --git a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs index 541ede6e86..ca6e7e4271 100644 --- a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs +++ b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs @@ -28,8 +28,8 @@ //! remains alive and continues to own its virtual provisioning resources. //! //! Cleanup of instance resources when an instance's *active* VMM is destroyed -//! is handled elsewhere, by `notify_vmm_updated` and (eventually) the -//! `instance-update` saga. +//! is handled elsewhere, by `process_vmm_update` and the `instance-update` +//! saga. use crate::app::background::BackgroundTask; use anyhow::Context; diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index 51dbb40831..ae78392ea3 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -19,7 +19,7 @@ use nexus_types::identity::Asset; use nexus_types::identity::Resource; use omicron_common::api::external::Error; use omicron_common::api::external::InstanceState; -use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_common::api::internal::nexus::SledVmmState; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; use oximeter::types::ProducerRegistry; @@ -81,10 +81,12 @@ impl InstanceWatcher { let client = client.clone(); async move { - slog::trace!(opctx.log, "checking on instance..."); - let rsp = client - .vmm_get_state(&PropolisUuid::from_untyped_uuid(target.vmm_id)) - .await; + let vmm_id = PropolisUuid::from_untyped_uuid(target.vmm_id); + slog::trace!( + opctx.log, "checking on VMM"; "propolis_id" => %vmm_id + ); + + let rsp = client.vmm_get_state(&vmm_id).await; let mut check = Check { target, outcome: Default::default(), @@ -149,7 +151,7 @@ impl InstanceWatcher { } }; - let new_runtime_state: SledInstanceState = state.into(); + let new_runtime_state: SledVmmState = state.into(); check.outcome = CheckOutcome::Success(new_runtime_state.vmm_state.state.into()); debug!( diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 0012042d9e..bcc242db77 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -60,7 +60,7 @@ use propolis_client::support::WebSocketStream; use sagas::instance_common::ExternalIpAttach; use sled_agent_client::types::InstanceMigrationTargetParams; use sled_agent_client::types::InstanceProperties; -use sled_agent_client::types::InstancePutStateBody; +use sled_agent_client::types::VmmPutStateBody; use std::matches; use 
std::net::SocketAddr; use std::sync::Arc; @@ -154,7 +154,7 @@ pub(crate) enum InstanceStateChangeRequest { } impl From - for sled_agent_client::types::InstanceStateRequested + for sled_agent_client::types::VmmStateRequested { fn from(value: InstanceStateChangeRequest) -> Self { match value { @@ -664,8 +664,7 @@ impl super::Nexus { &self, propolis_id: &PropolisUuid, sled_id: &SledUuid, - ) -> Result, InstanceStateChangeError> - { + ) -> Result, InstanceStateChangeError> { let sa = self.sled_client(&sled_id).await?; sa.vmm_unregister(propolis_id) .await @@ -846,7 +845,7 @@ impl super::Nexus { let instance_put_result = sa .vmm_put_state( &propolis_id, - &InstancePutStateBody { state: requested.into() }, + &VmmPutStateBody { state: requested.into() }, ) .await .map(|res| res.into_inner().updated_runtime.map(Into::into)) @@ -1325,7 +1324,7 @@ impl super::Nexus { &self, opctx: &OpContext, propolis_id: PropolisUuid, - new_runtime_state: &nexus::SledInstanceState, + new_runtime_state: &nexus::SledVmmState, ) -> Result<(), Error> { let Some((instance_id, saga)) = process_vmm_update( &self.db_datastore, @@ -1831,14 +1830,22 @@ impl super::Nexus { } } -/// Invoked by a sled agent to publish an updated runtime state for an -/// Instance, returning an update saga for that instance (if one must be -/// executed). +/// Publishes the VMM and migration state supplied in `new_runtime_state` to the +/// database. +/// +/// # Return value +/// +/// - `Ok(Some(instance_id, saga))` if the new VMM state obsoletes the current +/// instance state. The caller should execute the returned instance update +/// saga to reconcile the instance to the new VMM state. +/// - `Ok(None)` if the new state was successfully published but does not +/// require an instance update. +/// - `Err` if an error occurred. pub(crate) async fn process_vmm_update( datastore: &DataStore, opctx: &OpContext, propolis_id: PropolisUuid, - new_runtime_state: &nexus::SledInstanceState, + new_runtime_state: &nexus::SledVmmState, ) -> Result, Error> { use sagas::instance_update; @@ -1862,7 +1869,7 @@ pub(crate) async fn process_vmm_update( // If an instance-update saga must be executed as a result of this update, // prepare and return it. // - // It's safe to refetch the VMM record to get the instance ID at this point + // It's safe to fetch the VMM record to get the instance ID at this point // even though the VMM may now be Destroyed, because the record itself won't // be deleted until the instance update saga retires it. 
If this update // marked the VMM as Destroyed and then the fetch failed, then another @@ -1878,22 +1885,27 @@ pub(crate) async fn process_vmm_update( datastore.vmm_fetch(&opctx, &propolis_id).await?.instance_id, ); - info!(opctx.log, "scheduling update saga in response to vmm update"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - ); - let (.., authz_instance) = LookupPath::new(&opctx, datastore) .instance_id(instance_id.into_untyped_uuid()) .lookup_for(authz::Action::Modify) .await?; - let saga = instance_update::SagaInstanceUpdate::prepare( + + match instance_update::SagaInstanceUpdate::prepare( &instance_update::Params { serialized_authn: authn::saga::Serialized::for_opctx(opctx), authz_instance, }, - )?; - Ok(Some((instance_id, saga))) + ) { + Ok(saga) => Ok(Some((instance_id, saga))), + Err(e) => { + error!(opctx.log, "failed to prepare instance update saga"; + "error" => ?e, + "instance_id" => %instance_id, + "propolis_id" => %propolis_id); + + Err(e) + } + } } else { Ok(None) } diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index b649c369ad..049673d2ee 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -26,8 +26,8 @@ use super::NexusActionContext; const DEFAULT_PROPOLIS_PORT: u16 = 12400; #[derive(Clone, Debug, Serialize, Deserialize)] -pub(super) struct PropolisAndSledId { - pub(super) propolis_id: PropolisUuid, +pub(super) struct VmmAndSledIds { + pub(super) vmm_id: PropolisUuid, pub(super) sled_id: SledUuid, } @@ -224,7 +224,7 @@ pub(super) async fn instance_ip_get_instance_state( serialized_authn: &authn::saga::Serialized, authz_instance: &authz::Instance, verb: &str, -) -> Result, ActionError> { +) -> Result, ActionError> { // XXX: we can get instance state (but not sled ID) in same transaction // as attach (but not detach) wth current design. We need to re-query // for sled ID anyhow, so keep consistent between attach/detach. @@ -243,8 +243,8 @@ pub(super) async fn instance_ip_get_instance_state( let found_instance_state = inst_and_vmm.instance().runtime_state.nexus_state; let mut propolis_and_sled_id = - inst_and_vmm.vmm().as_ref().map(|vmm| PropolisAndSledId { - propolis_id: PropolisUuid::from_untyped_uuid(vmm.id), + inst_and_vmm.vmm().as_ref().map(|vmm| VmmAndSledIds { + vmm_id: PropolisUuid::from_untyped_uuid(vmm.id), sled_id: SledUuid::from_untyped_uuid(vmm.sled_id), }); @@ -456,13 +456,13 @@ pub async fn instance_ip_remove_nat( /// a double attach/detach (`!target_ip.do_saga`). pub(super) async fn instance_ip_add_opte( sagactx: &NexusActionContext, - propolis_and_sled: Option, + vmm_and_sled: Option, target_ip: ModifyStateForExternalIp, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); // No physical sled? Don't inform OPTE. - let Some(PropolisAndSledId { propolis_id, sled_id }) = propolis_and_sled + let Some(VmmAndSledIds { vmm_id: propolis_id, sled_id }) = vmm_and_sled else { return Ok(()); }; @@ -512,13 +512,14 @@ pub(super) async fn instance_ip_add_opte( /// a double attach/detach (`!target_ip.do_saga`). pub(super) async fn instance_ip_remove_opte( sagactx: &NexusActionContext, - propolis_and_sled: Option, + propolis_and_sled: Option, target_ip: ModifyStateForExternalIp, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); // No physical sled? Don't inform OPTE. 
- let Some(PropolisAndSledId { propolis_id, sled_id }) = propolis_and_sled + let Some(VmmAndSledIds { vmm_id: propolis_id, sled_id }) = + propolis_and_sled else { return Ok(()); }; diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 6a456e7c44..e6fb8654ea 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -5,7 +5,7 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_opte, ExternalIpAttach, - ModifyStateForExternalIp, PropolisAndSledId, + ModifyStateForExternalIp, VmmAndSledIds, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; @@ -161,7 +161,7 @@ async fn siia_begin_attach_ip_undo( async fn siia_get_instance_state( sagactx: NexusActionContext, -) -> Result, ActionError> { +) -> Result, ActionError> { let params = sagactx.saga_params::()?; instance_ip_get_instance_state( &sagactx, @@ -178,7 +178,7 @@ async fn siia_nat( ) -> Result, ActionError> { let params = sagactx.saga_params::()?; let sled_id = sagactx - .lookup::>("instance_state")? + .lookup::>("instance_state")? .map(|ids| ids.sled_id); let target_ip = sagactx.lookup::("target_ip")?; @@ -248,7 +248,7 @@ async fn siia_nat_undo( async fn siia_update_opte( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let ids = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; instance_ip_add_opte(&sagactx, ids, target_ip).await } @@ -257,7 +257,7 @@ async fn siia_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); - let ids = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; if let Err(e) = instance_ip_remove_opte(&sagactx, ids, target_ip).await { error!(log, "siia_update_opte_undo: failed to notify sled-agent: {e}"); @@ -429,14 +429,14 @@ pub(crate) mod test { } // Sled agent has a record of the new external IPs. - let (propolis_id, _) = + let VmmAndSledIds { vmm_id, .. } = crate::app::sagas::test_helpers::instance_fetch_vmm_and_sled_ids( cptestctx, &instance_id, ) .await; let mut eips = sled_agent.external_ips.lock().await; - let my_eips = eips.entry(propolis_id).or_default(); + let my_eips = eips.entry(vmm_id).or_default(); assert!(my_eips .iter() .any(|v| matches!(v, InstanceExternalIpBody::Floating(_)))); @@ -491,14 +491,14 @@ pub(crate) mod test { .is_none()); // No IP bindings remain on sled-agent. - let (propolis_id, _) = + let VmmAndSledIds { vmm_id, .. 
} = crate::app::sagas::test_helpers::instance_fetch_vmm_and_sled_ids( cptestctx, &instance_id, ) .await; let mut eips = sled_agent.external_ips.lock().await; - let my_eips = eips.entry(propolis_id).or_default(); + let my_eips = eips.entry(vmm_id).or_default(); assert!(my_eips.is_empty()); } diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 4a4aa0c2d2..d9da9fc05c 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -5,7 +5,7 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, - ModifyStateForExternalIp, PropolisAndSledId, + ModifyStateForExternalIp, VmmAndSledIds, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; @@ -155,7 +155,7 @@ async fn siid_begin_detach_ip_undo( async fn siid_get_instance_state( sagactx: NexusActionContext, -) -> Result, ActionError> { +) -> Result, ActionError> { let params = sagactx.saga_params::()?; instance_ip_get_instance_state( &sagactx, @@ -169,7 +169,7 @@ async fn siid_get_instance_state( async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { let params = sagactx.saga_params::()?; let sled_id = sagactx - .lookup::>("instance_state")? + .lookup::>("instance_state")? .map(|ids| ids.sled_id); let target_ip = sagactx.lookup::("target_ip")?; instance_ip_remove_nat( @@ -187,7 +187,7 @@ async fn siid_nat_undo( let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; let sled_id = sagactx - .lookup::>("instance_state")? + .lookup::>("instance_state")? .map(|ids| ids.sled_id); let target_ip = sagactx.lookup::("target_ip")?; if let Err(e) = instance_ip_add_nat( @@ -208,7 +208,7 @@ async fn siid_nat_undo( async fn siid_update_opte( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let ids = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; instance_ip_remove_opte(&sagactx, ids, target_ip).await } @@ -217,7 +217,7 @@ async fn siid_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); - let ids = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; if let Err(e) = instance_ip_add_opte(&sagactx, ids, target_ip).await { error!(log, "siid_update_opte_undo: failed to notify sled-agent: {e}"); @@ -399,14 +399,14 @@ pub(crate) mod test { } // Sled agent has removed its records of the external IPs. - let (propolis_id, _) = + let VmmAndSledIds { vmm_id, .. } = crate::app::sagas::test_helpers::instance_fetch_vmm_and_sled_ids( cptestctx, &instance_id, ) .await; let mut eips = sled_agent.external_ips.lock().await; - let my_eips = eips.entry(propolis_id).or_default(); + let my_eips = eips.entry(vmm_id).or_default(); assert!(my_eips.is_empty()); // DB only has record for SNAT. diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 12bf95af41..e8aa5cb5fc 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -236,9 +236,9 @@ //! updates is perhaps the simplest one: _avoiding unnecessary update sagas_. //! The `cpapi_instances_put` API endpoint and instance-watcher background tasks //! 
handle changes to VMM and migration states by calling the -//! [`notify_vmm_updated`] method, which writes the new states to the -//! database and (potentially) starts an update saga. Naively, this method would -//! *always* start an update saga, but remember that --- as we discussed +//! [`process_vmm_update`] method, which writes the new states to the database +//! and (potentially) starts an update saga. Naively, this method would *always* +//! start an update saga, but remember that --- as we discussed //! [above](#background) --- many VMM/migration state changes don't actually //! require modifying the instance record. For example, if an instance's VMM //! transitions from [`VmmState::Starting`] to [`VmmState::Running`], that @@ -326,7 +326,7 @@ //! crate::app::db::datastore::DataStore::instance_updater_inherit_lock //! [instance_updater_unlock]: //! crate::app::db::datastore::DataStore::instance_updater_unlock -//! [`notify_vmm_updated`]: crate::app::Nexus::notify_vmm_updated +//! [`process_vmm_update`]: crate::app::instance::process_vmm_update //! //! [dist-locking]: //! https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html @@ -362,7 +362,7 @@ use nexus_db_queries::{authn, authz}; use nexus_types::identity::Resource; use omicron_common::api::external::Error; use omicron_common::api::internal::nexus; -use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_common::api::internal::nexus::SledVmmState; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; @@ -408,7 +408,7 @@ mod destroyed; pub fn update_saga_needed( log: &slog::Logger, propolis_id: PropolisUuid, - state: &SledInstanceState, + state: &SledVmmState, result: &VmmStateUpdateResult, ) -> bool { // Currently, an instance-update saga is required if (and only if): diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index 3de1f3c8d2..1572ba4330 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -5,7 +5,7 @@ //! Helper functions for writing saga undo tests and working with instances in //! saga tests. 
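// Rough sketch of the gating described in the instance-update module docs
// above: only spawn an update saga when the reported VMM state can actually
// change the instance record. The enum and predicate are simplified
// stand-ins; the real `update_saga_needed` also considers migration state and
// whether the VMM or migration rows were actually updated by the write.
#[derive(Clone, Copy, Debug, PartialEq)]
enum VmmState {
    Starting,
    Running,
    Destroyed,
    Failed,
}

fn update_saga_needed(new_state: VmmState) -> bool {
    // Terminal states release sled resources and unlink the VMM from its
    // instance, so they need an update saga; transitions like
    // Starting -> Running do not touch the instance record.
    matches!(new_state, VmmState::Destroyed | VmmState::Failed)
}

fn main() {
    assert!(!update_saga_needed(VmmState::Running));
    assert!(update_saga_needed(VmmState::Destroyed));
}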
-use super::NexusSaga; +use super::{instance_common::VmmAndSledIds, NexusSaga}; use crate::{app::saga::create_saga_dag, Nexus}; use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use camino::Utf8Path; @@ -134,14 +134,14 @@ pub(crate) async fn instance_simulate( info!(&cptestctx.logctx.log, "Poking simulated instance"; "instance_id" => %instance_id); let nexus = &cptestctx.server.server_context().nexus; - let (propolis_id, sled_id) = + let VmmAndSledIds { vmm_id, sled_id } = instance_fetch_vmm_and_sled_ids(cptestctx, instance_id).await; let sa = nexus .sled_client(&sled_id) .await .expect("instance must be on a sled to simulate a state change"); - sa.vmm_finish_transition(propolis_id).await; + sa.vmm_finish_transition(vmm_id).await; } pub(crate) async fn instance_single_step_on_sled( @@ -156,14 +156,14 @@ pub(crate) async fn instance_single_step_on_sled( "sled_id" => %sled_id, ); let nexus = &cptestctx.server.server_context().nexus; - let (propolis_id, sled_id) = + let VmmAndSledIds { vmm_id, sled_id } = instance_fetch_vmm_and_sled_ids(cptestctx, instance_id).await; let sa = nexus .sled_client(&sled_id) .await .expect("instance must be on a sled to simulate a state change"); - sa.vmm_single_step(propolis_id).await; + sa.vmm_single_step(vmm_id).await; } pub(crate) async fn instance_simulate_by_name( @@ -187,13 +187,13 @@ pub(crate) async fn instance_simulate_by_name( nexus.instance_lookup(&opctx, instance_selector).unwrap(); let (.., instance) = instance_lookup.fetch().await.unwrap(); let instance_id = InstanceUuid::from_untyped_uuid(instance.id()); - let (propolis_id, sled_id) = + let VmmAndSledIds { vmm_id, sled_id } = instance_fetch_vmm_and_sled_ids(cptestctx, &instance_id).await; let sa = nexus .sled_client(&sled_id) .await .expect("instance must be on a sled to simulate a state change"); - sa.vmm_finish_transition(propolis_id).await; + sa.vmm_finish_transition(vmm_id).await; } pub async fn instance_fetch( @@ -223,16 +223,16 @@ pub async fn instance_fetch( pub(super) async fn instance_fetch_vmm_and_sled_ids( cptestctx: &ControlPlaneTestContext, instance_id: &InstanceUuid, -) -> (PropolisUuid, SledUuid) { +) -> VmmAndSledIds { let instance_and_vmm = instance_fetch(cptestctx, *instance_id).await; let vmm = instance_and_vmm .vmm() .as_ref() - .expect("simulating an instance requires an active vmm"); + .expect("can only fetch VMM and sled IDs for an active instance"); - let propolis_id = PropolisUuid::from_untyped_uuid(vmm.id); + let vmm_id = PropolisUuid::from_untyped_uuid(vmm.id); let sled_id = SledUuid::from_untyped_uuid(vmm.sled_id); - (propolis_id, sled_id) + VmmAndSledIds { vmm_id, sled_id } } pub async fn instance_fetch_all( diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 87bded2281..66a8090f11 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -52,7 +52,7 @@ use omicron_common::api::internal::nexus::ProducerRegistrationResponse; use omicron_common::api::internal::nexus::RepairFinishInfo; use omicron_common::api::internal::nexus::RepairProgress; use omicron_common::api::internal::nexus::RepairStartInfo; -use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_common::api::internal::nexus::SledVmmState; use omicron_common::update::ArtifactId; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; @@ -169,7 +169,7 @@ impl NexusInternalApi for NexusInternalApiImpl { async fn cpapi_instances_put( rqctx: RequestContext, 
path_params: Path, - new_runtime_state: TypedBody, + new_runtime_state: TypedBody, ) -> Result { let apictx = &rqctx.context().context; let nexus = &apictx.nexus; diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 970c26c96b..50bcf8e08e 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -1450,7 +1450,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledInstanceState" + "$ref": "#/components/schemas/SledVmmState" } } }, @@ -5054,41 +5054,6 @@ "id" ] }, - "SledInstanceState": { - "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", - "type": "object", - "properties": { - "migration_in": { - "nullable": true, - "description": "The current state of any inbound migration to this VMM.", - "allOf": [ - { - "$ref": "#/components/schemas/MigrationRuntimeState" - } - ] - }, - "migration_out": { - "nullable": true, - "description": "The state of any outbound migration from this VMM.", - "allOf": [ - { - "$ref": "#/components/schemas/MigrationRuntimeState" - } - ] - }, - "vmm_state": { - "description": "The most recent state of the sled's VMM process.", - "allOf": [ - { - "$ref": "#/components/schemas/VmmRuntimeState" - } - ] - } - }, - "required": [ - "vmm_state" - ] - }, "SledPolicy": { "description": "The operator-defined policy of a sled.", "oneOf": [ @@ -5203,6 +5168,41 @@ } ] }, + "SledVmmState": { + "description": "A wrapper type containing a sled's total knowledge of the state of a VMM.", + "type": "object", + "properties": { + "migration_in": { + "nullable": true, + "description": "The current state of any inbound migration to this VMM.", + "allOf": [ + { + "$ref": "#/components/schemas/MigrationRuntimeState" + } + ] + }, + "migration_out": { + "nullable": true, + "description": "The state of any outbound migration from this VMM.", + "allOf": [ + { + "$ref": "#/components/schemas/MigrationRuntimeState" + } + ] + }, + "vmm_state": { + "description": "The most recent state of the sled's VMM process.", + "allOf": [ + { + "$ref": "#/components/schemas/VmmRuntimeState" + } + ] + } + }, + "required": [ + "vmm_state" + ] + }, "SourceNatConfig": { "description": "An IP address and port range used for source NAT, i.e., making outbound network connections from guests or services.", "type": "object", diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 12e324ede1..ec2a8bfc4d 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -650,7 +650,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledInstanceState" + "$ref": "#/components/schemas/SledVmmState" } } } @@ -681,7 +681,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceUnregisterResponse" + "$ref": "#/components/schemas/VmmUnregisterResponse" } } } @@ -837,7 +837,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledInstanceState" + "$ref": "#/components/schemas/SledVmmState" } } } @@ -866,7 +866,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstancePutStateBody" + "$ref": "#/components/schemas/VmmPutStateBody" } } }, @@ -878,7 +878,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstancePutStateResponse" + "$ref": "#/components/schemas/VmmPutStateResponse" } } } @@ -3046,38 +3046,6 @@ "ncpus" ] }, - "InstancePutStateBody": { - "description": "The body of a request to move a 
previously-ensured instance into a specific runtime state.", - "type": "object", - "properties": { - "state": { - "description": "The state into which the instance should be driven.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceStateRequested" - } - ] - } - }, - "required": [ - "state" - ] - }, - "InstancePutStateResponse": { - "description": "The response sent from a request to move an instance into a specific runtime state.", - "type": "object", - "properties": { - "updated_runtime": { - "nullable": true, - "description": "The current runtime state of the instance after handling the request to change its state. If the instance's state did not change, this field is `None`.", - "allOf": [ - { - "$ref": "#/components/schemas/SledInstanceState" - } - ] - } - } - }, "InstanceRuntimeState": { "description": "The dynamic runtime properties of an instance: its current VMM ID (if any), migration information (if any), and the instance state to report if there is no active VMM.", "type": "object", @@ -3125,90 +3093,6 @@ "time_updated" ] }, - "InstanceStateRequested": { - "description": "Requestable running state of an Instance.\n\nA subset of [`omicron_common::api::external::InstanceState`].", - "oneOf": [ - { - "description": "Run this instance by migrating in from a previous running incarnation of the instance.", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "migration_target" - ] - }, - "value": { - "$ref": "#/components/schemas/InstanceMigrationTargetParams" - } - }, - "required": [ - "type", - "value" - ] - }, - { - "description": "Start the instance if it is not already running.", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "running" - ] - } - }, - "required": [ - "type" - ] - }, - { - "description": "Stop the instance.", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "stopped" - ] - } - }, - "required": [ - "type" - ] - }, - { - "description": "Immediately reset the instance, as though it had stopped and immediately began to run again.", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "reboot" - ] - } - }, - "required": [ - "type" - ] - } - ] - }, - "InstanceUnregisterResponse": { - "description": "The response sent from a request to unregister an instance.", - "type": "object", - "properties": { - "updated_runtime": { - "nullable": true, - "description": "The current state of the instance after handling the request to unregister it. 
If the instance's state did not change, this field is `None`.", - "allOf": [ - { - "$ref": "#/components/schemas/SledInstanceState" - } - ] - } - } - }, "Inventory": { "description": "Identity and basic status information about this sled agent", "type": "object", @@ -4642,8 +4526,27 @@ "sled_id" ] }, - "SledInstanceState": { - "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", + "SledRole": { + "description": "Describes the role of the sled within the rack.\n\nNote that this may change if the sled is physically moved within the rack.", + "oneOf": [ + { + "description": "The sled is a general compute sled.", + "type": "string", + "enum": [ + "gimlet" + ] + }, + { + "description": "The sled is attached to the network switch, and has additional responsibilities.", + "type": "string", + "enum": [ + "scrimlet" + ] + } + ] + }, + "SledVmmState": { + "description": "A wrapper type containing a sled's total knowledge of the state of a VMM.", "type": "object", "properties": { "migration_in": { @@ -4677,25 +4580,6 @@ "vmm_state" ] }, - "SledRole": { - "description": "Describes the role of the sled within the rack.\n\nNote that this may change if the sled is physically moved within the rack.", - "oneOf": [ - { - "description": "The sled is a general compute sled.", - "type": "string", - "enum": [ - "gimlet" - ] - }, - { - "description": "The sled is attached to the network switch, and has additional responsibilities.", - "type": "string", - "enum": [ - "scrimlet" - ] - } - ] - }, "Slot": { "description": "A stable index which is translated by Propolis into a PCI BDF, visible to the guest.\n\n
JSON schema\n\n```json { \"description\": \"A stable index which is translated by Propolis into a PCI BDF, visible to the guest.\", \"type\": \"integer\", \"format\": \"uint8\", \"minimum\": 0.0 } ```
", "type": "integer", @@ -4990,6 +4874,38 @@ "snapshot_id" ] }, + "VmmPutStateBody": { + "description": "The body of a request to move a previously-ensured instance into a specific runtime state.", + "type": "object", + "properties": { + "state": { + "description": "The state into which the instance should be driven.", + "allOf": [ + { + "$ref": "#/components/schemas/VmmStateRequested" + } + ] + } + }, + "required": [ + "state" + ] + }, + "VmmPutStateResponse": { + "description": "The response sent from a request to move an instance into a specific runtime state.", + "type": "object", + "properties": { + "updated_runtime": { + "nullable": true, + "description": "The current runtime state of the instance after handling the request to change its state. If the instance's state did not change, this field is `None`.", + "allOf": [ + { + "$ref": "#/components/schemas/SledVmmState" + } + ] + } + } + }, "VmmRuntimeState": { "description": "The dynamic runtime properties of an individual VMM process.", "type": "object", @@ -5083,6 +4999,90 @@ } ] }, + "VmmStateRequested": { + "description": "Requestable running state of an Instance.\n\nA subset of [`omicron_common::api::external::InstanceState`].", + "oneOf": [ + { + "description": "Run this instance by migrating in from a previous running incarnation of the instance.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "migration_target" + ] + }, + "value": { + "$ref": "#/components/schemas/InstanceMigrationTargetParams" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "description": "Start the instance if it is not already running.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "running" + ] + } + }, + "required": [ + "type" + ] + }, + { + "description": "Stop the instance.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "stopped" + ] + } + }, + "required": [ + "type" + ] + }, + { + "description": "Immediately reset the instance, as though it had stopped and immediately began to run again.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "reboot" + ] + } + }, + "required": [ + "type" + ] + } + ] + }, + "VmmUnregisterResponse": { + "description": "The response sent from a request to unregister an instance.", + "type": "object", + "properties": { + "updated_runtime": { + "nullable": true, + "description": "The current state of the instance after handling the request to unregister it. 
If the instance's state did not change, this field is `None`.", + "allOf": [ + { + "$ref": "#/components/schemas/SledVmmState" + } + ] + } + } + }, "Vni": { "description": "A Geneve Virtual Network Identifier", "type": "integer", diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 8c7a9d841d..410747bf46 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -15,7 +15,7 @@ use nexus_sled_agent_shared::inventory::{ }; use omicron_common::{ api::internal::{ - nexus::{DiskRuntimeState, SledInstanceState, UpdateArtifactId}, + nexus::{DiskRuntimeState, SledVmmState, UpdateArtifactId}, shared::{ ResolvedVpcRouteSet, ResolvedVpcRouteState, SledIdentifiers, SwitchPorts, VirtualNetworkInterfaceHost, @@ -36,8 +36,8 @@ use sled_agent_types::{ early_networking::EarlyNetworkConfig, firewall_rules::VpcFirewallRulesEnsureBody, instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstancePutStateBody, - InstancePutStateResponse, InstanceUnregisterResponse, + InstanceEnsureBody, InstanceExternalIpBody, VmmPutStateBody, + VmmPutStateResponse, VmmUnregisterResponse, }, sled::AddSledRequest, time_sync::TimeSync, @@ -218,7 +218,7 @@ pub trait SledAgentApi { rqctx: RequestContext, path_params: Path, body: TypedBody, - ) -> Result, HttpError>; + ) -> Result, HttpError>; #[endpoint { method = DELETE, @@ -227,7 +227,7 @@ pub trait SledAgentApi { async fn vmm_unregister( rqctx: RequestContext, path_params: Path, - ) -> Result, HttpError>; + ) -> Result, HttpError>; #[endpoint { method = PUT, @@ -236,8 +236,8 @@ pub trait SledAgentApi { async fn vmm_put_state( rqctx: RequestContext, path_params: Path, - body: TypedBody, - ) -> Result, HttpError>; + body: TypedBody, + ) -> Result, HttpError>; #[endpoint { method = GET, @@ -246,7 +246,7 @@ pub trait SledAgentApi { async fn vmm_get_state( rqctx: RequestContext, path_params: Path, - ) -> Result, HttpError>; + ) -> Result, HttpError>; #[endpoint { method = PUT, diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 575f52066d..f95bf0cb64 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -7,7 +7,7 @@ use chrono::{DateTime, Utc}; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::{ - MigrationRuntimeState, MigrationState, SledInstanceState, VmmRuntimeState, + MigrationRuntimeState, MigrationState, SledVmmState, VmmRuntimeState, VmmState, }; use propolis_client::types::{ @@ -199,8 +199,8 @@ impl InstanceStates { /// Creates a `SledInstanceState` structure containing the entirety of this /// structure's runtime state. This requires cloning; for simple read access /// use the `instance` or `vmm` accessors instead. 
- pub fn sled_instance_state(&self) -> SledInstanceState { - SledInstanceState { + pub fn sled_instance_state(&self) -> SledVmmState { + SledVmmState { vmm_state: self.vmm.clone(), migration_in: self.migration_in.clone(), migration_out: self.migration_out.clone(), diff --git a/sled-agent/src/fakes/nexus.rs b/sled-agent/src/fakes/nexus.rs index 9221a4b267..bd4680563e 100644 --- a/sled-agent/src/fakes/nexus.rs +++ b/sled-agent/src/fakes/nexus.rs @@ -15,9 +15,7 @@ use hyper::Body; use internal_dns::ServiceName; use nexus_client::types::SledAgentInfo; use omicron_common::api::external::Error; -use omicron_common::api::internal::nexus::{ - SledInstanceState, UpdateArtifactId, -}; +use omicron_common::api::internal::nexus::{SledVmmState, UpdateArtifactId}; use omicron_uuid_kinds::{OmicronZoneUuid, PropolisUuid}; use schemars::JsonSchema; use serde::Deserialize; @@ -52,7 +50,7 @@ pub trait FakeNexusServer: Send + Sync { fn cpapi_instances_put( &self, _propolis_id: PropolisUuid, - _new_runtime_state: SledInstanceState, + _new_runtime_state: SledVmmState, ) -> Result<(), Error> { Err(Error::internal_error("Not implemented")) } @@ -126,7 +124,7 @@ async fn sled_agent_put( async fn cpapi_instances_put( request_context: RequestContext, path_params: Path, - new_runtime_state: TypedBody, + new_runtime_state: TypedBody, ) -> Result { let context = request_context.context(); context.cpapi_instances_put( diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index d5a85fa960..221224a2e9 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -21,7 +21,7 @@ use nexus_sled_agent_shared::inventory::{ }; use omicron_common::api::external::Error; use omicron_common::api::internal::nexus::{ - DiskRuntimeState, SledInstanceState, UpdateArtifactId, + DiskRuntimeState, SledVmmState, UpdateArtifactId, }; use omicron_common::api::internal::shared::{ ResolvedVpcRouteSet, ResolvedVpcRouteState, SledIdentifiers, SwitchPorts, @@ -40,8 +40,8 @@ use sled_agent_types::disk::DiskEnsureBody; use sled_agent_types::early_networking::EarlyNetworkConfig; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstancePutStateBody, - InstancePutStateResponse, InstanceUnregisterResponse, + InstanceEnsureBody, InstanceExternalIpBody, VmmPutStateBody, + VmmPutStateResponse, VmmUnregisterResponse, }; use sled_agent_types::sled::AddSledRequest; use sled_agent_types::time_sync::TimeSync; @@ -297,7 +297,7 @@ impl SledAgentApi for SledAgentImpl { rqctx: RequestContext, path_params: Path, body: TypedBody, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let sa = rqctx.context(); let propolis_id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); @@ -318,7 +318,7 @@ impl SledAgentApi for SledAgentImpl { async fn vmm_unregister( rqctx: RequestContext, path_params: Path, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; Ok(HttpResponseOk(sa.instance_ensure_unregistered(id).await?)) @@ -327,8 +327,8 @@ impl SledAgentApi for SledAgentImpl { async fn vmm_put_state( rqctx: RequestContext, path_params: Path, - body: TypedBody, - ) -> Result, HttpError> { + body: TypedBody, + ) -> Result, HttpError> { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); @@ -338,7 +338,7 @@ impl SledAgentApi for SledAgentImpl { async fn vmm_get_state( 
rqctx: RequestContext, path_params: Path, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; Ok(HttpResponseOk(sa.instance_get_state(id).await?)) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index e56f5a1822..b035ef7e71 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -25,9 +25,7 @@ use illumos_utils::opte::{DhcpCfg, PortCreateParams, PortManager}; use illumos_utils::running_zone::{RunningZone, ZoneBuilderFactory}; use illumos_utils::svc::wait_for_service; use illumos_utils::zone::PROPOLIS_ZONE_PREFIX; -use omicron_common::api::internal::nexus::{ - SledInstanceState, VmmRuntimeState, -}; +use omicron_common::api::internal::nexus::{SledVmmState, VmmRuntimeState}; use omicron_common::api::internal::shared::{ NetworkInterface, ResolvedVpcFirewallRule, SledIdentifiers, SourceNatConfig, }; @@ -217,15 +215,15 @@ enum InstanceRequest { tx: oneshot::Sender>, }, CurrentState { - tx: oneshot::Sender, + tx: oneshot::Sender, }, PutState { - state: InstanceStateRequested, - tx: oneshot::Sender>, + state: VmmStateRequested, + tx: oneshot::Sender>, }, Terminate { mark_failed: bool, - tx: oneshot::Sender>, + tx: oneshot::Sender>, }, IssueSnapshotRequest { disk_id: Uuid, @@ -414,12 +412,12 @@ impl InstanceRunner { }, Some(PutState{ state, tx }) => { tx.send(self.put_state(state).await - .map(|r| InstancePutStateResponse { updated_runtime: Some(r) }) + .map(|r| VmmPutStateResponse { updated_runtime: Some(r) }) .map_err(|e| e.into())) .map_err(|_| Error::FailedSendClientClosed) }, Some(Terminate { mark_failed, tx }) => { - tx.send(Ok(InstanceUnregisterResponse { + tx.send(Ok(VmmUnregisterResponse { updated_runtime: Some(self.terminate(mark_failed).await) })) .map_err(|_| Error::FailedSendClientClosed) @@ -570,6 +568,7 @@ impl InstanceRunner { "Failed to publish instance state to Nexus: {}", err.to_string(); "instance_id" => %self.instance_id(), + "propolis_id" => %self.propolis_id, "retry_after" => ?delay); }, ) @@ -579,7 +578,8 @@ impl InstanceRunner { error!( self.log, "Failed to publish state to Nexus, will not retry: {:?}", e; - "instance_id" => %self.instance_id() + "instance_id" => %self.instance_id(), + "propolis_id" => %self.propolis_id, ); } } @@ -627,7 +627,8 @@ impl InstanceRunner { match action { Some(InstanceAction::Destroy) => { info!(self.log, "terminating VMM that has exited"; - "instance_id" => %self.instance_id()); + "instance_id" => %self.instance_id(), + "propolis_id" => %self.propolis_id); let mark_failed = false; self.terminate(mark_failed).await; Reaction::Terminate @@ -1129,7 +1130,7 @@ impl Instance { Ok(rx.await?) } - pub async fn current_state(&self) -> Result { + pub async fn current_state(&self) -> Result { let (tx, rx) = oneshot::channel(); self.tx .send(InstanceRequest::CurrentState { tx }) @@ -1151,8 +1152,8 @@ impl Instance { /// Rebooting to Running to Stopping to Stopped. pub async fn put_state( &self, - tx: oneshot::Sender>, - state: InstanceStateRequested, + tx: oneshot::Sender>, + state: VmmStateRequested, ) -> Result<(), Error> { self.tx .send(InstanceRequest::PutState { state, tx }) @@ -1165,7 +1166,7 @@ impl Instance { /// immediately transitions the instance to the Destroyed state. 
pub async fn terminate( &self, - tx: oneshot::Sender>, + tx: oneshot::Sender>, mark_failed: bool, ) -> Result<(), Error> { self.tx @@ -1241,7 +1242,7 @@ impl InstanceRunner { run_state.running_zone.root_zpool().map(|p| p.clone()) } - fn current_state(&self) -> SledInstanceState { + fn current_state(&self) -> SledVmmState { self.state.sled_instance_state() } @@ -1299,19 +1300,19 @@ impl InstanceRunner { async fn put_state( &mut self, - state: InstanceStateRequested, - ) -> Result { + state: VmmStateRequested, + ) -> Result { use propolis_client::types::InstanceStateRequested as PropolisRequest; let (propolis_state, next_published) = match state { - InstanceStateRequested::MigrationTarget(migration_params) => { + VmmStateRequested::MigrationTarget(migration_params) => { self.propolis_ensure(Some(migration_params)).await?; (None, None) } - InstanceStateRequested::Running => { + VmmStateRequested::Running => { self.propolis_ensure(None).await?; (Some(PropolisRequest::Run), None) } - InstanceStateRequested::Stopped => { + VmmStateRequested::Stopped => { // If the instance has not started yet, unregister it // immediately. Since there is no Propolis to push updates when // this happens, generate an instance record bearing the @@ -1327,7 +1328,7 @@ impl InstanceRunner { ) } } - InstanceStateRequested::Reboot => { + VmmStateRequested::Reboot => { if self.running_state.is_none() { return Err(Error::VmNotRunning(self.propolis_id)); } @@ -1482,7 +1483,7 @@ impl InstanceRunner { Ok(PropolisSetup { client, running_zone }) } - async fn terminate(&mut self, mark_failed: bool) -> SledInstanceState { + async fn terminate(&mut self, mark_failed: bool) -> SledVmmState { self.terminate_inner().await; self.state.terminate_rudely(mark_failed); @@ -1601,7 +1602,7 @@ mod tests { enum ReceivedInstanceState { #[default] None, - InstancePut(SledInstanceState), + InstancePut(SledVmmState), } struct NexusServer { @@ -1612,7 +1613,7 @@ mod tests { fn cpapi_instances_put( &self, _propolis_id: PropolisUuid, - new_runtime_state: SledInstanceState, + new_runtime_state: SledVmmState, ) -> Result<(), omicron_common::api::external::Error> { self.observed_runtime_state .send(ReceivedInstanceState::InstancePut(new_runtime_state)) @@ -1914,7 +1915,7 @@ mod tests { // pretending we're InstanceManager::ensure_state, start our "instance" // (backed by fakes and propolis_mock_server) - inst.put_state(put_tx, InstanceStateRequested::Running) + inst.put_state(put_tx, VmmStateRequested::Running) .await .expect("failed to send Instance::put_state"); @@ -2008,7 +2009,7 @@ mod tests { // pretending we're InstanceManager::ensure_state, try in vain to start // our "instance", but no propolis server is running - inst.put_state(put_tx, InstanceStateRequested::Running) + inst.put_state(put_tx, VmmStateRequested::Running) .await .expect("failed to send Instance::put_state"); @@ -2022,7 +2023,7 @@ mod tests { .await .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); - if let ReceivedInstanceState::InstancePut(SledInstanceState { + if let ReceivedInstanceState::InstancePut(SledVmmState { vmm_state: VmmRuntimeState { state: VmmState::Running, .. }, .. 
}) = state_rx.borrow().to_owned() @@ -2115,7 +2116,7 @@ mod tests { // pretending we're InstanceManager::ensure_state, try in vain to start // our "instance", but the zone never finishes installing - inst.put_state(put_tx, InstanceStateRequested::Running) + inst.put_state(put_tx, VmmStateRequested::Running) .await .expect("failed to send Instance::put_state"); @@ -2130,7 +2131,7 @@ mod tests { .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); debug!(log, "Zone-boot timeout awaited"); - if let ReceivedInstanceState::InstancePut(SledInstanceState { + if let ReceivedInstanceState::InstancePut(SledVmmState { vmm_state: VmmRuntimeState { state: VmmState::Running, .. }, .. }) = state_rx.borrow().to_owned() @@ -2253,7 +2254,7 @@ mod tests { .await .unwrap(); - mgr.ensure_state(propolis_id, InstanceStateRequested::Running) + mgr.ensure_state(propolis_id, VmmStateRequested::Running) .await .unwrap(); diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 19c4562910..353496a060 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -4,13 +4,13 @@ //! API for controlling multiple instances on a sled. -use crate::instance::propolis_zone_name; use crate::instance::Instance; use crate::metrics::MetricsRequestQueue; use crate::nexus::NexusClient; use crate::vmm_reservoir::VmmReservoirManagerHandle; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; +use illumos_utils::zone::PROPOLIS_ZONE_PREFIX; use omicron_common::api::external::ByteCount; use anyhow::anyhow; @@ -20,7 +20,7 @@ use illumos_utils::opte::PortManager; use illumos_utils::running_zone::ZoneBuilderFactory; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::InstanceRuntimeState; -use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_common::api::internal::nexus::SledVmmState; use omicron_common::api::internal::nexus::VmmRuntimeState; use omicron_common::api::internal::shared::SledIdentifiers; use omicron_uuid_kinds::InstanceUuid; @@ -44,8 +44,8 @@ pub enum Error { #[error("Instance error: {0}")] Instance(#[from] crate::instance::Error), - #[error("Instance with ID {0} not found")] - NoSuchInstance(PropolisUuid), + #[error("VMM with ID {0} not found")] + NoSuchVmm(PropolisUuid), #[error("OPTE port management error: {0}")] Opte(#[from] illumos_utils::opte::Error), @@ -150,7 +150,7 @@ impl InstanceManager { propolis_addr: SocketAddr, sled_identifiers: SledIdentifiers, metadata: InstanceMetadata, - ) -> Result { + ) -> Result { let (tx, rx) = oneshot::channel(); self.inner .tx @@ -173,7 +173,7 @@ impl InstanceManager { pub async fn ensure_unregistered( &self, propolis_id: PropolisUuid, - ) -> Result { + ) -> Result { let (tx, rx) = oneshot::channel(); self.inner .tx @@ -189,8 +189,8 @@ impl InstanceManager { pub async fn ensure_state( &self, propolis_id: PropolisUuid, - target: InstanceStateRequested, - ) -> Result { + target: VmmStateRequested, + ) -> Result { let (tx, rx) = oneshot::channel(); self.inner .tx @@ -206,17 +206,18 @@ impl InstanceManager { // these may involve a long-running zone creation, so avoid HTTP // request timeouts by decoupling the response // (see InstanceRunner::put_state) - InstanceStateRequested::MigrationTarget(_) - | InstanceStateRequested::Running => { + VmmStateRequested::MigrationTarget(_) + | VmmStateRequested::Running => { // We don't want the sending side of the channel to see an // error if we drop rx without awaiting it. 
// Since we don't care about the response here, we spawn rx // into a task which will await it for us in the background. tokio::spawn(rx); - Ok(InstancePutStateResponse { updated_runtime: None }) + Ok(VmmPutStateResponse { updated_runtime: None }) + } + VmmStateRequested::Stopped | VmmStateRequested::Reboot => { + rx.await? } - InstanceStateRequested::Stopped - | InstanceStateRequested::Reboot => rx.await?, } } @@ -301,7 +302,7 @@ impl InstanceManager { pub async fn get_instance_state( &self, propolis_id: PropolisUuid, - ) -> Result { + ) -> Result { let (tx, rx) = oneshot::channel(); self.inner .tx @@ -351,16 +352,16 @@ enum InstanceManagerRequest { // reasonable choice... sled_identifiers: Box, metadata: InstanceMetadata, - tx: oneshot::Sender>, + tx: oneshot::Sender>, }, EnsureUnregistered { propolis_id: PropolisUuid, - tx: oneshot::Sender>, + tx: oneshot::Sender>, }, EnsureState { propolis_id: PropolisUuid, - target: InstanceStateRequested, - tx: oneshot::Sender>, + target: VmmStateRequested, + tx: oneshot::Sender>, }, IssueDiskSnapshot { @@ -385,7 +386,7 @@ enum InstanceManagerRequest { }, GetState { propolis_id: PropolisUuid, - tx: oneshot::Sender>, + tx: oneshot::Sender>, }, OnlyUseDisks { disks: AllDisks, @@ -565,7 +566,7 @@ impl InstanceManagerRunner { propolis_addr: SocketAddr, sled_identifiers: SledIdentifiers, metadata: InstanceMetadata, - ) -> Result { + ) -> Result { info!( &self.log, "ensuring instance is registered"; @@ -603,10 +604,12 @@ impl InstanceManagerRunner { "registering new instance"; "instance_id" => %instance_id, "propolis_id" => %propolis_id); + let instance_log = self.log.new(o!( - "instance_id" => format!("{instance_id}"), - "propolis_id" => format!("{propolis_id}") + "instance_id" => instance_id.to_string(), + "propolis_id" => propolis_id.to_string(), )); + let ticket = InstanceTicket::new(propolis_id, self.terminate_tx.clone()); @@ -651,12 +654,12 @@ impl InstanceManagerRunner { /// zone is rudely terminated. async fn ensure_unregistered( &mut self, - tx: oneshot::Sender>, + tx: oneshot::Sender>, propolis_id: PropolisUuid, ) -> Result<(), Error> { // If the instance does not exist, we response immediately. let Some(instance) = self.get_propolis(propolis_id) else { - tx.send(Ok(InstanceUnregisterResponse { updated_runtime: None })) + tx.send(Ok(VmmUnregisterResponse { updated_runtime: None })) .map_err(|_| Error::FailedSendClientClosed)?; return Ok(()); }; @@ -672,9 +675,9 @@ impl InstanceManagerRunner { /// runtime state. async fn ensure_state( &mut self, - tx: oneshot::Sender>, + tx: oneshot::Sender>, propolis_id: PropolisUuid, - target: InstanceStateRequested, + target: VmmStateRequested, ) -> Result<(), Error> { let Some(instance) = self.get_propolis(propolis_id) else { match target { @@ -686,14 +689,12 @@ impl InstanceManagerRunner { // Propolis handled it, sled agent unregistered the // instance, and only then did a second stop request // arrive. 
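// Sketch of the idempotent-stop behavior described in the surrounding
// comments, using simplified stand-in types: a second "stop" for a VMM that
// has already been unregistered succeeds with no new runtime state, while any
// other request aimed at a missing VMM is an error.
use uuid::Uuid;

#[derive(Debug, PartialEq)]
enum VmmStateRequested {
    Running,
    Stopped,
    Reboot,
}

#[derive(Debug, PartialEq)]
enum EnsureStateOutcome {
    NoOp,            // VMM already gone; nothing left to update
    NoSuchVmm(Uuid), // caller asked a missing VMM to do real work
}

fn ensure_state_for_missing_vmm(
    vmm_id: Uuid,
    target: VmmStateRequested,
) -> EnsureStateOutcome {
    match target {
        VmmStateRequested::Stopped => EnsureStateOutcome::NoOp,
        _ => EnsureStateOutcome::NoSuchVmm(vmm_id),
    }
}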
- InstanceStateRequested::Stopped => { - tx.send(Ok(InstancePutStateResponse { - updated_runtime: None, - })) - .map_err(|_| Error::FailedSendClientClosed)?; + VmmStateRequested::Stopped => { + tx.send(Ok(VmmPutStateResponse { updated_runtime: None })) + .map_err(|_| Error::FailedSendClientClosed)?; } _ => { - tx.send(Err(Error::NoSuchInstance(propolis_id))) + tx.send(Err(Error::NoSuchVmm(propolis_id))) .map_err(|_| Error::FailedSendClientClosed)?; } } @@ -710,10 +711,8 @@ impl InstanceManagerRunner { disk_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { - let instance = self - .jobs - .get(&propolis_id) - .ok_or(Error::NoSuchInstance(propolis_id))?; + let instance = + self.jobs.get(&propolis_id).ok_or(Error::NoSuchVmm(propolis_id))?; instance .issue_snapshot_request(tx, disk_id, snapshot_id) @@ -727,12 +726,21 @@ impl InstanceManagerRunner { tx: oneshot::Sender>, name: &str, ) -> Result<(), BundleError> { - let Some(instance) = self - .jobs - .iter() - .find(|(propolis_id, _)| name == propolis_zone_name(propolis_id)) - .map(|entry| entry.1) - else { + // A well-formed Propolis zone name must consist of + // `PROPOLIS_ZONE_PREFIX` and the Propolis ID. If the prefix is not + // present or the Propolis ID portion of the supplied zone name isn't + // parseable as a UUID, there is no Propolis zone with the specified + // name to capture into a bundle, so return a `NoSuchZone` error. + let vmm_id: PropolisUuid = name + .strip_prefix(PROPOLIS_ZONE_PREFIX) + .ok_or_else(|| BundleError::NoSuchZone { name: name.to_string() }) + .and_then(|uuid_str| { + uuid_str.parse::().map_err(|_| { + BundleError::NoSuchZone { name: name.to_string() } + }) + })?; + + let Some(instance) = self.jobs.get(&vmm_id) else { return Err(BundleError::NoSuchZone { name: name.to_string() }); }; instance.request_zone_bundle(tx).await @@ -745,7 +753,7 @@ impl InstanceManagerRunner { ip: &InstanceExternalIpBody, ) -> Result<(), Error> { let Some(instance) = self.get_propolis(propolis_id) else { - return Err(Error::NoSuchInstance(propolis_id)); + return Err(Error::NoSuchVmm(propolis_id)); }; instance.add_external_ip(tx, ip).await?; Ok(()) @@ -758,7 +766,7 @@ impl InstanceManagerRunner { ip: &InstanceExternalIpBody, ) -> Result<(), Error> { let Some(instance) = self.get_propolis(propolis_id) else { - return Err(Error::NoSuchInstance(propolis_id)); + return Err(Error::NoSuchVmm(propolis_id)); }; instance.delete_external_ip(tx, ip).await?; @@ -767,12 +775,12 @@ impl InstanceManagerRunner { async fn get_instance_state( &self, - tx: oneshot::Sender>, + tx: oneshot::Sender>, propolis_id: PropolisUuid, ) -> Result<(), Error> { let Some(instance) = self.get_propolis(propolis_id) else { return tx - .send(Err(Error::NoSuchInstance(propolis_id))) + .send(Err(Error::NoSuchVmm(propolis_id))) .map_err(|_| Error::FailedSendClientClosed); }; diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index 8421350887..d75081f1e4 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -392,12 +392,12 @@ mod test { use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::DiskRuntimeState; - use omicron_common::api::internal::nexus::SledInstanceState; + use omicron_common::api::internal::nexus::SledVmmState; use omicron_common::api::internal::nexus::VmmRuntimeState; use omicron_common::api::internal::nexus::VmmState; use omicron_test_utils::dev::test_setup_log; use sled_agent_types::disk::DiskStateRequested; - use 
sled_agent_types::instance::InstanceStateRequested; + use sled_agent_types::instance::VmmStateRequested; fn make_instance( logctx: &LogContext, @@ -408,11 +408,8 @@ mod test { time_updated: Utc::now(), }; - let state = SledInstanceState { - vmm_state, - migration_in: None, - migration_out: None, - }; + let state = + SledVmmState { vmm_state, migration_in: None, migration_out: None }; SimObject::new_simulated_auto(&state, logctx.log.new(o!())) } @@ -456,8 +453,7 @@ mod test { // Stopping an instance that was never started synchronously destroys // its VMM. let rprev = r1; - let dropped = - instance.transition(InstanceStateRequested::Stopped).unwrap(); + let dropped = instance.transition(VmmStateRequested::Stopped).unwrap(); assert!(dropped.is_none()); assert!(instance.object.desired().is_none()); let rnext = instance.object.current(); @@ -497,8 +493,7 @@ mod test { // simulated instance's state, but it does queue up a transition. let mut rprev = r1; assert!(rx.try_next().is_err()); - let dropped = - instance.transition(InstanceStateRequested::Running).unwrap(); + let dropped = instance.transition(VmmStateRequested::Running).unwrap(); assert!(dropped.is_none()); assert!(instance.object.desired().is_some()); assert!(rx.try_next().is_err()); @@ -530,8 +525,7 @@ mod test { // If we transition again to "Running", the process should complete // immediately. - let dropped = - instance.transition(InstanceStateRequested::Running).unwrap(); + let dropped = instance.transition(VmmStateRequested::Running).unwrap(); assert!(dropped.is_none()); assert!(instance.object.desired().is_none()); assert!(rx.try_next().is_err()); @@ -544,8 +538,7 @@ mod test { // If we go back to any stopped state, we go through the async process // again. assert!(rx.try_next().is_err()); - let dropped = - instance.transition(InstanceStateRequested::Stopped).unwrap(); + let dropped = instance.transition(VmmStateRequested::Stopped).unwrap(); assert!(dropped.is_none()); assert!(instance.object.desired().is_some()); let rnext = instance.object.current(); @@ -602,7 +595,7 @@ mod test { assert_eq!(r1.vmm_state.state, VmmState::Starting); assert_eq!(r1.vmm_state.gen, Generation::new()); assert!(instance - .transition(InstanceStateRequested::Running) + .transition(VmmStateRequested::Running) .unwrap() .is_none()); instance.transition_finish(); @@ -618,7 +611,7 @@ mod test { // Now reboot the instance. This is dispatched to Propolis, which will // move to the Rebooting state and then back to Running. 
assert!(instance - .transition(InstanceStateRequested::Reboot) + .transition(VmmStateRequested::Reboot) .unwrap() .is_none()); let (rprev, rnext) = (rnext, instance.object.current()); diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 8a2ea3a65e..aead47658f 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -23,7 +23,7 @@ use dropshot::TypedBody; use nexus_sled_agent_shared::inventory::SledRole; use nexus_sled_agent_shared::inventory::{Inventory, OmicronZonesConfig}; use omicron_common::api::internal::nexus::DiskRuntimeState; -use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_common::api::internal::nexus::SledVmmState; use omicron_common::api::internal::nexus::UpdateArtifactId; use omicron_common::api::internal::shared::SledIdentifiers; use omicron_common::api::internal::shared::VirtualNetworkInterfaceHost; @@ -43,9 +43,9 @@ use sled_agent_types::early_networking::EarlyNetworkConfig; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::InstanceEnsureBody; use sled_agent_types::instance::InstanceExternalIpBody; -use sled_agent_types::instance::InstancePutStateBody; -use sled_agent_types::instance::InstancePutStateResponse; -use sled_agent_types::instance::InstanceUnregisterResponse; +use sled_agent_types::instance::VmmPutStateBody; +use sled_agent_types::instance::VmmPutStateResponse; +use sled_agent_types::instance::VmmUnregisterResponse; use sled_agent_types::sled::AddSledRequest; use sled_agent_types::time_sync::TimeSync; use sled_agent_types::zone_bundle::BundleUtilization; @@ -86,7 +86,7 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, path_params: Path, body: TypedBody, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let sa = rqctx.context(); let propolis_id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); @@ -106,7 +106,7 @@ impl SledAgentApi for SledAgentSimImpl { async fn vmm_unregister( rqctx: RequestContext, path_params: Path, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; Ok(HttpResponseOk(sa.instance_unregister(id).await?)) @@ -115,8 +115,8 @@ impl SledAgentApi for SledAgentSimImpl { async fn vmm_put_state( rqctx: RequestContext, path_params: Path, - body: TypedBody, - ) -> Result, HttpError> { + body: TypedBody, + ) -> Result, HttpError> { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); @@ -126,7 +126,7 @@ impl SledAgentApi for SledAgentSimImpl { async fn vmm_get_state( rqctx: RequestContext, path_params: Path, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; Ok(HttpResponseOk(sa.instance_get_state(id).await?)) diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 8c13c7e9b9..eb7ea0ca79 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -14,14 +14,14 @@ use nexus_client; use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::external::ResourceType; -use omicron_common::api::internal::nexus::{SledInstanceState, VmmState}; +use omicron_common::api::internal::nexus::{SledVmmState, VmmState}; use omicron_uuid_kinds::{GenericUuid, PropolisUuid}; use propolis_client::types::{ InstanceMigrateStatusResponse as PropolisMigrateResponse, 
InstanceMigrationStatus as PropolisMigrationStatus, InstanceState as PropolisInstanceState, InstanceStateMonitorResponse, }; -use sled_agent_types::instance::InstanceStateRequested; +use sled_agent_types::instance::VmmStateRequested; use std::collections::VecDeque; use std::sync::Arc; use std::sync::Mutex; @@ -171,13 +171,13 @@ impl SimInstanceInner { /// returning an action for the caller to simulate. fn request_transition( &mut self, - target: &InstanceStateRequested, + target: &VmmStateRequested, ) -> Result, Error> { match target { // When Nexus intends to migrate into a VMM, it should create that // VMM in the Migrating state and shouldn't request anything else // from it before asking to migrate in. - InstanceStateRequested::MigrationTarget(_) => { + VmmStateRequested::MigrationTarget(_) => { if !self.queue.is_empty() { return Err(Error::invalid_request(&format!( "can't request migration in with a non-empty state @@ -208,7 +208,7 @@ impl SimInstanceInner { SimulatedMigrationResult::Success, ); } - InstanceStateRequested::Running => { + VmmStateRequested::Running => { match self.next_resting_state() { VmmState::Starting => { self.queue_propolis_state( @@ -235,7 +235,7 @@ impl SimInstanceInner { } } } - InstanceStateRequested::Stopped => { + VmmStateRequested::Stopped => { match self.next_resting_state() { VmmState::Starting => { let mark_failed = false; @@ -257,7 +257,7 @@ impl SimInstanceInner { } } } - InstanceStateRequested::Reboot => match self.next_resting_state() { + VmmStateRequested::Reboot => match self.next_resting_state() { VmmState::Running => { // Further requests to reboot are ignored if the instance // is currently rebooting or about to reboot. @@ -316,7 +316,7 @@ impl SimInstanceInner { /// If the state change queue contains at least once instance state change, /// returns the requested instance state associated with the last instance /// state on the queue. Returns None otherwise. - fn desired(&self) -> Option { + fn desired(&self) -> Option { self.last_queued_instance_state().map(|terminal| match terminal { // State change requests may queue these states as intermediate // states, but the simulation (and the tests that rely on it) is @@ -332,13 +332,11 @@ impl SimInstanceInner { "pending resting state {:?} doesn't map to a requested state", terminal ), - PropolisInstanceState::Running => InstanceStateRequested::Running, + PropolisInstanceState::Running => VmmStateRequested::Running, PropolisInstanceState::Stopping | PropolisInstanceState::Stopped - | PropolisInstanceState::Destroyed => { - InstanceStateRequested::Stopped - } - PropolisInstanceState::Rebooting => InstanceStateRequested::Reboot, + | PropolisInstanceState::Destroyed => VmmStateRequested::Stopped, + PropolisInstanceState::Rebooting => VmmStateRequested::Reboot, }) } @@ -389,7 +387,7 @@ impl SimInstanceInner { /// Simulates rude termination by moving the instance to the Destroyed state /// immediately and clearing the queue of pending state transitions. 
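// Minimal sketch of the simulated "rude termination" used by the sim VMM,
// with local stand-in types: the VMM jumps straight to Destroyed, any queued
// transitions are discarded, and the resulting state is returned so it can be
// reported back to Nexus.
#[derive(Clone, Copy, Debug, PartialEq)]
enum VmmState {
    Starting,
    Running,
    Stopping,
    Destroyed,
}

struct SimVmm {
    state: VmmState,
    queue: Vec<VmmState>,
}

impl SimVmm {
    fn terminate(&mut self) -> VmmState {
        self.state = VmmState::Destroyed;
        self.queue.clear();
        self.state
    }
}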
- fn terminate(&mut self) -> SledInstanceState { + fn terminate(&mut self) -> SledVmmState { let mark_failed = false; self.state.terminate_rudely(mark_failed); self.queue.clear(); @@ -419,7 +417,7 @@ pub struct SimInstance { } impl SimInstance { - pub fn terminate(&self) -> SledInstanceState { + pub fn terminate(&self) -> SledVmmState { self.inner.lock().unwrap().terminate() } @@ -436,12 +434,12 @@ impl SimInstance { #[async_trait] impl Simulatable for SimInstance { - type CurrentState = SledInstanceState; - type RequestedState = InstanceStateRequested; + type CurrentState = SledVmmState; + type RequestedState = VmmStateRequested; type ProducerArgs = (); type Action = InstanceAction; - fn new(current: SledInstanceState) -> Self { + fn new(current: SledVmmState) -> Self { assert!(matches!( current.vmm_state.state, VmmState::Starting | VmmState::Migrating), @@ -480,7 +478,7 @@ impl Simulatable for SimInstance { fn request_transition( &mut self, - target: &InstanceStateRequested, + target: &VmmStateRequested, ) -> Result, Error> { self.inner.lock().unwrap().request_transition(target) } @@ -513,7 +511,7 @@ impl Simulatable for SimInstance { nexus_client .cpapi_instances_put( &PropolisUuid::from_untyped_uuid(*id), - &nexus_client::types::SledInstanceState::from(current), + &nexus_client::types::SledVmmState::from(current), ) .await .map(|_| ()) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 97c75c6b55..0b0df0585b 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -24,7 +24,7 @@ use omicron_common::api::external::{ ByteCount, DiskState, Error, Generation, ResourceType, }; use omicron_common::api::internal::nexus::{ - DiskRuntimeState, MigrationRuntimeState, MigrationState, SledInstanceState, + DiskRuntimeState, MigrationRuntimeState, MigrationState, SledVmmState, }; use omicron_common::api::internal::nexus::{ InstanceRuntimeState, VmmRuntimeState, @@ -50,8 +50,7 @@ use sled_agent_types::early_networking::{ }; use sled_agent_types::instance::{ InstanceExternalIpBody, InstanceHardware, InstanceMetadata, - InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, + VmmPutStateResponse, VmmStateRequested, VmmUnregisterResponse, }; use slog::Logger; use std::collections::{HashMap, HashSet, VecDeque}; @@ -270,7 +269,7 @@ impl SledAgent { instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, metadata: InstanceMetadata, - ) -> Result { + ) -> Result { // respond with a fake 500 level failure if asked to ensure an instance // with more than 16 CPUs. let ncpus: i64 = (&hardware.properties.ncpus).into(); @@ -379,7 +378,7 @@ impl SledAgent { .vmms .sim_ensure( &propolis_id.into_untyped_uuid(), - SledInstanceState { + SledVmmState { vmm_state: vmm_runtime, migration_in, migration_out: None, @@ -414,7 +413,7 @@ impl SledAgent { pub async fn instance_unregister( self: &Arc, propolis_id: PropolisUuid, - ) -> Result { + ) -> Result { let instance = match self .vmms .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) @@ -422,12 +421,12 @@ impl SledAgent { { Ok(instance) => instance, Err(Error::ObjectNotFound { .. 
}) => { - return Ok(InstanceUnregisterResponse { updated_runtime: None }) + return Ok(VmmUnregisterResponse { updated_runtime: None }) } Err(e) => return Err(e), }; - let response = InstanceUnregisterResponse { + let response = VmmUnregisterResponse { updated_runtime: Some(instance.terminate()), }; @@ -439,8 +438,8 @@ impl SledAgent { pub async fn instance_ensure_state( self: &Arc, propolis_id: PropolisUuid, - state: InstanceStateRequested, - ) -> Result { + state: VmmStateRequested, + ) -> Result { if let Some(e) = self.instance_ensure_state_error.lock().await.as_ref() { return Err(e.clone()); @@ -453,8 +452,8 @@ impl SledAgent { { Ok(i) => i.current().clone(), Err(_) => match state { - InstanceStateRequested::Stopped => { - return Ok(InstancePutStateResponse { + VmmStateRequested::Stopped => { + return Ok(VmmPutStateResponse { updated_runtime: None, }); } @@ -470,12 +469,12 @@ impl SledAgent { let mock_lock = self.mock_propolis.lock().await; if let Some((_srv, client)) = mock_lock.as_ref() { let body = match state { - InstanceStateRequested::MigrationTarget(_) => { + VmmStateRequested::MigrationTarget(_) => { return Err(Error::internal_error( "migration not implemented for mock Propolis", )); } - InstanceStateRequested::Running => { + VmmStateRequested::Running => { let vmms = self.vmms.clone(); let log = self.log.new( o!("component" => "SledAgent-insure_instance_state"), @@ -491,22 +490,22 @@ impl SledAgent { .await { Ok(state) => { - let instance_state: nexus_client::types::SledInstanceState = state.into(); - info!(log, "sim_ensure success"; "instance_state" => #?instance_state); + let vmm_state: nexus_client::types::SledVmmState = state.into(); + info!(log, "sim_ensure success"; "vmm_state" => #?vmm_state); } Err(instance_put_error) => { error!(log, "sim_ensure failure"; "error" => #?instance_put_error); } } }); - return Ok(InstancePutStateResponse { + return Ok(VmmPutStateResponse { updated_runtime: None, }); } - InstanceStateRequested::Stopped => { + VmmStateRequested::Stopped => { propolis_client::types::InstanceStateRequested::Stop } - InstanceStateRequested::Reboot => { + VmmStateRequested::Reboot => { propolis_client::types::InstanceStateRequested::Reboot } }; @@ -520,20 +519,20 @@ impl SledAgent { .sim_ensure(&propolis_id.into_untyped_uuid(), current, Some(state)) .await?; - Ok(InstancePutStateResponse { updated_runtime: Some(new_state) }) + Ok(VmmPutStateResponse { updated_runtime: Some(new_state) }) } pub async fn instance_get_state( &self, propolis_id: PropolisUuid, - ) -> Result { + ) -> Result { let instance = self .vmms .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) .await .map_err(|_| { crate::sled_agent::Error::Instance( - crate::instance_manager::Error::NoSuchInstance(propolis_id), + crate::instance_manager::Error::NoSuchVmm(propolis_id), ) })?; Ok(instance.current()) @@ -550,7 +549,7 @@ impl SledAgent { .await .map_err(|_| { crate::sled_agent::Error::Instance( - crate::instance_manager::Error::NoSuchInstance(propolis_id), + crate::instance_manager::Error::NoSuchVmm(propolis_id), ) })?; instance.set_simulated_migration_source(migration); diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 9752e8d7e2..d69ccedb7d 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -38,9 +38,7 @@ use omicron_common::address::{ get_sled_address, get_switch_zone_address, Ipv6Subnet, SLED_PREFIX, }; use omicron_common::api::external::{ByteCount, ByteCountRangeError, Vni}; -use omicron_common::api::internal::nexus::{ - 
SledInstanceState, VmmRuntimeState, -}; +use omicron_common::api::internal::nexus::{SledVmmState, VmmRuntimeState}; use omicron_common::api::internal::shared::{ HostPortConfig, RackNetworkConfig, ResolvedVpcFirewallRule, ResolvedVpcRouteSet, ResolvedVpcRouteState, SledIdentifiers, @@ -61,8 +59,7 @@ use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::EarlyNetworkConfig; use sled_agent_types::instance::{ InstanceExternalIpBody, InstanceHardware, InstanceMetadata, - InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, + VmmPutStateResponse, VmmStateRequested, VmmUnregisterResponse, }; use sled_agent_types::sled::{BaseboardId, StartSledAgentRequest}; use sled_agent_types::time_sync::TimeSync; @@ -227,7 +224,7 @@ impl From for dropshot::HttpError { } } Error::Instance( - e @ crate::instance_manager::Error::NoSuchInstance(_), + e @ crate::instance_manager::Error::NoSuchVmm(_), ) => HttpError::for_not_found( Some(NO_SUCH_INSTANCE.to_string()), e.to_string(), @@ -966,7 +963,7 @@ impl SledAgent { vmm_runtime: VmmRuntimeState, propolis_addr: SocketAddr, metadata: InstanceMetadata, - ) -> Result { + ) -> Result { self.inner .instances .ensure_registered( @@ -991,7 +988,7 @@ impl SledAgent { pub async fn instance_ensure_unregistered( &self, propolis_id: PropolisUuid, - ) -> Result { + ) -> Result { self.inner .instances .ensure_unregistered(propolis_id) @@ -1004,8 +1001,8 @@ impl SledAgent { pub async fn instance_ensure_state( &self, propolis_id: PropolisUuid, - target: InstanceStateRequested, - ) -> Result { + target: VmmStateRequested, + ) -> Result { self.inner .instances .ensure_state(propolis_id, target) @@ -1048,7 +1045,7 @@ impl SledAgent { pub async fn instance_get_state( &self, propolis_id: PropolisUuid, - ) -> Result { + ) -> Result { self.inner .instances .get_instance_state(propolis_id) diff --git a/sled-agent/types/src/instance.rs b/sled-agent/types/src/instance.rs index 12ceb98fb4..bd0f536aa3 100644 --- a/sled-agent/types/src/instance.rs +++ b/sled-agent/types/src/instance.rs @@ -11,8 +11,7 @@ use std::{ use omicron_common::api::internal::{ nexus::{ - InstanceProperties, InstanceRuntimeState, SledInstanceState, - VmmRuntimeState, + InstanceProperties, InstanceRuntimeState, SledVmmState, VmmRuntimeState, }, shared::{ DhcpConfig, NetworkInterface, ResolvedVpcFirewallRule, SourceNatConfig, @@ -78,19 +77,19 @@ pub struct InstanceMetadata { /// The body of a request to move a previously-ensured instance into a specific /// runtime state. #[derive(Serialize, Deserialize, JsonSchema)] -pub struct InstancePutStateBody { +pub struct VmmPutStateBody { /// The state into which the instance should be driven. - pub state: InstanceStateRequested, + pub state: VmmStateRequested, } /// The response sent from a request to move an instance into a specific runtime /// state. #[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct InstancePutStateResponse { +pub struct VmmPutStateResponse { /// The current runtime state of the instance after handling the request to /// change its state. If the instance's state did not change, this field is /// `None`. - pub updated_runtime: Option, + pub updated_runtime: Option, } /// Requestable running state of an Instance. @@ -98,7 +97,7 @@ pub struct InstancePutStateResponse { /// A subset of [`omicron_common::api::external::InstanceState`]. 
#[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)] #[serde(rename_all = "snake_case", tag = "type", content = "value")] -pub enum InstanceStateRequested { +pub enum VmmStateRequested { /// Run this instance by migrating in from a previous running incarnation of /// the instance. MigrationTarget(InstanceMigrationTargetParams), @@ -111,40 +110,40 @@ pub enum InstanceStateRequested { Reboot, } -impl fmt::Display for InstanceStateRequested { +impl fmt::Display for VmmStateRequested { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.label()) } } -impl InstanceStateRequested { +impl VmmStateRequested { fn label(&self) -> &str { match self { - InstanceStateRequested::MigrationTarget(_) => "migrating in", - InstanceStateRequested::Running => "running", - InstanceStateRequested::Stopped => "stopped", - InstanceStateRequested::Reboot => "reboot", + VmmStateRequested::MigrationTarget(_) => "migrating in", + VmmStateRequested::Running => "running", + VmmStateRequested::Stopped => "stopped", + VmmStateRequested::Reboot => "reboot", } } /// Returns true if the state represents a stopped Instance. pub fn is_stopped(&self) -> bool { match self { - InstanceStateRequested::MigrationTarget(_) => false, - InstanceStateRequested::Running => false, - InstanceStateRequested::Stopped => true, - InstanceStateRequested::Reboot => false, + VmmStateRequested::MigrationTarget(_) => false, + VmmStateRequested::Running => false, + VmmStateRequested::Stopped => true, + VmmStateRequested::Reboot => false, } } } /// The response sent from a request to unregister an instance. #[derive(Serialize, Deserialize, JsonSchema)] -pub struct InstanceUnregisterResponse { +pub struct VmmUnregisterResponse { /// The current state of the instance after handling the request to /// unregister it. If the instance's state did not change, this field is /// `None`. - pub updated_runtime: Option, + pub updated_runtime: Option, } /// Parameters used when directing Propolis to initialize itself via live From 9f4156a1e6ce8f3a88ce29e7876879efcd92cc67 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Mon, 26 Aug 2024 19:51:09 +0000 Subject: [PATCH 12/16] get instance id from vmm find-and-update --- nexus/db-queries/src/db/datastore/vmm.rs | 20 +++++++++++++++++--- nexus/src/app/instance.rs | 5 ++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index f909b7e9d3..089a2914be 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -39,8 +39,13 @@ use uuid::Uuid; /// The result of an [`DataStore::vmm_and_migration_update_runtime`] call, /// indicating which records were updated. -#[derive(Copy, Clone, Debug)] +#[derive(Clone, Debug)] pub struct VmmStateUpdateResult { + /// The VMM record that the update query found and possibly updated. + /// + /// NOTE: This is the record prior to the update! + pub found_vmm: Vmm, + /// `true` if the VMM record was updated, `false` otherwise. 
pub vmm_updated: bool, @@ -228,13 +233,21 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - let vmm_updated = self + let vmm_update_result = self .vmm_update_runtime_on_connection( &conn, &vmm_id, new_runtime, ) - .await.map(|r| match r.status { UpdateStatus::Updated => true, UpdateStatus::NotUpdatedButExists => false })?; + .await?; + + + let found_vmm = vmm_update_result.found; + let vmm_updated = match vmm_update_result.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => false + }; + let migration_out_updated = match migration_out { Some(migration) => { let r = self.migration_update_source_on_connection( @@ -282,6 +295,7 @@ impl DataStore { None => false, }; Ok(VmmStateUpdateResult { + found_vmm, vmm_updated, migration_in_updated, migration_out_updated, diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index bcc242db77..0f79ace564 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1881,9 +1881,8 @@ pub(crate) async fn process_vmm_update( new_runtime_state, &result, ) { - let instance_id = InstanceUuid::from_untyped_uuid( - datastore.vmm_fetch(&opctx, &propolis_id).await?.instance_id, - ); + let instance_id = + InstanceUuid::from_untyped_uuid(result.found_vmm.instance_id); let (.., authz_instance) = LookupPath::new(&opctx, datastore) .instance_id(instance_id.into_untyped_uuid()) From 88a488a729ebce7a6662a3e84529b8202b9413b8 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Mon, 26 Aug 2024 20:31:11 +0000 Subject: [PATCH 13/16] remove obsolete comment --- nexus/src/app/instance.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 0f79ace564..ed0a8443b9 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1868,13 +1868,6 @@ pub(crate) async fn process_vmm_update( // If an instance-update saga must be executed as a result of this update, // prepare and return it. - // - // It's safe to fetch the VMM record to get the instance ID at this point - // even though the VMM may now be Destroyed, because the record itself won't - // be deleted until the instance update saga retires it. If this update - // marked the VMM as Destroyed and then the fetch failed, then another - // update saga has already observed what this routine published, so there's - // no need to schedule another saga. 
if instance_update::update_saga_needed( &opctx.log, propolis_id, From 8dd6e9e3848745d2588140a78f0afcf4b6658dce Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Mon, 26 Aug 2024 20:32:46 +0000 Subject: [PATCH 14/16] fmt --- sled-agent/src/sim/sled_agent.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 0b0df0585b..7292b3dee1 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -453,9 +453,7 @@ impl SledAgent { Ok(i) => i.current().clone(), Err(_) => match state { VmmStateRequested::Stopped => { - return Ok(VmmPutStateResponse { - updated_runtime: None, - }); + return Ok(VmmPutStateResponse { updated_runtime: None }); } _ => { return Err(Error::invalid_request(&format!( @@ -498,9 +496,7 @@ impl SledAgent { } } }); - return Ok(VmmPutStateResponse { - updated_runtime: None, - }); + return Ok(VmmPutStateResponse { updated_runtime: None }); } VmmStateRequested::Stopped => { propolis_client::types::InstanceStateRequested::Stop From 3ab24312b6ab0d5e8ffdfe0afd733eeae06199bc Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Mon, 26 Aug 2024 20:40:56 +0000 Subject: [PATCH 15/16] fix doc links --- nexus/src/app/sagas/instance_update/mod.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index e8aa5cb5fc..4c4c4deff2 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -30,10 +30,9 @@ //! Nexus' `cpapi_instances_put` internal API endpoint, when a Nexus' //! `instance-watcher` background task *pulls* instance states from sled-agents //! periodically, or as the return value of an API call from Nexus to a -//! sled-agent. When a Nexus receives a new [`SledInstanceState`] from a -//! sled-agent through any of these mechanisms, the Nexus will write any changed -//! state to the `vmm` and/or `migration` tables directly on behalf of the -//! sled-agent. +//! sled-agent. When a Nexus receives a new [`SledVmmState`] from a sled-agent +//! through any of these mechanisms, the Nexus will write any changed state to +//! the `vmm` and/or `migration` tables directly on behalf of the sled-agent. //! //! Although Nexus is technically the party responsible for the database query //! that writes VMM and migration state updates received from sled-agent, it is @@ -326,6 +325,7 @@ //! crate::app::db::datastore::DataStore::instance_updater_inherit_lock //! [instance_updater_unlock]: //! crate::app::db::datastore::DataStore::instance_updater_unlock +//! [`notify_vmm_updated`]: crate::app::Nexus::notify_vmm_updated //! [`process_vmm_update`]: crate::app::instance::process_vmm_update //! //! [dist-locking]: @@ -388,8 +388,8 @@ pub(crate) use self::start::{Params, SagaInstanceUpdate}; mod destroyed; /// Returns `true` if an `instance-update` saga should be executed as a result -/// of writing the provided [`SledInstanceState`] to the database with the -/// provided [`VmmStateUpdateResult`]. +/// of writing the provided [`SledVmmState`] to the database with the provided +/// [`VmmStateUpdateResult`]. 
/// /// We determine this only after actually updating the database records, /// because we don't know whether a particular VMM or migration state is From fb9ee7ec124b207283e475ca337d60acf7d16ca1 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Mon, 26 Aug 2024 21:20:19 +0000 Subject: [PATCH 16/16] PR feedback --- nexus/src/app/instance.rs | 4 ++-- sled-agent/src/instance_manager.rs | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index ed0a8443b9..b715b6bbd3 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1830,8 +1830,8 @@ impl super::Nexus { } } -/// Publishes the VMM and migration state supplied in `new_runtime_state` to the -/// database. +/// Writes the VMM and migration state supplied in `new_runtime_state` to the +/// database (provided that it's newer than what's already there). /// /// # Return value /// diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 353496a060..24be8be89f 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -733,11 +733,9 @@ impl InstanceManagerRunner { // name to capture into a bundle, so return a `NoSuchZone` error. let vmm_id: PropolisUuid = name .strip_prefix(PROPOLIS_ZONE_PREFIX) - .ok_or_else(|| BundleError::NoSuchZone { name: name.to_string() }) - .and_then(|uuid_str| { - uuid_str.parse::().map_err(|_| { - BundleError::NoSuchZone { name: name.to_string() } - }) + .and_then(|uuid_str| uuid_str.parse::().ok()) + .ok_or_else(|| BundleError::NoSuchZone { + name: name.to_string(), })?; let Some(instance) = self.jobs.get(&vmm_id) else {
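(Note: the instance_manager.rs hunk above folds both failure modes of the zone-name lookup — a missing Propolis zone prefix and an unparseable UUID — into a single `Option`, leaving one `NoSuchZone` error site. Below is a minimal standalone sketch of that same combinator pattern, not part of the patch series: it uses plain `uuid::Uuid` in place of omicron's typed `PropolisUuid`, and the prefix value is assumed for illustration.)

    use uuid::Uuid;

    // Assumed for illustration; the real constant is defined in the sled-agent crate.
    const PROPOLIS_ZONE_PREFIX: &str = "oxz_propolis-server_";

    /// Extracts a VMM id from a Propolis zone name, returning `None` if the
    /// prefix is missing or the remainder is not a valid UUID. The caller can
    /// then map `None` onto a single "no such zone" error, as the hunk above does.
    fn vmm_id_from_zone_name(name: &str) -> Option<Uuid> {
        name.strip_prefix(PROPOLIS_ZONE_PREFIX)
            .and_then(|uuid_str| uuid_str.parse::<Uuid>().ok())
    }

    fn main() {
        // Wrong prefix: rejected without reaching the UUID parse.
        assert!(vmm_id_from_zone_name("oxz_nexus_1234").is_none());
        // Well-formed name: prefix stripped, remainder parsed as a UUID.
        assert!(vmm_id_from_zone_name(
            "oxz_propolis-server_9f0f9f6e-2b8e-4f5a-bb1c-3c2d6a1f1a2b"
        )
        .is_some());
    }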