diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs
index 987a8ac794..93386a66d0 100644
--- a/nexus/src/app/instance.rs
+++ b/nexus/src/app/instance.rs
@@ -61,6 +61,83 @@ use uuid::Uuid;
 
 const MAX_KEYS_PER_INSTANCE: u32 = 8;
 
+type SledAgentClientError =
+    sled_agent_client::Error<sled_agent_client::types::Error>;
+
+// Newtype wrapper to avoid the orphan type rule.
+#[derive(Debug)]
+pub struct SledAgentInstancePutError(pub SledAgentClientError);
+
+impl std::fmt::Display for SledAgentInstancePutError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+impl From<SledAgentClientError> for SledAgentInstancePutError {
+    fn from(value: SledAgentClientError) -> Self {
+        Self(value)
+    }
+}
+
+impl From<SledAgentInstancePutError> for omicron_common::api::external::Error {
+    fn from(value: SledAgentInstancePutError) -> Self {
+        value.0.into()
+    }
+}
+
+impl SledAgentInstancePutError {
+    /// Returns `true` if this error is of a class that indicates that Nexus
+    /// cannot assume anything about the health of the instance or its sled.
+    pub fn instance_unhealthy(&self) -> bool {
+        // TODO(#3238) TODO(#4226) For compatibility, this logic is lifted from
+        // the From impl that converts Progenitor client errors to
+        // `omicron_common::api::external::Error`s and from previous logic in
+        // this module that inferred instance health from those converted
+        // errors. In particular, some of the outer Progenitor client error
+        // variants (e.g. CommunicationError) can indicate transient conditions
+        // that aren't really fatal to an instance and don't otherwise indicate
+        // that it's unhealthy.
+        //
+        // To match old behavior until this is revisited, however, treat all
+        // Progenitor errors except for explicit error responses as signs of an
+        // unhealthy instance, and then only treat an instance as healthy if its
+        // sled returned a 400-level status code.
+        match &self.0 {
+            progenitor_client::Error::ErrorResponse(rv) => {
+                !rv.status().is_client_error()
+            }
+            _ => true,
+        }
+    }
+}
+
+/// An error that can be returned from an operation that changes the state of an
+/// instance on a specific sled.
+#[derive(Debug, thiserror::Error)]
+pub enum InstanceStateChangeError {
+    /// Sled agent returned an error from one of its instance endpoints.
+    #[error("sled agent client error: {0}")]
+    SledAgent(SledAgentInstancePutError),
+
+    /// Some other error occurred outside of the attempt to communicate with
+    /// sled agent.
+    #[error(transparent)]
+    Other(#[from] omicron_common::api::external::Error),
+}
+
+// Allow direct conversion of instance state change errors into API errors for
+// callers who don't care about the specific reason the update failed and just
+// need to return an API error.
+impl From<InstanceStateChangeError> for omicron_common::api::external::Error {
+    fn from(value: InstanceStateChangeError) -> Self {
+        match value {
+            InstanceStateChangeError::SledAgent(e) => e.into(),
+            InstanceStateChangeError::Other(e) => e,
+        }
+    }
+}
+
 /// The kinds of state changes that can be requested of an instance's current
 /// VMM (i.e. the VMM pointed to be the instance's `propolis_id` field).
 pub(crate) enum InstanceStateChangeRequest {
@@ -438,19 +515,30 @@ impl super::Nexus {
                 },
             )
             .await
-            .map(|res| Some(res.into_inner()));
+            .map(|res| Some(res.into_inner().into()))
+            .map_err(|e| SledAgentInstancePutError(e));
 
         // Write the updated instance runtime state back to CRDB. If this
         // outright fails, this operation fails. If the operation nominally
        // succeeds but nothing was updated, this action is outdated and the
        // caller should not proceed with migration.
-        let (updated, _) = self
-            .handle_instance_put_result(
-                &instance_id,
-                prev_instance_runtime,
-                instance_put_result.map(|state| state.map(Into::into)),
-            )
-            .await?;
+        let (updated, _) = match instance_put_result {
+            Ok(state) => {
+                self.write_returned_instance_state(&instance_id, state).await?
+            }
+            Err(e) => {
+                if e.instance_unhealthy() {
+                    let _ = self
+                        .mark_instance_failed(
+                            &instance_id,
+                            &prev_instance_runtime,
+                            &e,
+                        )
+                        .await;
+                }
+                return Err(e.into());
+            }
+        };
 
         if updated {
             Ok(self
@@ -498,14 +586,26 @@ impl super::Nexus {
                 },
             )
             .await
-            .map(|res| Some(res.into_inner()));
+            .map(|res| Some(res.into_inner().into()))
+            .map_err(|e| SledAgentInstancePutError(e));
 
-        self.handle_instance_put_result(
-            &instance_id,
-            prev_instance_runtime,
-            instance_put_result.map(|state| state.map(Into::into)),
-        )
-        .await?;
+        match instance_put_result {
+            Ok(state) => {
+                self.write_returned_instance_state(&instance_id, state).await?;
+            }
+            Err(e) => {
+                if e.instance_unhealthy() {
+                    let _ = self
+                        .mark_instance_failed(
+                            &instance_id,
+                            &prev_instance_runtime,
+                            &e,
+                        )
+                        .await;
+                }
+                return Err(e.into());
+            }
+        }
 
         Ok(())
     }
@@ -631,22 +731,18 @@ impl super::Nexus {
        opctx: &OpContext,
        authz_instance: &authz::Instance,
        sled_id: &Uuid,
-        prev_instance_runtime: &db::model::InstanceRuntimeState,
-    ) -> Result<(), Error> {
+    ) -> Result<Option<nexus::SledInstanceState>, InstanceStateChangeError>
+    {
        opctx.authorize(authz::Action::Modify, authz_instance).await?;
        let sa = self.sled_client(&sled_id).await?;
-        let result = sa
-            .instance_unregister(&authz_instance.id())
+        sa.instance_unregister(&authz_instance.id())
            .await
-            .map(|res| res.into_inner().updated_runtime);
-
-        self.handle_instance_put_result(
-            &authz_instance.id(),
-            prev_instance_runtime,
-            result.map(|state| state.map(Into::into)),
-        )
-        .await
-        .map(|_| ())
+            .map(|res| res.into_inner().updated_runtime.map(Into::into))
+            .map_err(|e| {
+                InstanceStateChangeError::SledAgent(SledAgentInstancePutError(
+                    e,
+                ))
+            })
    }
 
    /// Determines the action to take on an instance's active VMM given a
@@ -799,7 +895,7 @@ impl super::Nexus {
        prev_instance_state: &db::model::Instance,
        prev_vmm_state: &Option<db::model::Vmm>,
        requested: InstanceStateChangeRequest,
-    ) -> Result<(), Error> {
+    ) -> Result<(), InstanceStateChangeError> {
        opctx.authorize(authz::Action::Modify, authz_instance).await?;
 
        let instance_id = authz_instance.id();
@@ -817,16 +913,23 @@ impl super::Nexus {
                &InstancePutStateBody { state: requested.into() },
            )
            .await
-            .map(|res| res.into_inner().updated_runtime)
-            .map(|state| state.map(Into::into));
+            .map(|res| res.into_inner().updated_runtime.map(Into::into))
+            .map_err(|e| SledAgentInstancePutError(e));
 
-        self.handle_instance_put_result(
-            &instance_id,
-            prev_instance_state.runtime(),
-            instance_put_result,
-        )
-        .await
-        .map(|_| ())
+        // If the operation succeeded, write the instance state back,
+        // returning any subsequent errors that occurred during that
+        // write.
+        //
+        // If the operation failed, kick the sled agent error back up to
+        // the caller to let it decide how to handle it.
+        match instance_put_result {
+            Ok(state) => self
+                .write_returned_instance_state(&instance_id, state)
+                .await
+                .map(|_| ())
+                .map_err(Into::into),
+            Err(e) => Err(InstanceStateChangeError::SledAgent(e)),
+        }
    }
 }
 
@@ -1046,143 +1149,117 @@ impl super::Nexus {
                },
            )
            .await
-            .map(|res| Some(res.into_inner()));
+            .map(|res| Some(res.into_inner().into()))
+            .map_err(|e| SledAgentInstancePutError(e));
 
-        self.handle_instance_put_result(
-            &db_instance.id(),
-            db_instance.runtime(),
-            instance_register_result.map(|state| state.map(Into::into)),
-        )
-        .await
-        .map(|_| ())
+        match instance_register_result {
+            Ok(state) => {
+                self.write_returned_instance_state(&db_instance.id(), state)
+                    .await?;
+            }
+            Err(e) => {
+                if e.instance_unhealthy() {
+                    let _ = self
+                        .mark_instance_failed(
+                            &db_instance.id(),
+                            db_instance.runtime(),
+                            &e,
+                        )
+                        .await;
+                }
+                return Err(e.into());
+            }
+        }
+
+        Ok(())
    }
 
-    /// Updates an instance's CRDB record based on the result of a call to sled
-    /// agent that tried to update the instance's state.
-    ///
-    /// # Parameters
-    ///
-    /// - `db_instance`: The CRDB instance record observed by the caller before
-    ///   it attempted to update the instance's state.
-    /// - `result`: The result of the relevant sled agent operation. If this is
-    ///   `Ok`, the payload is the updated instance runtime state returned from
-    ///   sled agent, if there was one.
+    /// Takes an updated instance state returned from a call to sled agent and
+    /// writes it back to the database.
    ///
    /// # Return value
    ///
-    /// - `Ok(true)` if the caller supplied an updated instance record and this
-    ///   routine successfully wrote it to CRDB.
-    /// - `Ok(false)` if the sled agent call succeeded, but this routine did not
-    ///   update CRDB.
-    ///   This can happen either because sled agent didn't return an updated
-    ///   record or because the updated record was superseded by a state update
-    ///   with a more advanced generation number.
-    /// - `Err` if the sled agent operation failed or this routine received an
-    ///   error while trying to update CRDB.
-    async fn handle_instance_put_result(
+    /// - `Ok((instance_updated, vmm_updated))` if no failures occurred. The
+    ///   tuple fields indicate which database records (if any) were updated.
+    ///   Note that it is possible for sled agent not to return an updated
+    ///   instance state from a particular API call. In that case, the `state`
+    ///   parameter is `None` and this routine returns `Ok((false, false))`.
+    /// - `Err` if an error occurred while writing state to the database. A
+    ///   database operation that succeeds but doesn't update anything (e.g.
+    ///   owing to an outdated generation number) will return `Ok`.
+    async fn write_returned_instance_state(
        &self,
        instance_id: &Uuid,
-        prev_instance_runtime: &db::model::InstanceRuntimeState,
-        result: Result<
-            Option<nexus::SledInstanceState>,
-            sled_agent_client::Error<sled_agent_client::types::Error>,
-        >,
+        state: Option<nexus::SledInstanceState>,
    ) -> Result<(bool, bool), Error> {
-        slog::debug!(&self.log, "Handling sled agent instance PUT result";
+        slog::debug!(&self.log,
+                     "writing instance state returned from sled agent";
                     "instance_id" => %instance_id,
-                     "result" => ?result);
-
-        match result {
-            Ok(Some(new_state)) => {
-                let update_result = self
-                    .db_datastore
-                    .instance_and_vmm_update_runtime(
-                        instance_id,
-                        &new_state.instance_state.into(),
-                        &new_state.propolis_id,
-                        &new_state.vmm_state.into(),
-                    )
-                    .await;
+                     "new_state" => ?state);
 
-                slog::debug!(&self.log,
-                             "Attempted DB update after instance PUT";
-                             "instance_id" => %instance_id,
-                             "propolis_id" => %new_state.propolis_id,
-                             "result" => ?update_result);
-
-                update_result
-            }
-            Ok(None) => Ok((false, false)),
-            Err(e) => {
-                // The sled-agent has told us that it can't do what we
-                // requested, but does that mean a failure? One example would be
-                // if we try to "reboot" a stopped instance. That shouldn't
-                // transition the instance to failed. But if the sled-agent
-                // *can't* boot a stopped instance, that should transition
-                // to failed.
-                //
-                // Without a richer error type, let the sled-agent tell Nexus
-                // what to do with status codes.
-                error!(self.log, "received error from instance PUT";
-                       "instance_id" => %instance_id,
-                       "error" => ?e);
+        if let Some(state) = state {
+            let update_result = self
+                .db_datastore
+                .instance_and_vmm_update_runtime(
+                    instance_id,
+                    &state.instance_state.into(),
+                    &state.propolis_id,
+                    &state.vmm_state.into(),
+                )
+                .await;
 
-                // Convert to the Omicron API error type.
-                //
-                // TODO(#3238): This is an extremely lossy conversion: if the
-                // operation failed without getting a response from sled agent,
-                // this unconditionally converts to Error::InternalError.
-                let e = e.into();
-
-                match &e {
-                    // Bad request shouldn't change the instance state.
-                    Error::InvalidRequest { .. } => Err(e),
-
-                    // Internal server error (or anything else) should change
-                    // the instance state to failed, we don't know what state
-                    // the instance is in.
-                    //
-                    // TODO(#4226): This logic needs to be revisited:
-                    // - Some errors that don't get classified as
-                    //   Error::InvalidRequest (timeouts, disconnections due to
-                    //   network weather, etc.) are not necessarily fatal to the
-                    //   instance and shouldn't mark it as Failed.
-                    // - If the instance still has a running VMM, this operation
-                    //   won't terminate it or reclaim its resources. (The
-                    //   resources will be reclaimed if the sled later reports
-                    //   that the VMM is gone, however.)
-                    _ => {
-                        let new_runtime = db::model::InstanceRuntimeState {
-                            nexus_state: db::model::InstanceState::new(
-                                InstanceState::Failed,
-                            ),
+            slog::debug!(&self.log,
+                         "attempted to write instance state from sled agent";
+                         "instance_id" => %instance_id,
+                         "propolis_id" => %state.propolis_id,
+                         "result" => ?update_result);
 
-                            // TODO(#4226): Clearing the Propolis ID is required
-                            // to allow the instance to be deleted, but this
-                            // doesn't actually terminate the VMM (see above).
-                            propolis_id: None,
-                            gen: prev_instance_runtime.gen.next().into(),
-                            ..prev_instance_runtime.clone()
-                        };
+            update_result
+        } else {
+            Ok((false, false))
+        }
+    }
 
-                        // XXX what if this fails?
-                        let result = self
-                            .db_datastore
-                            .instance_update_runtime(&instance_id, &new_runtime)
-                            .await;
+    /// Attempts to move an instance from `prev_instance_runtime` to the
+    /// `Failed` state in response to an error returned from a call to a sled
+    /// agent instance API, supplied in `reason`.
+    pub(crate) async fn mark_instance_failed(
+        &self,
+        instance_id: &Uuid,
+        prev_instance_runtime: &db::model::InstanceRuntimeState,
+        reason: &SledAgentInstancePutError,
+    ) -> Result<(), Error> {
+        error!(self.log, "marking instance failed due to sled agent API error";
+               "instance_id" => %instance_id,
+               "error" => ?reason);
+
+        let new_runtime = db::model::InstanceRuntimeState {
+            nexus_state: db::model::InstanceState::new(InstanceState::Failed),
+
+            // TODO(#4226): Clearing the Propolis ID is required to allow the
+            // instance to be deleted, but this doesn't actually terminate the
+            // VMM.
+            propolis_id: None,
+            gen: prev_instance_runtime.gen.next().into(),
+            ..prev_instance_runtime.clone()
+        };
 
-                        error!(
-                            self.log,
-                            "attempted to set instance to Failed after bad put";
+        match self
+            .db_datastore
+            .instance_update_runtime(&instance_id, &new_runtime)
+            .await
+        {
+            Ok(_) => info!(self.log, "marked instance as Failed";
+                "instance_id" => %instance_id),
+            // XXX: It's not clear what to do with this error; should it be
+            // bubbled back up to the caller?
+            Err(e) => error!(self.log,
+                "failed to write Failed instance state to DB";
                "instance_id" => %instance_id,
-                            "result" => ?result,
-                        );
-
-                        Err(e)
-                    }
-                }
-            }
+                "error" => ?e),
        }
+
+        Ok(())
    }
 
    /// Lists disks attached to the instance.
diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs
index 5149825842..fd86e2052a 100644
--- a/nexus/src/app/sagas/instance_create.rs
+++ b/nexus/src/app/sagas/instance_create.rs
@@ -903,8 +903,7 @@ pub mod test {
    };
    use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection};
    use diesel::{
-        BoolExpressionMethods, ExpressionMethods, OptionalExtension, QueryDsl,
-        SelectableHelper,
+        ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper,
    };
    use dropshot::test_util::ClientTestContext;
    use nexus_db_queries::authn::saga::Serialized;
@@ -1073,68 +1072,6 @@ pub mod test {
            .unwrap()
    }
 
-    async fn no_virtual_provisioning_resource_records_exist(
-        datastore: &DataStore,
-    ) -> bool {
-        use nexus_db_queries::db::model::VirtualProvisioningResource;
-        use nexus_db_queries::db::schema::virtual_provisioning_resource::dsl;
-
-        let conn = datastore.pool_connection_for_tests().await.unwrap();
-
-        datastore
-            .transaction_retry_wrapper("no_virtual_provisioning_resource_records_exist")
-            .transaction(&conn, |conn| async move {
-                conn
-                    .batch_execute_async(nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL)
-                    .await
-                    .unwrap();
-
-                Ok(
-                    dsl::virtual_provisioning_resource
-                        .filter(dsl::resource_type.eq(nexus_db_queries::db::model::ResourceTypeProvisioned::Instance.to_string()))
-                        .select(VirtualProvisioningResource::as_select())
-                        .get_results_async::<VirtualProvisioningResource>(&conn)
-                        .await
-                        .unwrap()
-                        .is_empty()
-                )
-            }).await.unwrap()
-    }
-
-    async fn no_virtual_provisioning_collection_records_using_instances(
-        datastore: &DataStore,
-    ) -> bool {
-        use nexus_db_queries::db::model::VirtualProvisioningCollection;
-        use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl;
-
-        let conn = datastore.pool_connection_for_tests().await.unwrap();
-
-        datastore
-            .transaction_retry_wrapper(
-                "no_virtual_provisioning_collection_records_using_instances",
-            )
-            .transaction(&conn, |conn| async move {
-                conn.batch_execute_async(
-                    nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL,
-                )
-                .await
-                .unwrap();
-                Ok(dsl::virtual_provisioning_collection
-                    .filter(
-                        dsl::cpus_provisioned
-                            .ne(0)
-                            .or(dsl::ram_provisioned.ne(0)),
-                    )
-                    .select(VirtualProvisioningCollection::as_select())
-                    .get_results_async::<VirtualProvisioningCollection>(&conn)
-                    .await
-                    .unwrap()
-                    .is_empty())
-            })
-            .await
-            .unwrap()
-    }
-
    async fn disk_is_detached(datastore: &DataStore) -> bool {
        use nexus_db_queries::db::model::Disk;
        use nexus_db_queries::db::schema::disk::dsl;
@@ -1170,11 +1107,14 @@ pub mod test {
        assert!(no_external_ip_records_exist(datastore).await);
        assert!(no_sled_resource_instance_records_exist(datastore).await);
        assert!(
-            no_virtual_provisioning_resource_records_exist(datastore).await
+            test_helpers::no_virtual_provisioning_resource_records_exist(
+                cptestctx
+            )
+            .await
        );
        assert!(
-            no_virtual_provisioning_collection_records_using_instances(
-                datastore
+            test_helpers::no_virtual_provisioning_collection_records_using_instances(
+                cptestctx
            )
            .await
        );
diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs
index d32a20bc40..7a417a5781 100644
--- a/nexus/src/app/sagas/instance_migrate.rs
+++ b/nexus/src/app/sagas/instance_migrate.rs
@@ -3,7 +3,9 @@
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
 use super::{NexusActionContext, NexusSaga, ACTION_GENERATE_ID};
-use crate::app::instance::InstanceStateChangeRequest;
+use crate::app::instance::{
+    InstanceStateChangeError, InstanceStateChangeRequest,
+};
 use crate::app::sagas::{
    declare_saga_actions, instance_common::allocate_sled_ipv6,
 };
@@ -387,28 +389,26 @@ async fn sim_ensure_destination_propolis_undo(
           "prev_runtime_state" => ?db_instance.runtime());
 
    // Ensure that the destination sled has no Propolis matching the description
-    // the saga previously generated.
-    //
-    // Sled agent guarantees that if an instance is unregistered from a sled
-    // that does not believe it holds the "active" Propolis for the instance,
-    // then the sled's copy of the instance record will not change during
-    // unregistration. This precondition always holds here because the "start
-    // migration" step is not allowed to unwind once migration has possibly
-    // started. Not changing the instance is important here because the next
-    // undo step (clearing migration IDs) needs to advance the instance's
-    // generation number to succeed.
-    osagactx
+    // the saga previously generated. If this succeeds, or if it fails because
+    // the destination sled no longer knows about this instance, allow the rest
+    // of unwind to take care of cleaning up the migration IDs in the instance
+    // record. Otherwise the unwind has failed and manual intervention is
+    // needed.
+    match osagactx
        .nexus()
-        .instance_ensure_unregistered(
-            &opctx,
-            &authz_instance,
-            &dst_sled_id,
-            db_instance.runtime(),
-        )
+        .instance_ensure_unregistered(&opctx, &authz_instance, &dst_sled_id)
        .await
-        .map_err(ActionError::action_failed)?;
-
-    Ok(())
+    {
+        Ok(_) => Ok(()),
+        Err(InstanceStateChangeError::SledAgent(inner)) => {
+            if !inner.instance_unhealthy() {
+                Ok(())
+            } else {
+                Err(inner.0.into())
+            }
+        }
+        Err(e) => Err(e.into()),
+    }
 }
 
 async fn sim_instance_migrate(
@@ -454,7 +454,7 @@ async fn sim_instance_migrate(
    //
    // Possibly sled agent can help with this by using state or Propolis
    // generation numbers to filter out stale destruction requests.
-    osagactx
+    match osagactx
        .nexus()
        .instance_request_state(
            &opctx,
@@ -469,9 +469,30 @@ async fn sim_instance_migrate(
            ),
        )
        .await
-        .map_err(ActionError::action_failed)?;
+    {
+        Ok(_) => Ok(()),
+        // Failure to initiate migration to a specific target doesn't entail
+        // that the entire instance has failed, so handle errors by unwinding
+        // the saga without otherwise touching the instance's state.
+        Err(InstanceStateChangeError::SledAgent(inner)) => {
+            info!(osagactx.log(),
+                  "migration saga: sled agent failed to start migration";
+                  "instance_id" => %db_instance.id(),
+                  "error" => ?inner);
+
+            Err(ActionError::action_failed(
+                omicron_common::api::external::Error::from(inner),
+            ))
+        }
+        Err(InstanceStateChangeError::Other(inner)) => {
+            info!(osagactx.log(),
+                  "migration saga: internal error changing instance state";
+                  "instance_id" => %db_instance.id(),
+                  "error" => ?inner);
 
-    Ok(())
+            Err(ActionError::action_failed(inner))
+        }
+    }
 }
 
 #[cfg(test)]
diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs
index 76773d6369..e6717b0164 100644
--- a/nexus/src/app/sagas/instance_start.rs
+++ b/nexus/src/app/sagas/instance_start.rs
@@ -10,6 +10,7 @@ use super::{
    instance_common::allocate_sled_ipv6, NexusActionContext, NexusSaga,
    SagaInitError, ACTION_GENERATE_ID,
 };
+use crate::app::instance::InstanceStateChangeError;
 use crate::app::sagas::declare_saga_actions;
 use chrono::Utc;
 use nexus_db_queries::db::{identity::Resource, lookup::LookupPath};
@@ -52,11 +53,6 @@ declare_saga_actions! {
        - sis_move_to_starting_undo
    }
 
-    ADD_VIRTUAL_RESOURCES -> "virtual_resources" {
-        + sis_account_virtual_resources
-        - sis_account_virtual_resources_undo
-    }
-
    // TODO(#3879) This can be replaced with an action that triggers the NAT RPW
    // once such an RPW is available.
    DPD_ENSURE -> "dpd_ensure" {
@@ -74,6 +70,17 @@ declare_saga_actions! {
        - sis_ensure_registered_undo
    }
 
+    // Only account for the instance's resource consumption when the saga is on
+    // the brink of actually starting it. This allows prior steps' undo actions
+    // to change the instance's generation number if warranted (e.g. by moving
+    // the instance to the Failed state) without disrupting this step's undo
+    // action (which depends on the instance bearing the same generation number
+    // at undo time that it had at resource accounting time).
+    ADD_VIRTUAL_RESOURCES -> "virtual_resources" {
+        + sis_account_virtual_resources
+        - sis_account_virtual_resources_undo
+    }
+
    ENSURE_RUNNING -> "ensure_running" {
        + sis_ensure_running
    }
@@ -103,10 +110,10 @@ impl NexusSaga for SagaInstanceStart {
        builder.append(alloc_propolis_ip_action());
        builder.append(create_vmm_record_action());
        builder.append(mark_as_starting_action());
-        builder.append(add_virtual_resources_action());
        builder.append(dpd_ensure_action());
        builder.append(v2p_ensure_action());
        builder.append(ensure_registered_action());
+        builder.append(add_virtual_resources_action());
        builder.append(ensure_running_action());
        Ok(builder.build()?)
    }
@@ -575,18 +582,87 @@ async fn sis_ensure_registered_undo(
        .await
        .map_err(ActionError::action_failed)?;
 
-    osagactx
+    // If the sled successfully unregistered the instance, allow the rest of
+    // saga unwind to restore the instance record to its prior state (without
+    // writing back the state returned from sled agent). Otherwise, try to
+    // reason about the next action from the specific kind of error that was
+    // returned.
+    if let Err(e) = osagactx
        .nexus()
-        .instance_ensure_unregistered(
-            &opctx,
-            &authz_instance,
-            &sled_id,
-            db_instance.runtime(),
-        )
+        .instance_ensure_unregistered(&opctx, &authz_instance, &sled_id)
        .await
-        .map_err(ActionError::action_failed)?;
+    {
+        error!(osagactx.log(),
+               "start saga: failed to unregister instance from sled";
+               "instance_id" => %instance_id,
+               "error" => ?e);
+
+        // If the failure came from talking to sled agent, and the error code
+        // indicates the instance or sled might be unhealthy, manual
+        // intervention is likely to be needed, so try to mark the instance as
+        // Failed and then bail on unwinding.
+        //
+        // If sled agent is in good shape but just doesn't know about the
+        // instance, this saga still owns the instance's state, so allow
+        // unwinding to continue.
+        //
+        // If some other Nexus error occurred, this saga is in bad shape, so
+        // return an error indicating that intervention is needed without trying
+        // to modify the instance further.
+        //
+        // TODO(#3238): `instance_unhealthy` does not take an especially nuanced
+        // view of the meanings of the error codes sled agent could return, so
+        // assuming that an error that isn't `instance_unhealthy` means
+        // that everything is hunky-dory and it's OK to continue unwinding may
+        // be a bit of a stretch. See the definition of `instance_unhealthy` for
+        // more details.
+        match e {
+            InstanceStateChangeError::SledAgent(inner)
+                if inner.instance_unhealthy() =>
+            {
+                error!(osagactx.log(),
+                       "start saga: failing instance after unregister failure";
+                       "instance_id" => %instance_id,
+                       "error" => ?inner);
+
+                if let Err(set_failed_error) = osagactx
+                    .nexus()
+                    .mark_instance_failed(
+                        &instance_id,
+                        db_instance.runtime(),
+                        &inner,
+                    )
+                    .await
+                {
+                    error!(osagactx.log(),
+                           "start saga: failed to mark instance as failed";
+                           "instance_id" => %instance_id,
+                           "error" => ?set_failed_error);
+
+                    Err(set_failed_error.into())
+                } else {
+                    Err(inner.0.into())
+                }
+            }
+            InstanceStateChangeError::SledAgent(_) => {
+                info!(osagactx.log(),
+                      "start saga: instance already unregistered from sled";
+                      "instance_id" => %instance_id);
 
-    Ok(())
+                Ok(())
+            }
+            InstanceStateChangeError::Other(inner) => {
+                error!(osagactx.log(),
+                       "start saga: internal error unregistering instance";
+                       "instance_id" => %instance_id,
+                       "error" => ?inner);
+
+                Err(inner.into())
+            }
+        }
+    } else {
+        Ok(())
+    }
 }
 
 async fn sis_ensure_running(
@@ -615,7 +691,7 @@ async fn sis_ensure_running(
        .await
        .map_err(ActionError::action_failed)?;
 
-    osagactx
+    match osagactx
        .nexus()
        .instance_request_state(
            &opctx,
@@ -625,9 +701,30 @@ async fn sis_ensure_running(
            crate::app::instance::InstanceStateChangeRequest::Run,
        )
        .await
-        .map_err(ActionError::action_failed)?;
+    {
+        Ok(_) => Ok(()),
+        Err(InstanceStateChangeError::SledAgent(inner)) => {
+            info!(osagactx.log(),
+                  "start saga: sled agent failed to set instance to running";
+                  "instance_id" => %instance_id,
+                  "sled_id" => %sled_id,
+                  "error" => ?inner);
+
+            // Don't set the instance to Failed in this case. Instead, allow
+            // the saga to unwind and restore the instance to the Stopped
+            // state (matching what would happen if there were a failure
+            // prior to this point).
+            Err(ActionError::action_failed(Error::from(inner)))
+        }
+        Err(InstanceStateChangeError::Other(inner)) => {
+            info!(osagactx.log(),
+                  "start saga: internal error changing instance state";
+                  "instance_id" => %instance_id,
+                  "error" => ?inner);
 
-    Ok(())
+            Err(ActionError::action_failed(inner))
+        }
+    }
 }
 
 #[cfg(test)]
@@ -776,6 +873,9 @@ mod test {
                        new_db_instance.runtime().nexus_state.0,
                        InstanceState::Stopped
                    );
+
+                    assert!(test_helpers::no_virtual_provisioning_resource_records_exist(cptestctx).await);
+                    assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await);
                }
            })
        },
@@ -818,4 +918,80 @@ mod test {
 
        assert_eq!(vmm_state, InstanceState::Running);
    }
+
+    /// Tests that if a start saga unwinds because sled agent returned failure
+    /// from a call to ensure the instance was running, then the system returns
+    /// to the correct state.
+    ///
+    /// This is different from `test_action_failure_can_unwind` because that
+    /// test causes saga nodes to "fail" without actually executing anything,
+    /// whereas this test injects a failure into the normal operation of the
+    /// ensure-running node.
+    #[nexus_test(server = crate::Server)]
+    async fn test_ensure_running_unwind(cptestctx: &ControlPlaneTestContext) {
+        let client = &cptestctx.external_client;
+        let nexus = &cptestctx.server.apictx().nexus;
+        let _project_id = setup_test_project(&client).await;
+        let opctx = test_helpers::test_opctx(cptestctx);
+        let instance = create_instance(client).await;
+        let db_instance =
+            test_helpers::instance_fetch(cptestctx, instance.identity.id)
+                .await
+                .instance()
+                .clone();
+
+        let params = Params {
+            serialized_authn: authn::saga::Serialized::for_opctx(&opctx),
+            db_instance,
+        };
+
+        let dag = create_saga_dag::<SagaInstanceStart>(params).unwrap();
+
+        // The ensure_running node is last in the saga. This should be the node
+        // where the failure ultimately occurs.
+        let last_node_name = dag
+            .get_nodes()
+            .last()
+            .expect("saga should have at least one node")
+            .name()
+            .clone();
+
+        // Inject failure at the simulated sled agent level. This allows the
+        // ensure-running node to attempt to change the instance's state, but
+        // forces this operation to fail and produce whatever side effects
+        // result from that failure.
+        let sled_agent = &cptestctx.sled_agent.sled_agent;
+        sled_agent
+            .set_instance_ensure_state_error(Some(Error::internal_error(
+                "injected by test_ensure_running_unwind",
+            )))
+            .await;
+
+        let saga = nexus.create_runnable_saga(dag).await.unwrap();
+        let saga_error = nexus
+            .run_saga_raw_result(saga)
+            .await
+            .expect("saga execution should have started")
+            .kind
+            .expect_err("saga should fail due to injected error");
+
+        assert_eq!(saga_error.error_node_name, last_node_name);
+
+        let db_instance =
+            test_helpers::instance_fetch(cptestctx, instance.identity.id).await;
+
+        assert_eq!(
+            db_instance.instance().runtime_state.nexus_state,
+            nexus_db_model::InstanceState(InstanceState::Stopped)
+        );
+        assert!(db_instance.vmm().is_none());
+
+        assert!(
+            test_helpers::no_virtual_provisioning_resource_records_exist(
+                cptestctx
+            )
+            .await
+        );
+        assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await);
+    }
 }
diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs
index 3110bd318a..1b383d27bb 100644
--- a/nexus/src/app/sagas/test_helpers.rs
+++ b/nexus/src/app/sagas/test_helpers.rs
@@ -11,7 +11,9 @@ use crate::{
    Nexus,
 };
 use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection};
-use diesel::{ExpressionMethods, QueryDsl, SelectableHelper};
+use diesel::{
+    BoolExpressionMethods, ExpressionMethods, QueryDsl, SelectableHelper,
+};
 use futures::future::BoxFuture;
 use nexus_db_queries::{
    authz,
@@ -186,6 +188,68 @@ pub async fn instance_fetch(
    db_state
 }
 
+pub async fn no_virtual_provisioning_resource_records_exist(
+    cptestctx: &ControlPlaneTestContext,
+) -> bool {
+    use nexus_db_queries::db::model::VirtualProvisioningResource;
+    use nexus_db_queries::db::schema::virtual_provisioning_resource::dsl;
+
+    let datastore = cptestctx.server.apictx().nexus.datastore().clone();
+    let conn = datastore.pool_connection_for_tests().await.unwrap();
+
+    datastore
+        .transaction_retry_wrapper("no_virtual_provisioning_resource_records_exist")
+        .transaction(&conn, |conn| async move {
+            conn
+                .batch_execute_async(nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL)
+                .await
+                .unwrap();
+
+            Ok(
+                dsl::virtual_provisioning_resource
+                    .filter(dsl::resource_type.eq(nexus_db_queries::db::model::ResourceTypeProvisioned::Instance.to_string()))
+                    .select(VirtualProvisioningResource::as_select())
+                    .get_results_async::<VirtualProvisioningResource>(&conn)
+                    .await
+                    .unwrap()
+                    .is_empty()
+            )
+        }).await.unwrap()
+}
+
+pub async fn no_virtual_provisioning_collection_records_using_instances(
+    cptestctx: &ControlPlaneTestContext,
+) -> bool {
+    use nexus_db_queries::db::model::VirtualProvisioningCollection;
+    use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl;
+
+    let datastore = cptestctx.server.apictx().nexus.datastore().clone();
+    let conn = datastore.pool_connection_for_tests().await.unwrap();
+
+    datastore
+        .transaction_retry_wrapper(
+            "no_virtual_provisioning_collection_records_using_instances",
+        )
+        .transaction(&conn, |conn| async move {
+            conn.batch_execute_async(
+                nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL,
+            )
+            .await
+            .unwrap();
+            Ok(dsl::virtual_provisioning_collection
+                .filter(
+                    dsl::cpus_provisioned.ne(0).or(dsl::ram_provisioned.ne(0)),
+                )
+                .select(VirtualProvisioningCollection::as_select())
+                .get_results_async::<VirtualProvisioningCollection>(&conn)
+                .await
+                .unwrap()
+                .is_empty())
+        })
+        .await
+        .unwrap()
+}
+
 /// Tests that the saga described by `dag` succeeds if each of its nodes is
 /// repeated.
 ///
diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs
index 8dae31863c..bbc3e440ab 100644
--- a/sled-agent/src/sim/collection.rs
+++ b/sled-agent/src/sim/collection.rs
@@ -217,6 +217,16 @@ impl SimCollection {
        }
    }
 
+    /// Forcibly removes the object `id` from the collection without simulating
+    /// any further state changes for it.
+    pub async fn sim_force_remove(&self, id: Uuid) {
+        let mut objects = self.objects.lock().await;
+        let object = objects.remove(&id).unwrap();
+        if let Some(mut tx) = object.channel_tx {
+            tx.close_channel();
+        }
+    }
+
    /// Complete a desired asynchronous state transition for object `id`.
    /// This is invoked either by `sim_step()` (if the simulation mode is
    /// `SimMode::Auto`) or `instance_finish_transition` (if the simulation mode
diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs
index c06ae96f2e..a16049dd2f 100644
--- a/sled-agent/src/sim/sled_agent.rs
+++ b/sled-agent/src/sim/sled_agent.rs
@@ -68,6 +68,8 @@ pub struct SledAgent {
    pub v2p_mappings: Mutex<HashMap<Uuid, Vec<SetVirtualNetworkInterfaceHost>>>,
    mock_propolis: Mutex>, PropolisClient)>>,
+
+    instance_ensure_state_error: Mutex<Option<Error>>,
 }
 
 fn extract_targets_from_volume_construction_request(
@@ -159,6 +161,7 @@ impl SledAgent {
            disk_id_to_region_ids: Mutex::new(HashMap::new()),
            v2p_mappings: Mutex::new(HashMap::new()),
            mock_propolis: Mutex::new(None),
+            instance_ensure_state_error: Mutex::new(None),
        })
    }
 
@@ -343,15 +346,7 @@ impl SledAgent {
            updated_runtime: Some(instance.terminate()),
        };
 
-        // Poke the now-destroyed instance to force it to be removed from the
-        // collection.
-        //
-        // TODO: In the real sled agent, this happens inline without publishing
-        // any other state changes, whereas this call causes any pending state
-        // changes to be published. This can be fixed by adding a simulated
-        // object collection function to forcibly remove an object from a
-        // collection.
-        self.instances.sim_poke(instance_id, PokeMode::Drain).await;
+        self.instances.sim_force_remove(instance_id).await;
 
        Ok(response)
    }
 
@@ -361,6 +356,11 @@ impl SledAgent {
        instance_id: Uuid,
        state: InstanceStateRequested,
    ) -> Result<InstancePutStateResponse, Error> {
+        if let Some(e) = self.instance_ensure_state_error.lock().await.as_ref()
+        {
+            return Err(e.clone());
+        }
+
        let current =
            match self.instances.sim_get_cloned_object(&instance_id).await {
                Ok(i) => i.current().clone(),
@@ -416,6 +416,10 @@ impl SledAgent {
        Ok(InstancePutStateResponse { updated_runtime: Some(new_state) })
    }
 
+    pub async fn set_instance_ensure_state_error(&self, error: Option<Error>) {
+        *self.instance_ensure_state_error.lock().await = error;
+    }
+
    async fn detach_disks_from_instance(
        &self,
        instance_id: Uuid,
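
The sketch below is not part of the diff; it is a minimal illustration of how a caller inside the nexus crate might consume the new `InstanceStateChangeError` / `instance_unhealthy()` API, mirroring the pattern the sagas above follow. The function name `example_stop_on_sled` and the exact import paths are assumptions for illustration only.

// Illustrative sketch only (assumes the types introduced in this change are in
// scope via the paths used elsewhere in the diff).
use crate::app::instance::InstanceStateChangeError;
use nexus_db_queries::{authz, context::OpContext, db};
use omicron_common::api::external::Error;
use uuid::Uuid;

async fn example_stop_on_sled(
    nexus: &crate::Nexus,
    opctx: &OpContext,
    authz_instance: &authz::Instance,
    sled_id: &Uuid,
    prev_runtime: &db::model::InstanceRuntimeState,
) -> Result<(), Error> {
    match nexus
        .instance_ensure_unregistered(opctx, authz_instance, sled_id)
        .await
    {
        // Sled agent accepted the request (it may or may not have returned an
        // updated instance state for the caller to write back).
        Ok(_) => Ok(()),
        // Sled agent failed in a way that casts doubt on the instance's
        // health: try to mark it Failed, then surface the original error.
        Err(InstanceStateChangeError::SledAgent(inner))
            if inner.instance_unhealthy() =>
        {
            let _ = nexus
                .mark_instance_failed(&authz_instance.id(), prev_runtime, &inner)
                .await;
            Err(inner.into())
        }
        // Anything else converts directly into an external API error.
        Err(e) => Err(e.into()),
    }
}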