Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add migration table and explicit migration tracking in sled-agent #5859

Merged
merged 34 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
370b73c
add migration table
hawkw Jun 5, 2024
d017eb0
add migration state to nexus API
hawkw Jun 5, 2024
9362651
okay i really hopep the CTE works
hawkw Jun 5, 2024
e6e5e14
unbreak CTE, et cetera
hawkw Jun 5, 2024
c8a0c7c
fix migration name
hawkw Jun 5, 2024
00c2c93
add generation numbers
hawkw Jun 5, 2024
7f95255
regen openapi again
hawkw Jun 6, 2024
6304817
add timestamps
hawkw Jun 6, 2024
d57060f
actually create and delete migration records
hawkw Jun 6, 2024
77e3080
sled-agent api plumbing
hawkw Jun 6, 2024
6b63d22
fix CTE syntax error
hawkw Jun 6, 2024
7f3c7fa
add test that migration state updates
hawkw Jun 6, 2024
45d7844
clippy
hawkw Jun 6, 2024
c6311fb
make simulated sled agent complete src migration
hawkw Jun 7, 2024
43254ad
do the actual correct simulated migration
hawkw Jun 7, 2024
f649f82
Merge branch 'main' into eliza/migration-table
hawkw Jun 7, 2024
fd5e49c
Update common/src/api/internal/nexus.rs
hawkw Jun 7, 2024
631ba0f
add pending state
hawkw Jun 7, 2024
1aecc88
fix sled-agent sim panicking when unsetting migration
hawkw Jun 7, 2024
270368f
fix unexpected null
hawkw Jun 7, 2024
47cfd55
fix places where i got source/target backwards
hawkw Jun 10, 2024
f399eb1
misc. @gjcolombo review feedback
hawkw Jun 10, 2024
8e75658
@smklein's review feedback
hawkw Jun 10, 2024
762febc
update tests to expect migration states
hawkw Jun 11, 2024
7634307
update tests, start out in pending
hawkw Jun 11, 2024
e4f7c6b
don't generate spurious state transitions
hawkw Jun 11, 2024
9d0de2b
delete records when a migration completes
hawkw Jun 11, 2024
f6c9875
report migration failure if an in-progress vmm vanishes
hawkw Jun 11, 2024
77a6dfe
update comment
hawkw Jun 11, 2024
29eae36
fix comment
hawkw Jun 11, 2024
18ccc7e
rm code from other branch
hawkw Jun 11, 2024
00ad191
oops
hawkw Jun 11, 2024
4e3e4f2
Merge branch 'main' into eliza/migration-table
hawkw Jun 12, 2024
a1ca946
Update sled-agent/src/common/instance.rs
hawkw Jun 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions common/src/api/internal/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,12 @@ impl MigrationState {
Self::Failed => "failed",
}
}
/// Returns `true` if this migration state means that the migration is no
/// longer in progress (it has either succeeded or failed).
#[must_use]
pub fn is_terminal(&self) -> bool {
matches!(self, MigrationState::Completed | MigrationState::Failed)
}
}

impl fmt::Display for MigrationState {
Expand Down
2 changes: 0 additions & 2 deletions nexus/db-model/src/migration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ pub struct Migration {
/// The time at which this migration record was deleted,
pub time_deleted: Option<DateTime<Utc>>,

/// The time at which the source VMM state was last updated.

/// The state of the migration source VMM.
pub source_state: MigrationState,

Expand Down
9 changes: 9 additions & 0 deletions nexus/db-model/src/migration_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,15 @@ impl_enum_wrapper!(
Failed => b"failed"
);

impl MigrationState {
/// Returns `true` if this migration state means that the migration is no
/// longer in progress (it has either succeeded or failed).
#[must_use]
pub fn is_terminal(&self) -> bool {
self.0.is_terminal()
}
}

impl fmt::Display for MigrationState {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(&self.0, f)
Expand Down
38 changes: 30 additions & 8 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,25 @@ pub enum UpdaterLockError {
Query(#[from] Error),
}

/// The result of an [`DataStore::instance_and_vmm_update_runtime`] call,
/// indicating which records were updated.
#[derive(Copy, Clone, Debug)]
pub struct InstanceUpdateResult {
/// `true` if the instance record was updated, `false` otherwise.
pub instance_updated: bool,
/// `true` if the VMM record was updated, `false` otherwise.
pub vmm_updated: bool,
/// Indicates whether a migration record for this instance was updated, if a
/// [`MigrationRuntimeState`] was provided to
/// [`DataStore::instance_and_vmm_update_runtime`].
///
/// - `Some(true)` if a migration record was updated
/// - `Some(false)` if a [`MigrationRuntimeState`] was provided, but the
/// migration record was not updated
/// - `None` if no [`MigrationRuntimeState`] was provided
pub migration_updated: Option<bool>,
}

impl DataStore {
/// Idempotently insert a database record for an Instance
///
Expand Down Expand Up @@ -373,12 +392,11 @@ impl DataStore {
///
/// # Return value
///
/// - `Ok((instance_updated, vmm_updated))` if the query was issued
/// successfully. `instance_updated` and `vmm_updated` are each true if
/// the relevant item was updated and false otherwise. Note that an update
/// can fail because it was inapplicable (i.e. the database has state with
/// a newer generation already) or because the relevant record was not
/// found.
/// - `Ok(`[`InstanceUpdateResult`]`)` if the query was issued
/// successfully. The returned [`InstanceUpdateResult`] indicates which
/// database record(s) were updated. Note that an update can fail because
/// it was inapplicable (i.e. the database has state with a newer
/// generation already) or because the relevant record was not found.
/// - `Err` if another error occurred while accessing the database.
pub async fn instance_and_vmm_update_runtime(
&self,
Expand All @@ -387,7 +405,7 @@ impl DataStore {
vmm_id: &Uuid,
new_vmm: &VmmRuntimeState,
migration: &Option<MigrationRuntimeState>,
) -> Result<(bool, bool, Option<bool>), Error> {
) -> Result<InstanceUpdateResult, Error> {
let query = crate::db::queries::instance::InstanceAndVmmUpdate::new(
*instance_id,
new_instance.clone(),
Expand Down Expand Up @@ -427,7 +445,11 @@ impl DataStore {
None
};

Ok((instance_updated, vmm_updated, migration_updated))
Ok(InstanceUpdateResult {
instance_updated,
vmm_updated,
migration_updated,
})
}

/// Lists all instances on in-service sleds with active Propolis VMM
Expand Down
36 changes: 35 additions & 1 deletion nexus/db-queries/src/db/datastore/migration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use super::DataStore;
use crate::context::OpContext;
use crate::db::error::public_error_from_diesel;
use crate::db::error::ErrorHandler;
use crate::db::model::Migration;
use crate::db::model::{Migration, MigrationState};
use crate::db::schema::migration::dsl;
use crate::db::update_and_check::UpdateAndCheck;
use crate::db::update_and_check::UpdateStatus;
Expand All @@ -17,6 +17,7 @@ use chrono::Utc;
use diesel::prelude::*;
use omicron_common::api::external::CreateResult;
use omicron_common::api::external::UpdateResult;
use omicron_common::api::internal::nexus;
use uuid::Uuid;

impl DataStore {
Expand All @@ -37,6 +38,39 @@ impl DataStore {
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

/// Marks a migration record as deleted if and only if both sides of the
/// migration are in a terminal state.
pub async fn migration_terminate(
&self,
opctx: &OpContext,
migration_id: Uuid,
) -> UpdateResult<bool> {
const TERMINAL_STATES: &[MigrationState] = &[
MigrationState(nexus::MigrationState::Completed),
MigrationState(nexus::MigrationState::Failed),
];

diesel::update(dsl::migration)
.filter(dsl::id.eq(migration_id))
.filter(dsl::time_deleted.is_null())
.filter(dsl::source_state.eq_any(TERMINAL_STATES))
.filter(dsl::target_state.eq_any(TERMINAL_STATES))
.set(dsl::time_deleted.eq(Utc::now()))
.check_if_exists::<Migration>(migration_id)
.execute_and_check(&*self.pool_connection_authorized(opctx).await?)
.await
.map(|r| match r.status {
UpdateStatus::Updated => true,
UpdateStatus::NotUpdatedButExists => false,
})
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

/// Unconditionally mark a migration record as deleted.
///
/// This is distinct from [`DataStore::migration_terminate`], as it will
/// mark a migration as deleted regardless of the states of the source and
/// target VMMs.
pub async fn migration_mark_deleted(
hawkw marked this conversation as resolved.
Show resolved Hide resolved
&self,
opctx: &OpContext,
Expand Down
8 changes: 7 additions & 1 deletion nexus/db-queries/src/db/queries/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@ use crate::db::update_and_check::UpdateStatus;
// SELECT vmm_result.found, vmm_result.updated, instance_result.found,
// instance_result.updated
// FROM vmm_result, instance_result;
///
/// If a [`MigrationRuntimeState`] is provided, similar "found" and "update"
/// clauses are also added to join the `migration` record for the instance's
/// active migration, if one exists, and update the migration record. If no
/// migration record is provided, this part of the query is skipped, and the
/// `migration_found` and `migration_updated` portions are always `false`.
//
// The "wrapper" SELECTs when finding instances and VMMs are used to get a NULL
// result in the final output instead of failing the entire query if the target
Expand Down Expand Up @@ -222,7 +228,7 @@ impl InstanceAndVmmUpdate {
.filter(migration_dsl::source_gen.lt(gen))
.set((
migration_dsl::source_state.eq(state),
migration_dsl::time_target_updated
migration_dsl::time_source_updated
.eq(time_updated),
)),
),
Expand Down
106 changes: 68 additions & 38 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use nexus_db_queries::authn;
use nexus_db_queries::authz;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db;
use nexus_db_queries::db::datastore::instance::InstanceUpdateResult;
use nexus_db_queries::db::datastore::InstanceAndActiveVmm;
use nexus_db_queries::db::identity::Resource;
use nexus_db_queries::db::lookup;
Expand Down Expand Up @@ -563,28 +564,27 @@ impl super::Nexus {
// outright fails, this operation fails. If the operation nominally
// succeeds but nothing was updated, this action is outdated and the
// caller should not proceed with migration.
let (updated, _, _) = match instance_put_result {
Ok(state) => {
self.write_returned_instance_state(&instance_id, state).await?

// TODO(eliza): this is where we would also want to insert to the
// migration table...
}
Err(e) => {
if e.instance_unhealthy() {
let _ = self
.mark_instance_failed(
&instance_id,
&prev_instance_runtime,
&e,
)
.await;
let InstanceUpdateResult { instance_updated, .. } =
match instance_put_result {
Ok(state) => {
self.write_returned_instance_state(&instance_id, state)
.await?
}
return Err(e.into());
}
};
Err(e) => {
if e.instance_unhealthy() {
let _ = self
.mark_instance_failed(
&instance_id,
&prev_instance_runtime,
&e,
)
.await;
}
return Err(e.into());
}
};

if updated {
if instance_updated {
Ok(self
.db_datastore
.instance_refetch(opctx, &authz_instance)
Expand Down Expand Up @@ -1324,7 +1324,7 @@ impl super::Nexus {
&self,
instance_id: &Uuid,
state: Option<nexus::SledInstanceState>,
) -> Result<(bool, bool, Option<bool>), Error> {
) -> Result<InstanceUpdateResult, Error> {
slog::debug!(&self.log,
"writing instance state returned from sled agent";
"instance_id" => %instance_id,
Expand All @@ -1350,7 +1350,13 @@ impl super::Nexus {

update_result
} else {
Ok((false, false, None))
// There was no instance state to write back, so --- perhaps
// obviously --- nothing happened.
Ok(InstanceUpdateResult {
instance_updated: false,
vmm_updated: false,
migration_updated: None,
})
}
}

Expand Down Expand Up @@ -1958,16 +1964,6 @@ impl super::Nexus {
}
}

/// Records what aspects of an instance's state were actually changed in a
/// [`notify_instance_updated`] call.
///
/// This is (presently) used for debugging purposes only.
#[derive(Copy, Clone)]
pub(crate) struct InstanceUpdated {
pub instance_updated: bool,
pub vmm_updated: bool,
}

/// Invoked by a sled agent to publish an updated runtime state for an
/// Instance.
#[allow(clippy::too_many_arguments)] // :(
Expand All @@ -1980,7 +1976,7 @@ pub(crate) async fn notify_instance_updated(
instance_id: &Uuid,
new_runtime_state: &nexus::SledInstanceState,
v2p_notification_tx: tokio::sync::watch::Sender<()>,
) -> Result<Option<InstanceUpdated>, Error> {
) -> Result<Option<InstanceUpdateResult>, Error> {
let propolis_id = new_runtime_state.propolis_id;

info!(log, "received new runtime state from sled agent";
Expand Down Expand Up @@ -2080,6 +2076,40 @@ pub(crate) async fn notify_instance_updated(
)
.await;

// Has a migration terminated? If so,mark the migration record as deleted if
// and only if both sides of the migration are in a terminal state.
if let Some(nexus::MigrationRuntimeState {
migration_id,
state,
role,
..
}) = new_runtime_state.migration_state
{
if state.is_terminal() {
info!(
log,
"migration has terminated, trying to delete it...";
"instance_id" => %instance_id,
"propolis_id" => %propolis_id,
"migration_id" => %propolis_id,
"migration_state" => %state,
"migration_role" => %role,
);
if !datastore.migration_terminate(opctx, migration_id).await? {
info!(
log,
"did not mark migration record as deleted (the other half \
may not yet have reported termination)";
"instance_id" => %instance_id,
"propolis_id" => %propolis_id,
"migration_id" => %propolis_id,
"migration_state" => %state,
"migration_role" => %role,
);
}
}
}

// If the VMM is now in a terminal state, make sure its resources get
// cleaned up.
//
Expand Down Expand Up @@ -2118,14 +2148,14 @@ pub(crate) async fn notify_instance_updated(
}

match result {
Ok((instance_updated, vmm_updated, migration_updated)) => {
Ok(result) => {
info!(log, "instance and vmm updated by sled agent";
"instance_id" => %instance_id,
"propolis_id" => %propolis_id,
"instance_updated" => instance_updated,
"vmm_updated" => vmm_updated,
"migration_updated" => ?migration_updated);
Ok(Some(InstanceUpdated { instance_updated, vmm_updated }))
"instance_updated" => result.instance_updated,
"vmm_updated" => result.vmm_updated,
"migration_updated" => ?result.migration_updated);
Ok(Some(result))
}

// The update command should swallow object-not-found errors and
Expand Down
11 changes: 6 additions & 5 deletions nexus/src/app/sagas/instance_migrate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,16 @@ declare_saga_actions! {
+ sim_allocate_propolis_ip
}

CREATE_VMM_RECORD -> "dst_vmm_record" {
+ sim_create_vmm_record
- sim_destroy_vmm_record
}

CREATE_MIGRATION_RECORD -> "migration_record" {
+ sim_create_migration_record
- sim_delete_migration_record
}

CREATE_VMM_RECORD -> "dst_vmm_record" {
+ sim_create_vmm_record
- sim_destroy_vmm_record
}

// This step the instance's migration ID and destination Propolis ID
// fields. Because the instance is active, its current sled agent maintains
Expand Down Expand Up @@ -132,8 +133,8 @@ impl NexusSaga for SagaInstanceMigrate {

builder.append(reserve_resources_action());
builder.append(allocate_propolis_ip_action());
builder.append(create_migration_record_action());
builder.append(create_vmm_record_action());
builder.append(create_migration_record_action());
builder.append(set_migration_ids_action());
builder.append(ensure_destination_propolis_action());
builder.append(instance_migrate_action());
Expand Down
4 changes: 4 additions & 0 deletions nexus/tests/integration_tests/instances.rs
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,9 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) {
let datastore =
cptestctx.server.server_context().nexus.datastore().clone();
let db_state = dsl::migration
// N.B. that for the purposes of this test, we explicitly should
// *not* filter out migrations that are marked as deleted, as the
// migration record is marked as deleted once the migration completes.
.filter(dsl::id.eq(migration_id))
.select(Migration::as_select())
.get_results_async::<Migration>(
Expand Down Expand Up @@ -824,6 +827,7 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) {
let migration = dbg!(migration_fetch(cptestctx, migration_id).await);
assert_eq!(migration.target_state, MigrationState::Completed.into());
assert_eq!(migration.source_state, MigrationState::Completed.into());
assert!(migration.time_deleted.is_some());
}

#[nexus_test]
Expand Down
Loading
Loading