Skip to content

Commit

Permalink
wip: might be worth testing now...
Browse files (browse the repository at this point in the history)
  • Loading branch information
lif committed Jan 23, 2024
1 parent 1f121c6 commit 10b8851
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 10 deletions.
58 changes: 50 additions & 8 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -983,10 +983,35 @@ impl super::Nexus {
// long -- say, InstanceRuntimeState::time_updated
// plus the timeout, assuming time_updated is the
// right point to measure from.
tokio::spawn(async {
let prev_instance_runtime =
prev_instance_state.runtime_state.clone();
let db_datastore_weak =
Arc::downgrade(&self.db_datastore);
let log = self
.log
.new(o!("component" => "Instance timeout"));
tokio::spawn(async move {
tokio::time::sleep(Duration::from_secs(120))
.await;
todo!("fail instance")
if let Some(db_datastore) =
db_datastore_weak.upgrade()
{
Self::mark_instance_failed_inner(
&db_datastore,
&instance_id,
&prev_instance_runtime,
"Timed out waiting for instance state change.",
&log,
)
.await
.ok();
} else {
error!(
log,
"DataStore no longer exists to mark instance failed after instance state change timeout.";
"instance_id" => %instance_id
);
}
});
}
self.write_returned_instance_state(&instance_id, state)
Expand Down Expand Up @@ -1043,7 +1068,8 @@ impl super::Nexus {
&state.instance().runtime_state,
error,
)
.await
.await?;
Ok(HandleInstancePutResultResult::Ok)
}
}
}
Expand Down Expand Up @@ -1343,7 +1369,24 @@ impl super::Nexus {
prev_instance_runtime: &db::model::InstanceRuntimeState,
reason: impl std::fmt::Debug,
) -> Result<(), Error> {
error!(self.log, "marking instance failed due to sled agent API error";
Self::mark_instance_failed_inner(
&self.db_datastore,
instance_id,
prev_instance_runtime,
reason,
&self.log,
)
.await
}

async fn mark_instance_failed_inner(
db_datastore: &Arc<db::DataStore>,
instance_id: &Uuid,
prev_instance_runtime: &db::model::InstanceRuntimeState,
reason: impl std::fmt::Debug,
log: &slog::Logger,
) -> Result<(), Error> {
error!(log, "marking instance failed due to sled agent API error";
"instance_id" => %instance_id,
"error" => ?reason);

Expand All @@ -1358,16 +1401,15 @@ impl super::Nexus {
..prev_instance_runtime.clone()
};

match self
.db_datastore
match db_datastore
.instance_update_runtime(&instance_id, &new_runtime)
.await
{
Ok(_) => info!(self.log, "marked instance as Failed";
Ok(_) => info!(log, "marked instance as Failed";
"instance_id" => %instance_id),
// XXX: It's not clear what to do with this error; should it be
// bubbled back up to the caller?
Err(e) => error!(self.log,
Err(e) => error!(log,
"failed to write Failed instance state to DB";
"instance_id" => %instance_id,
"error" => ?e),
Expand Down
12 changes: 10 additions & 2 deletions sled-agent/src/instance_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,11 @@ impl InstanceManager {
{
Ok(HandleInstancePutResultResult::Ok) => {}
Ok(HandleInstancePutResultResult::TimedOut) => {
todo!("nexus doesn't want us any more, terminate instance")
error!(log, "Nexus gave up waiting for us to finish creating the propolis zone and abandoned the instance. Rudely terminating it from our side."; "instance_id" => %instance_id);
if let Err(err) = instance.terminate().await
{
error!(log, "Couldn't terminate instance whose creation was timed-out by Nexus"; "instance_id" => %instance_id, "err" => %err);
}
}
Err(err) => {
error!(log, "Failed to inform Nexus of instance_put success";
Expand All @@ -409,7 +413,11 @@ impl InstanceManager {
{
Ok(HandleInstancePutResultResult::Ok) => {}
Ok(HandleInstancePutResultResult::TimedOut) => {
todo!("well, i guess this is less awkward but clean up if we have to")
error!(log, "Nexus gave up waiting for us to finish creating the propolis zone and abandoned the instance, but the instance also explicitly failed on our side. Rudely terminating what remains of it."; "instance_id" => %instance_id);
if let Err(err) = instance.terminate().await
{
error!(log, "Couldn't terminate faulted instance (whose creation was also timed-out by Nexus)"; "instance_id" => %instance_id, "err" => %err);
}
}
Err(err) => {
error!(log, "Failed to inform Nexus of instance_put failure";
Expand Down

0 comments on commit 10b8851

Please sign in to comment.