Skip to content

Commit

Permalink
messy WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
lif committed Jan 19, 2024
1 parent be37bfe commit 12e2dcb
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 30 deletions.
40 changes: 33 additions & 7 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ use sled_agent_client::types::InstancePutStateBody;
use std::matches;
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use tokio::io::{AsyncRead, AsyncWrite};
use uuid::Uuid;

Expand Down Expand Up @@ -956,14 +957,39 @@ impl super::Nexus {
// the caller to let it decide how to handle it.
//
// When creating the zone for the first time, we just get
// Ok(None) here, which is a no-op. We later asynchronously get
// a cpapi call invoking Self::write_returned_instance_state
// Ok(None) here, which is a no-op in write_returned_instance_state.
// (We later asynchronously receive a cpapi call that invokes
// write_returned_instance_state with the outcome.)
match instance_put_result {
Ok(state) => self
.write_returned_instance_state(&instance_id, state)
.await
.map(|_| ())
.map_err(Into::into),
Ok(state) => {
if state.is_none()
&& matches!(
requested,
InstanceStateChangeRequest::Run
| InstanceStateChangeRequest::Migrate(..)
)
{
// TODO: This is fragile -- suppose the nexus with
// this task crashes *and* the instance creation
// happens to also hang. The new nexus won't know
// to give up on the instance! In an ideal world
// this would be handled by a RPW that polls all
// the instances nexus knows about and finds any
// that have been in a still-starting state for too
// long -- say, InstanceRuntimeState::time_updated
// plus the timeout, assuming time_updated is the
// right point to measure from.
tokio::spawn(async {
tokio::time::sleep(Duration::from_secs(120))
.await;
todo: fail instance
})
}
self.write_returned_instance_state(&instance_id, state)
.await
.map(|_| ())
.map_err(Into::into)
}
Err(e) => Err(InstanceStateChangeError::SledAgent(e)),
}
}
Expand Down
57 changes: 40 additions & 17 deletions nexus/src/internal_api/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ use nexus_types::internal_api::params::SwitchPutRequest;
use nexus_types::internal_api::params::SwitchPutResponse;
use nexus_types::internal_api::views::to_list;
use nexus_types::internal_api::views::BackgroundTask;
use nexus_types::internal_api::views::HandleInstancePutResultResult;
use nexus_types::internal_api::views::Saga;
use omicron_common::api::external::http_pagination::data_page_params_for;
use omicron_common::api::external::http_pagination::PaginatedById;
Expand Down Expand Up @@ -283,42 +284,55 @@ async fn cpapi_instances_put(
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
}

/// Asynchronously report the successful result of a potentially long-running
/// instance_put call to sled-agent made during instance creation.
/// Asynchronously report the successful result of certain instance_put calls
/// (such as the potentially long-running one made during instance creation)
#[endpoint {
method = PUT,
path = "/instances/{instance_id}/creation-success",
}]
method = PUT,
path = "/instances/{instance_id}/creation-success",
}]
async fn cpapi_handle_instance_put_success(
rqctx: RequestContext<Arc<ServerContext>>,
path_params: Path<InstancePathParam>,
instance_state: TypedBody<SledInstanceState>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
) -> Result<HttpResponseOk<HandleInstancePutResultResult>, HttpError> {
let apictx = rqctx.context();
let nexus = &apictx.nexus;
let path = path_params.into_inner();
let result = Ok(instance_state.into_inner());
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
let handler = async {
nexus
// TODO: if instance_handle_creation_result errors because nexus gave
// up waiting and marked the instance as failed, tell sled-agent to
// destroy the instance
match nexus
.instance_handle_creation_result(&opctx, &path.instance_id, result)
.await?;
.await
{
Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()),
Err(err) => {
if todo {
Ok(HandleInstancePutResultResult::TimedOut.into())
} else {
Err(err)
}
}
}
Ok(HttpResponseUpdatedNoContent())
};
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
}

/// Asynchronously report the unsuccessful result of a potentially long-running
/// instance_put call to sled-agent made during instance creation.
/// Asynchronously report the unsuccessful result of certain instance_put calls
/// (such as the potentially long-running one made during instance creation)
#[endpoint {
method = PUT,
path = "/instances/{instance_id}/creation-failure",
}]
method = PUT,
path = "/instances/{instance_id}/creation-failure",
}]
async fn cpapi_handle_instance_put_failure(
rqctx: RequestContext<Arc<ServerContext>>,
path_params: Path<InstancePathParam>,
error: TypedBody<String>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
) -> Result<HttpResponseOk<HandleInstancePutResultResult>, HttpError> {
let apictx = rqctx.context();
let nexus = &apictx.nexus;
let path = path_params.into_inner();
Expand All @@ -327,10 +341,19 @@ async fn cpapi_handle_instance_put_failure(
});
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
let handler = async {
nexus
match nexus
.instance_handle_creation_result(&opctx, &path.instance_id, result)
.await?;
Ok(HttpResponseUpdatedNoContent())
.await
{
Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()),
Err(err) => {
if todo {
Ok(HandleInstancePutResultResult::TimedOut.into())
} else {
Err(err)
}
}
}
};
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
}
Expand Down
9 changes: 9 additions & 0 deletions nexus/types/src/internal_api/views.rs
Original file line number Diff line number Diff line change
Expand Up @@ -296,3 +296,12 @@ pub struct LastResultCompleted {
/// arbitrary datum emitted by the background task
pub details: serde_json::Value,
}

/// Tells sled-agent whether an instance whose status it's reporting is still
/// relevant, or if it's timed out during creation and been marked as failed
/// (such that sled-agent can destroy the tardy instance)
#[derive(Serialize, JsonSchema)]
pub enum HandleInstancePutResultResult {
Ok,
TimedOut,
}
17 changes: 11 additions & 6 deletions sled-agent/src/instance_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -376,18 +376,23 @@ impl InstanceManager {
match instance.put_state(target).await {
Ok(state) => {
let instance_state: nexus_client::types::SledInstanceState = state.into();
if let Err(err) = nexus_client
match nexus_client
.cpapi_handle_instance_put_success(
&instance_id,
&instance_state,
)
.await
{
error!(log, "Failed to inform Nexus of instance_put success";
"err" => %err,
"instance_state" => ?instance_state,
"instance_id" => %instance_id,
);
Ok(
TODO HandleInstancePutResultResult::Ok
) => {}
Err(err) => {
error!(log, "Failed to inform Nexus of instance_put success";
"err" => %err,
"instance_state" => ?instance_state,
"instance_id" => %instance_id,
)
}
}
}
Err(instance_put_error) => {
Expand Down

0 comments on commit 12e2dcb

Please sign in to comment.