diff --git a/Cargo.lock b/Cargo.lock index 4f3f315330..1d36953da1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6871,6 +6871,7 @@ dependencies = [ "glob", "guppy", "hex", + "hex-literal", "http 1.1.0", "hyper 1.4.1", "hyper-staticfile", @@ -6905,12 +6906,15 @@ dependencies = [ "propolis_api_types", "rand", "rcgen", + "repo-depot-api", + "repo-depot-client", "reqwest 0.12.8", "schemars", "semver 1.0.23", "serde", "serde_human_bytes", "serde_json", + "sha2", "sha3", "sled-agent-api", "sled-agent-client", @@ -7214,6 +7218,7 @@ dependencies = [ "openapiv3", "owo-colors", "oximeter-api", + "repo-depot-api", "serde_json", "similar", "sled-agent-api", @@ -9128,6 +9133,29 @@ version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" +[[package]] +name = "repo-depot-api" +version = "0.1.0" +dependencies = [ + "dropshot", + "omicron-common", + "omicron-workspace-hack", + "schemars", + "serde", +] + +[[package]] +name = "repo-depot-client" +version = "0.1.0" +dependencies = [ + "omicron-workspace-hack", + "progenitor", + "reqwest 0.12.8", + "schemars", + "serde", + "slog", +] + [[package]] name = "reqwest" version = "0.11.27" diff --git a/Cargo.toml b/Cargo.toml index 84d93a1ec7..b7a96f4331 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ members = [ "clients/nexus-client", "clients/oxide-client", "clients/oximeter-client", + "clients/repo-depot-client", "clients/sled-agent-client", "clients/wicketd-client", "cockroach-admin", @@ -100,6 +101,7 @@ members = [ "sled-agent", "sled-agent/api", "sled-agent/bootstrap-agent-api", + "sled-agent/repo-depot-api", "sled-agent/types", "sled-hardware", "sled-hardware/types", @@ -140,6 +142,7 @@ default-members = [ "clients/nexus-client", "clients/oxide-client", "clients/oximeter-client", + "clients/repo-depot-client", "clients/sled-agent-client", "clients/wicketd-client", "cockroach-admin", @@ -225,6 +228,7 @@ default-members = [ "sled-agent", "sled-agent/api", "sled-agent/bootstrap-agent-api", + "sled-agent/repo-depot-api", "sled-agent/types", "sled-hardware", "sled-hardware/types", @@ -533,6 +537,8 @@ reedline = "0.35.0" ref-cast = "1.0" regex = "1.11.0" regress = "0.9.1" +repo-depot-api = { path = "sled-agent/repo-depot-api" } +repo-depot-client = { path = "clients/repo-depot-client" } reqwest = { version = "0.12", default-features = false } ring = "0.17.8" rpassword = "7.3.1" diff --git a/clients/repo-depot-client/Cargo.toml b/clients/repo-depot-client/Cargo.toml new file mode 100644 index 0000000000..858c75632f --- /dev/null +++ b/clients/repo-depot-client/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "repo-depot-client" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +omicron-workspace-hack.workspace = true +progenitor.workspace = true +reqwest.workspace = true +schemars.workspace = true +serde.workspace = true +slog.workspace = true diff --git a/clients/repo-depot-client/src/lib.rs b/clients/repo-depot-client/src/lib.rs new file mode 100644 index 0000000000..69e21cdaf3 --- /dev/null +++ b/clients/repo-depot-client/src/lib.rs @@ -0,0 +1,24 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Interface for Sled Agent's Repo Depot to make API requests. + +progenitor::generate_api!( + spec = "../../openapi/repo-depot.json", + inner_type = slog::Logger, + pre_hook = (|log: &slog::Logger, request: &reqwest::Request| { + slog::debug!(log, "client request"; + "method" => %request.method(), + "uri" => %request.url(), + "body" => ?&request.body(), + ); + }), + post_hook = (|log: &slog::Logger, result: &Result<_, _>| { + slog::debug!(log, "client response"; "result" => ?result); + }), + derives = [schemars::JsonSchema], +); + +/// A type alias for errors returned by this crate. +pub type ClientError = crate::Error; diff --git a/common/src/address.rs b/common/src/address.rs index 7cf00d5228..7e6d68ebc8 100644 --- a/common/src/address.rs +++ b/common/src/address.rs @@ -29,6 +29,7 @@ pub const MIN_PORT: u16 = u16::MIN; pub const DNS_PORT: u16 = 53; pub const DNS_HTTP_PORT: u16 = 5353; pub const SLED_AGENT_PORT: u16 = 12345; +pub const REPO_DEPOT_PORT: u16 = 12348; pub const COCKROACH_PORT: u16 = 32221; pub const COCKROACH_ADMIN_PORT: u16 = 32222; diff --git a/dev-tools/ls-apis/api-manifest.toml b/dev-tools/ls-apis/api-manifest.toml index b273b908fc..ab5dd4dec8 100644 --- a/dev-tools/ls-apis/api-manifest.toml +++ b/dev-tools/ls-apis/api-manifest.toml @@ -278,6 +278,11 @@ client_package_name = "sled-agent-client" label = "Sled Agent" server_package_name = "sled-agent-api" +[[apis]] +client_package_name = "repo-depot-client" +label = "Repo Depot API" +server_package_name = "repo-depot-api" + [[apis]] client_package_name = "wicketd-client" label = "Wicketd" diff --git a/dev-tools/ls-apis/tests/api_dependencies.out b/dev-tools/ls-apis/tests/api_dependencies.out index 938091cefb..aee8cd7a70 100644 --- a/dev-tools/ls-apis/tests/api_dependencies.out +++ b/dev-tools/ls-apis/tests/api_dependencies.out @@ -71,6 +71,9 @@ Propolis (client: propolis-client) Crucible Repair (client: repair-client) consumed by: crucible-downstairs (crucible/downstairs) via 1 path +Repo Depot API (client: repo-depot-client) + consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path + Sled Agent (client: sled-agent-client) consumed by: dpd (dendrite/dpd) via 1 path consumed by: omicron-nexus (omicron/nexus) via 7 paths diff --git a/dev-tools/openapi-manager/Cargo.toml b/dev-tools/openapi-manager/Cargo.toml index 211e134016..d32477caf3 100644 --- a/dev-tools/openapi-manager/Cargo.toml +++ b/dev-tools/openapi-manager/Cargo.toml @@ -12,9 +12,9 @@ anyhow.workspace = true atomicwrites.workspace = true bootstrap-agent-api.workspace = true camino.workspace = true +clap.workspace = true clickhouse-admin-api.workspace = true cockroach-admin-api.workspace = true -clap.workspace = true dns-server-api.workspace = true dropshot.workspace = true fs-err.workspace = true @@ -24,13 +24,14 @@ installinator-api.workspace = true nexus-external-api.workspace = true nexus-internal-api.workspace = true omicron-workspace-hack.workspace = true -openapiv3.workspace = true openapi-lint.workspace = true openapi-manager-types.workspace = true +openapiv3.workspace = true owo-colors.workspace = true oximeter-api.workspace = true +repo-depot-api.workspace = true serde_json.workspace = true -sled-agent-api.workspace = true similar.workspace = true +sled-agent-api.workspace = true supports-color.workspace = true wicketd-api.workspace = true diff --git a/dev-tools/openapi-manager/src/spec.rs b/dev-tools/openapi-manager/src/spec.rs index dafcebac05..ff55bbeff5 100644 --- a/dev-tools/openapi-manager/src/spec.rs +++ b/dev-tools/openapi-manager/src/spec.rs @@ -121,6 +121,15 @@ pub fn all_apis() -> Vec { filename: "oximeter.json", extra_validation: None, }, + ApiSpec { + title: "Oxide TUF Repo Depot API", + version: "0.0.1", + description: "API for fetching update artifacts", + boundary: ApiBoundary::Internal, + api_description: repo_depot_api::repo_depot_api_mod::stub_api_description, + filename: "repo-depot.json", + extra_validation: None, + }, ApiSpec { title: "Oxide Sled Agent API", version: "0.0.1", diff --git a/openapi/repo-depot.json b/openapi/repo-depot.json new file mode 100644 index 0000000000..0c0019cf8d --- /dev/null +++ b/openapi/repo-depot.json @@ -0,0 +1,82 @@ +{ + "openapi": "3.0.3", + "info": { + "title": "Oxide TUF Repo Depot API", + "description": "API for fetching update artifacts", + "contact": { + "url": "https://oxide.computer", + "email": "api@oxide.computer" + }, + "version": "0.0.1" + }, + "paths": { + "/artifact/sha256/{sha256}": { + "get": { + "summary": "Fetch an artifact from the depot.", + "operationId": "artifact_get_by_sha256", + "parameters": [ + { + "in": "path", + "name": "sha256", + "required": true, + "schema": { + "type": "string", + "format": "hex string (32 bytes)" + } + } + ], + "responses": { + "200": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + } + }, + "components": { + "schemas": { + "Error": { + "description": "Error information from a response.", + "type": "object", + "properties": { + "error_code": { + "type": "string" + }, + "message": { + "type": "string" + }, + "request_id": { + "type": "string" + } + }, + "required": [ + "message", + "request_id" + ] + } + }, + "responses": { + "Error": { + "description": "Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + } +} diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 0345bbf9e8..b79cb467ec 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -10,6 +10,149 @@ "version": "0.0.1" }, "paths": { + "/artifacts": { + "get": { + "operationId": "artifact_list", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Map_of_uint", + "type": "object", + "additionalProperties": { + "type": "integer", + "format": "uint", + "minimum": 0 + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/artifacts/{sha256}": { + "put": { + "operationId": "artifact_put", + "parameters": [ + { + "in": "path", + "name": "sha256", + "required": true, + "schema": { + "type": "string", + "format": "hex string (32 bytes)" + } + } + ], + "requestBody": { + "content": { + "application/octet-stream": { + "schema": { + "type": "string", + "format": "binary" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ArtifactPutResponse" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "operationId": "artifact_delete", + "parameters": [ + { + "in": "path", + "name": "sha256", + "required": true, + "schema": { + "type": "string", + "format": "hex string (32 bytes)" + } + } + ], + "responses": { + "204": { + "description": "successful deletion" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/artifacts/{sha256}/copy-from-depot": { + "post": { + "operationId": "artifact_copy_from_depot", + "parameters": [ + { + "in": "path", + "name": "sha256", + "required": true, + "schema": { + "type": "string", + "format": "hex string (32 bytes)" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ArtifactCopyFromDepotBody" + } + } + }, + "required": true + }, + "responses": { + "202": { + "description": "successfully enqueued operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ArtifactCopyFromDepotResponse" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/boot-disk/{boot_disk}/os/write": { "post": { "summary": "Write a new host OS image to the specified boot disk", @@ -1417,6 +1560,41 @@ "start_request" ] }, + "ArtifactCopyFromDepotBody": { + "type": "object", + "properties": { + "depot_base_url": { + "type": "string" + } + }, + "required": [ + "depot_base_url" + ] + }, + "ArtifactCopyFromDepotResponse": { + "type": "object" + }, + "ArtifactPutResponse": { + "type": "object", + "properties": { + "datasets": { + "description": "The number of valid M.2 artifact datasets we found on the sled. There is typically one of these datasets for each functional M.2.", + "type": "integer", + "format": "uint", + "minimum": 0 + }, + "successful_writes": { + "description": "The number of valid writes to the M.2 artifact datasets. This should be less than or equal to the number of artifact datasets.", + "type": "integer", + "format": "uint", + "minimum": 0 + } + }, + "required": [ + "datasets", + "successful_writes" + ] + }, "Baseboard": { "description": "Describes properties that should uniquely identify a Gimlet.", "oneOf": [ diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 360ba7f499..557dcbcb4e 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -64,12 +64,15 @@ propolis_api_types.workspace = true propolis-client.workspace = true propolis-mock-server.workspace = true # Only used by the simulated sled agent rand = { workspace = true, features = ["getrandom"] } +repo-depot-api.workspace = true +repo-depot-client.workspace = true reqwest = { workspace = true, features = ["rustls-tls", "stream"] } schemars = { workspace = true, features = ["chrono", "uuid1"] } semver.workspace = true serde.workspace = true serde_human_bytes.workspace = true serde_json = { workspace = true, features = ["raw_value"] } +sha2.workspace = true sha3.workspace = true sled-agent-api.workspace = true sled-agent-client.workspace = true @@ -105,6 +108,7 @@ opte-ioctl.workspace = true assert_matches.workspace = true expectorate.workspace = true guppy.workspace = true +hex-literal.workspace = true http.workspace = true hyper.workspace = true omicron-test-utils.workspace = true diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index e0d76a857b..b5608602f2 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -2,13 +2,15 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use std::{collections::BTreeMap, time::Duration}; +use std::collections::BTreeMap; +use std::time::Duration; use camino::Utf8PathBuf; use dropshot::{ - FreeformBody, HttpError, HttpResponseCreated, HttpResponseDeleted, - HttpResponseHeaders, HttpResponseOk, HttpResponseUpdatedNoContent, Path, - Query, RequestContext, StreamingBody, TypedBody, + FreeformBody, HttpError, HttpResponseAccepted, HttpResponseCreated, + HttpResponseDeleted, HttpResponseHeaders, HttpResponseOk, + HttpResponseUpdatedNoContent, Path, Query, RequestContext, StreamingBody, + TypedBody, }; use nexus_sled_agent_shared::inventory::{ Inventory, OmicronZonesConfig, SledRole, @@ -25,6 +27,7 @@ use omicron_common::{ DatasetsConfig, DatasetsManagementResult, DiskVariant, DisksManagementResult, OmicronPhysicalDisksConfig, }, + update::ArtifactHash, }; use omicron_uuid_kinds::{PropolisUuid, ZpoolUuid}; use schemars::JsonSchema; @@ -301,6 +304,43 @@ pub trait SledAgentApi { artifact: TypedBody, ) -> Result; + #[endpoint { + method = GET, + path = "/artifacts" + }] + async fn artifact_list( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + #[endpoint { + method = POST, + path = "/artifacts/{sha256}/copy-from-depot" + }] + async fn artifact_copy_from_depot( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError>; + + #[endpoint { + method = PUT, + path = "/artifacts/{sha256}" + }] + async fn artifact_put( + rqctx: RequestContext, + path_params: Path, + body: StreamingBody, + ) -> Result, HttpError>; + + #[endpoint { + method = DELETE, + path = "/artifacts/{sha256}" + }] + async fn artifact_delete( + rqctx: RequestContext, + path_params: Path, + ) -> Result; + /// Take a snapshot of a disk that is attached to an instance #[endpoint { method = POST, @@ -547,6 +587,30 @@ pub struct DiskPathParam { pub disk_id: Uuid, } +#[derive(Deserialize, JsonSchema)] +pub struct ArtifactPathParam { + pub sha256: ArtifactHash, +} + +#[derive(Deserialize, JsonSchema)] +pub struct ArtifactCopyFromDepotBody { + pub depot_base_url: String, +} + +#[derive(Serialize, JsonSchema)] +pub struct ArtifactCopyFromDepotResponse {} + +#[derive(Debug, Serialize, JsonSchema)] +pub struct ArtifactPutResponse { + /// The number of valid M.2 artifact datasets we found on the sled. There is + /// typically one of these datasets for each functional M.2. + pub datasets: usize, + + /// The number of valid writes to the M.2 artifact datasets. This should be + /// less than or equal to the number of artifact datasets. + pub successful_writes: usize, +} + #[derive(Deserialize, JsonSchema)] pub struct VmmIssueDiskSnapshotRequestPathParam { pub propolis_id: PropolisUuid, diff --git a/sled-agent/repo-depot-api/Cargo.toml b/sled-agent/repo-depot-api/Cargo.toml new file mode 100644 index 0000000000..f9fa60ad8b --- /dev/null +++ b/sled-agent/repo-depot-api/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "repo-depot-api" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +dropshot.workspace = true +omicron-common.workspace = true +omicron-workspace-hack.workspace = true +schemars.workspace = true +serde.workspace = true diff --git a/sled-agent/repo-depot-api/src/lib.rs b/sled-agent/repo-depot-api/src/lib.rs new file mode 100644 index 0000000000..236b9c8e7a --- /dev/null +++ b/sled-agent/repo-depot-api/src/lib.rs @@ -0,0 +1,28 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use dropshot::{FreeformBody, HttpError, HttpResponseOk, Path, RequestContext}; +use omicron_common::update::ArtifactHash; +use schemars::JsonSchema; +use serde::Deserialize; + +#[dropshot::api_description] +pub trait RepoDepotApi { + type Context; + + /// Fetch an artifact from the depot. + #[endpoint { + method = GET, + path = "/artifact/sha256/{sha256}", + }] + async fn artifact_get_by_sha256( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; +} + +#[derive(Clone, Debug, Deserialize, JsonSchema)] +pub struct ArtifactPathParams { + pub sha256: ArtifactHash, +} diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs new file mode 100644 index 0000000000..fc0dc4a20a --- /dev/null +++ b/sled-agent/src/artifact_store.rs @@ -0,0 +1,910 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Manages TUF artifacts stored on this sled. The implementation is a very +//! basic content-addressed object store. +//! +//! GET operations are handled by the "Repo Depot" API, which is deliberately +//! a separate Dropshot service from the rest of Sled Agent. This is to avoid a +//! circular logical dependency, because we expect Sled Agent to fetch artifacts +//! it does not have from another Repo Depot that does have them (at Nexus's +//! direction). This API's implementation is also part of this module. +//! +//! POST, PUT, and DELETE operations are called by Nexus and handled by the Sled +//! Agent API. + +use std::collections::BTreeMap; +use std::io::ErrorKind; +use std::net::SocketAddrV6; +use std::str::FromStr; +use std::time::Duration; + +use camino::{Utf8Path, Utf8PathBuf}; +use camino_tempfile::{NamedUtf8TempFile, Utf8TempPath}; +use dropshot::{ + Body, ConfigDropshot, FreeformBody, HttpError, HttpResponseOk, + HttpServerStarter, Path, RequestContext, StreamingBody, +}; +use futures::{Stream, TryStreamExt}; +use http::StatusCode; +use omicron_common::address::REPO_DEPOT_PORT; +use omicron_common::disk::{DatasetKind, DatasetsConfig}; +use omicron_common::update::ArtifactHash; +use repo_depot_api::*; +use sha2::{Digest, Sha256}; +use sled_agent_api::ArtifactPutResponse; +use sled_storage::dataset::M2_ARTIFACT_DATASET; +use sled_storage::error::Error as StorageError; +use sled_storage::manager::StorageHandle; +use slog::{error, info, Logger}; +use slog_error_chain::SlogInlineError; +use tokio::fs::{File, OpenOptions}; +use tokio::io::AsyncWriteExt; + +const TEMP_SUBDIR: &str = "tmp"; + +/// Content-addressable local storage for software artifacts. +/// +/// Storage for artifacts is backed by datasets that are explicitly designated +/// for this purpose. The `T: DatasetsManager` parameter, which varies between +/// the real sled agent, the simulated sled agent, and unit tests, specifies +/// exactly which datasets are available for artifact storage. That's the only +/// thing `T` is used for. The behavior of storing artifacts as files under +/// one or more paths is identical for all callers (i.e., both the real and +/// simulated sled agents). +/// +/// A given artifact is generally stored on both datasets designated for +/// artifact storage across both M.2 devices, but we attempt to be resilient to +/// a failing or missing M.2 device. This means: +/// +/// - for PUT, we try to write to all datasets, logging errors as we go; if we +/// successfully write the artifact to at least one, we return OK. +/// - for GET, we look in each dataset until we find it. +/// - for DELETE, we attempt to delete it from each dataset, logging errors as +/// we go, and failing if we saw any errors. +#[derive(Clone)] +pub(crate) struct ArtifactStore { + log: Logger, + reqwest_client: reqwest::Client, + storage: T, +} + +impl ArtifactStore { + pub(crate) fn new(log: &Logger, storage: T) -> ArtifactStore { + ArtifactStore { + log: log.new(slog::o!("component" => "ArtifactStore")), + reqwest_client: reqwest::ClientBuilder::new() + .connect_timeout(Duration::from_secs(15)) + .read_timeout(Duration::from_secs(15)) + .build() + .unwrap(), + storage, + } + } +} + +impl ArtifactStore { + pub(crate) async fn start( + self, + sled_address: SocketAddrV6, + dropshot_config: &ConfigDropshot, + ) -> Result>, StartError> + { + // In the real sled agent, the update datasets are durable and may + // retain temporary files leaked during a crash. Upon startup, we + // attempt to remove the subdirectory we store temporary files in, + // logging an error if that fails. + // + // (This function is part of `start` instead of `new` out of + // convenience: this function already needs to be async and fallible, + // but `new` doesn't; and all the sled agent implementations that don't + // call this function also don't need to run cleanup.) + for mountpoint in self + .storage + .artifact_storage_paths() + .await + .map_err(StartError::DatasetConfig)? + { + let path = mountpoint.join(TEMP_SUBDIR); + if let Err(err) = tokio::fs::remove_dir_all(&path).await { + if err.kind() != ErrorKind::NotFound { + // We log an error here because we expect that if we are + // having disk I/O errors, something else (fmd?) will + // identify those issues and bubble them up to the operator. + // (As of writing this comment that is not true but we + // expect this to exist in the limit, and refusing to start + // Sled Agent because of a problem with a single FRU seems + // inappropriate.) + error!( + &self.log, + "Failed to remove stale temporary artifacts"; + "error" => &err, + "path" => path.as_str(), + ); + } + } + } + + let mut depot_address = sled_address; + depot_address.set_port(REPO_DEPOT_PORT); + + let log = self.log.new(o!("component" => "dropshot (Repo Depot)")); + Ok(HttpServerStarter::new( + &ConfigDropshot { + bind_address: depot_address.into(), + ..dropshot_config.clone() + }, + repo_depot_api_mod::api_description::() + .expect("registered entrypoints"), + self, + &log, + ) + .map_err(StartError::Dropshot)? + .start()) + } +} + +#[derive(Debug, thiserror::Error)] +pub enum StartError { + #[error("Error retrieving dataset configuration")] + DatasetConfig(#[source] sled_storage::error::Error), + + #[error("Dropshot error while starting Repo Depot service")] + Dropshot(#[source] Box), +} + +macro_rules! log_and_store { + ($last_error:expr, $log:expr, $verb:literal, $path:expr, $err:expr) => {{ + error!( + $log, + concat!("Failed to ", $verb, " path"); + "error" => &$err, + "path" => $path.as_str(), + ); + $last_error = Some(Error::File { verb: $verb, path: $path, err: $err }); + }}; +} + +impl ArtifactStore { + /// GET operation (served by Repo Depot API) + /// + /// We try all datasets, returning early if we find the artifact, logging + /// errors as we go. If we don't find it we return the most recent error we + /// logged or a NotFound. + pub(crate) async fn get( + &self, + sha256: ArtifactHash, + ) -> Result { + let sha256_str = sha256.to_string(); + let mut last_error = None; + for mountpoint in self.storage.artifact_storage_paths().await? { + let path = mountpoint.join(&sha256_str); + match File::open(&path).await { + Ok(file) => { + info!( + &self.log, + "Retrieved artifact"; + "sha256" => &sha256_str, + "path" => path.as_str(), + ); + return Ok(file); + } + Err(err) if err.kind() == ErrorKind::NotFound => {} + Err(err) => { + log_and_store!(last_error, &self.log, "open", path, err); + } + } + } + Err(last_error.unwrap_or(Error::NotFound { sha256 })) + } + + /// List operation (served by Sled Agent API) + /// + /// We try all datasets, logging errors as we go; if we're experiencing I/O + /// errors, Nexus should still be aware of the artifacts we think we have. + pub(crate) async fn list( + &self, + ) -> Result, Error> { + let mut map = BTreeMap::new(); + let mut any_datasets = false; + for mountpoint in self.storage.artifact_storage_paths().await? { + any_datasets = true; + let mut read_dir = match tokio::fs::read_dir(&mountpoint).await { + Ok(read_dir) => read_dir, + Err(err) => { + error!( + &self.log, + "Failed to read dir"; + "error" => &err, + "path" => mountpoint.as_str(), + ); + continue; + } + }; + // The semantics of tokio::fs::ReadDir are weird. At least with + // `std::fs::ReadDir`, we know when the end of the iterator is, + // because `.next()` returns `Option>`; we could + // theoretically log the error and continue trying to retrieve + // elements from the iterator (but whether this makes sense to do + // is not documented and likely system-dependent). + // + // The Tokio version returns `Result>`, which + // has no indication of whether there might be more items in + // the stream! (The stream adapter in tokio-stream simply calls + // `Result::transpose()`, so in theory an error is not the end of + // the stream.) + // + // For lack of any direction we stop reading entries from the stream + // on the first error. That way we at least don't get stuck retrying + // an operation that will always fail. + loop { + match read_dir.next_entry().await { + Ok(Some(entry)) => { + if let Ok(file_name) = entry.file_name().into_string() { + if let Ok(hash) = ArtifactHash::from_str(&file_name) + { + *map.entry(hash).or_default() += 1; + } + } + } + Ok(None) => break, + Err(err) => { + error!( + &self.log, + "Failed to read dir"; + "error" => &err, + "path" => mountpoint.as_str(), + ); + break; + } + } + } + } + if any_datasets { + Ok(map) + } else { + Err(Error::NoUpdateDataset) + } + } + + /// Common implementation for all artifact write operations that creates + /// a temporary file on all datasets. Returns an [`ArtifactWriter`] that + /// can be used to write the artifact to all temporary files, then move all + /// temporary files to their final paths. + /// + /// Most errors during the write process are considered non-fatal errors, + /// which are logged instead of immediately returned. + /// + /// In this method, possible fatal errors are: + /// - No temporary files could be created. + /// - A temporary file already exists (another task is writing to this + /// artifact). + async fn writer( + &self, + sha256: ArtifactHash, + ) -> Result { + let mut files = Vec::new(); + let mut last_error = None; + let mut datasets = 0; + for mountpoint in self.storage.artifact_storage_paths().await? { + datasets += 1; + let temp_dir = mountpoint.join(TEMP_SUBDIR); + if let Err(err) = tokio::fs::create_dir(&temp_dir).await { + if err.kind() != ErrorKind::AlreadyExists { + log_and_store!( + last_error, &self.log, "create", temp_dir, err + ); + continue; + } + } + + let temp_path = + Utf8TempPath::from_path(temp_dir.join(sha256.to_string())); + let file = match OpenOptions::new() + .write(true) + .create_new(true) + .open(&temp_path) + .await + { + Ok(file) => file, + Err(err) => { + if err.kind() == ErrorKind::AlreadyExists { + return Err(Error::AlreadyInProgress { sha256 }); + } else { + let path = temp_path.to_path_buf(); + log_and_store!( + last_error, &self.log, "create", path, err + ); + continue; + } + } + }; + let file = NamedUtf8TempFile::from_parts(file, temp_path); + + files.push(Some((file, mountpoint))); + } + if files.is_empty() { + Err(last_error.unwrap_or(Error::NoUpdateDataset)) + } else { + Ok(ArtifactWriter { + datasets, + hasher: Sha256::new(), + files, + log: self.log.clone(), + sha256, + }) + } + } + + /// PUT operation (served by Sled Agent API) which takes a [`StreamingBody`] + pub(crate) async fn put_body( + &self, + sha256: ArtifactHash, + body: StreamingBody, + ) -> Result { + self.writer(sha256) + .await? + .write_stream(body.into_stream().map_err(Error::Body)) + .await + } + + /// POST operation (served by Sled Agent API) + pub(crate) async fn copy_from_depot( + &self, + sha256: ArtifactHash, + depot_base_url: &str, + ) -> Result<(), Error> { + let client = repo_depot_client::Client::new_with_client( + depot_base_url, + self.reqwest_client.clone(), + self.log.new(slog::o!( + "component" => "Repo Depot client (ArtifactStore)", + "base_url" => depot_base_url.to_owned(), + )), + ); + // Check that there's no conflict before we send the upstream request. + let writer = self.writer(sha256).await?; + let response = client + .artifact_get_by_sha256(&sha256.to_string()) + .await + .map_err(|err| Error::DepotCopy { + sha256, + base_url: depot_base_url.to_owned(), + err, + })?; + // Copy from the stream on its own task and immediately return. + let log = self.log.clone(); + let base_url = depot_base_url.to_owned(); + tokio::task::spawn(async move { + let stream = response.into_inner().into_inner().map_err(|err| { + Error::DepotCopy { + sha256, + base_url: base_url.clone(), + err: repo_depot_client::ClientError::ResponseBodyError(err), + } + }); + if let Err(err) = writer.write_stream(stream).await { + error!( + &log, + "Failed to write artifact"; + "err" => &err, + ); + } + }); + Ok(()) + } + + /// DELETE operation (served by Sled Agent API) + /// + /// We attempt to delete the artifact in all datasets, logging errors as we + /// go. If any errors occurred we return the most recent error we logged. + pub(crate) async fn delete( + &self, + sha256: ArtifactHash, + ) -> Result<(), Error> { + let sha256 = sha256.to_string(); + let mut any_datasets = false; + let mut last_error = None; + for mountpoint in self.storage.artifact_storage_paths().await? { + any_datasets = true; + let path = mountpoint.join(&sha256); + match tokio::fs::remove_file(&path).await { + Ok(()) => { + info!( + &self.log, + "Removed artifact"; + "sha256" => &sha256, + "path" => path.as_str(), + ); + } + Err(err) if err.kind() == ErrorKind::NotFound => {} + Err(err) => { + log_and_store!(last_error, &self.log, "remove", path, err); + } + } + } + if let Some(last_error) = last_error { + Err(last_error) + } else if any_datasets { + Ok(()) + } else { + // If we're here because there aren't any update datasets, we should + // report Service Unavailable instead of a successful result. + Err(Error::NoUpdateDataset) + } + } +} + +/// Abstracts over what kind of sled agent we are; each of the real sled agent, +/// simulated sled agent, and this module's unit tests have different ways of +/// keeping track of the datasets on the system. +pub(crate) trait DatasetsManager: Sync { + async fn artifact_storage_paths( + &self, + ) -> Result + '_, StorageError>; +} + +/// Iterator `.filter().map()` common to `DatasetsManager` implementations. +pub(crate) fn filter_dataset_mountpoints( + config: DatasetsConfig, + root: &Utf8Path, +) -> impl Iterator + '_ { + config + .datasets + .into_values() + .filter(|dataset| *dataset.name.dataset() == DatasetKind::Update) + .map(|dataset| dataset.name.mountpoint(root)) +} + +impl DatasetsManager for StorageHandle { + async fn artifact_storage_paths( + &self, + ) -> Result + '_, StorageError> { + // TODO: When datasets are managed by Reconfigurator (#6229), + // this should be changed to use `self.datasets_config_list()` and + // `filter_dataset_mountpoints`. + Ok(self + .get_latest_disks() + .await + .all_m2_mountpoints(M2_ARTIFACT_DATASET) + .into_iter()) + } +} + +/// Abstraction that handles writing to several temporary files. +struct ArtifactWriter { + datasets: usize, + files: Vec, Utf8PathBuf)>>, + hasher: Sha256, + log: Logger, + sha256: ArtifactHash, +} + +impl ArtifactWriter { + /// Calls [`ArtifactWriter::write`] for each chunk in the stream, then + /// [`ArtifactWriter::finalize`]. See the documentation for these functions + /// for error handling information. + async fn write_stream( + self, + stream: impl Stream, Error>>, + ) -> Result { + let writer = stream + .try_fold(self, |mut writer, chunk| async { + writer.write(chunk).await?; + Ok(writer) + }) + .await?; + writer.finalize().await + } + + /// Write `chunk` to all temporary files. + /// + /// Errors in this method are considered non-fatal errors. All errors + /// are logged. If all files have failed, this method returns the most + /// recently-seen non-fatal error as a fatal error. + async fn write(&mut self, chunk: impl AsRef<[u8]>) -> Result<(), Error> { + self.hasher.update(&chunk); + + let mut last_error = None; + for option in &mut self.files { + if let Some((mut file, mountpoint)) = option.take() { + match file.as_file_mut().write_all(chunk.as_ref()).await { + Ok(()) => { + *option = Some((file, mountpoint)); + } + Err(err) => { + let path = file.path().to_owned(); + log_and_store!( + last_error, &self.log, "write to", path, err + ); + // `file` and `final_path` are dropped here, cleaning up + // the file + } + } + } + } + + self.files.retain(Option::is_some); + if self.files.is_empty() { + Err(last_error.unwrap_or(Error::NoUpdateDataset)) + } else { + Ok(()) + } + } + + /// Rename all files to their final paths. + /// + /// Errors in this method are considered non-fatal errors. If all files have + /// failed in some way, the most recently-seen error is returned as a fatal + /// error. + async fn finalize(self) -> Result { + let digest = self.hasher.finalize(); + if digest.as_slice() != self.sha256.as_ref() { + return Err(Error::HashMismatch { + expected: self.sha256, + actual: ArtifactHash(digest.into()), + }); + } + + let mut last_error = None; + let mut successful_writes = 0; + for (mut file, mountpoint) in self.files.into_iter().flatten() { + // 1. fsync the temporary file. + if let Err(err) = file.as_file_mut().sync_all().await { + let path = file.path().to_owned(); + log_and_store!(last_error, &self.log, "sync", path, err); + continue; + } + // 2. Open the parent directory so we can fsync it. + let parent_dir = match File::open(&mountpoint).await { + Ok(dir) => dir, + Err(err) => { + log_and_store!( + last_error, &self.log, "open", mountpoint, err + ); + continue; + } + }; + // 3. Rename the temporary file. + let final_path = mountpoint.join(self.sha256.to_string()); + let moved_final_path = final_path.clone(); + if let Err(err) = tokio::task::spawn_blocking(move || { + file.persist(&moved_final_path) + }) + .await? + { + error!( + &self.log, + "Failed to rename temporary file"; + "error" => &err.error, + "from" => err.file.path().as_str(), + "to" => final_path.as_str(), + ); + last_error = Some(Error::FileRename { + from: err.file.path().to_owned(), + to: final_path, + err: err.error, + }); + continue; + } + // 4. fsync the parent directory. + if let Err(err) = parent_dir.sync_all().await { + log_and_store!(last_error, &self.log, "sync", mountpoint, err); + continue; + } + + successful_writes += 1; + } + + if successful_writes > 0 { + info!( + &self.log, + "Wrote artifact"; + "sha256" => &self.sha256.to_string(), + "datasets" => self.datasets, + "successful_writes" => successful_writes, + ); + Ok(ArtifactPutResponse { + datasets: self.datasets, + successful_writes, + }) + } else { + Err(last_error.unwrap_or(Error::NoUpdateDataset)) + } + } +} + +/// Implementation of the Repo Depot API backed by an +/// `ArtifactStore`. +enum RepoDepotImpl {} + +impl RepoDepotApi for RepoDepotImpl { + type Context = ArtifactStore; + + async fn artifact_get_by_sha256( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let sha256 = path_params.into_inner().sha256; + let file = rqctx.context().get(sha256).await?; + let file_access = hyper_staticfile::vfs::TokioFileAccess::new(file); + let file_stream = + hyper_staticfile::util::FileBytesStream::new(file_access); + let body = Body::wrap(hyper_staticfile::Body::Full(file_stream)); + Ok(HttpResponseOk(FreeformBody(body))) + } +} + +#[derive(Debug, thiserror::Error, SlogInlineError)] +pub(crate) enum Error { + #[error("Another task is already writing artifact {sha256}")] + AlreadyInProgress { sha256: ArtifactHash }, + + #[error("Error while reading request body")] + Body(dropshot::HttpError), + + #[error("Error retrieving dataset configuration")] + DatasetConfig(#[from] sled_storage::error::Error), + + #[error("Error fetching artifact {sha256} from depot at {base_url}")] + DepotCopy { + sha256: ArtifactHash, + base_url: String, + #[source] + err: repo_depot_client::ClientError, + }, + + #[error("Failed to {verb} `{path}`")] + File { + verb: &'static str, + path: Utf8PathBuf, + #[source] + err: std::io::Error, + }, + + #[error("Failed to rename `{from}` to `{to}`")] + FileRename { + from: Utf8PathBuf, + to: Utf8PathBuf, + #[source] + err: std::io::Error, + }, + + #[error("Digest mismatch: expected {expected}, actual {actual}")] + HashMismatch { expected: ArtifactHash, actual: ArtifactHash }, + + #[error("Blocking task failed")] + Join(#[from] tokio::task::JoinError), + + #[error("Artifact {sha256} not found")] + NotFound { sha256: ArtifactHash }, + + #[error("No update datasets present")] + NoUpdateDataset, +} + +impl From for HttpError { + fn from(err: Error) -> HttpError { + match err { + Error::AlreadyInProgress { .. } => HttpError::for_client_error( + None, + StatusCode::CONFLICT, + err.to_string(), + ), + Error::Body(inner) => inner, + Error::DatasetConfig(_) | Error::NoUpdateDataset => { + HttpError::for_unavail(None, err.to_string()) + } + Error::DepotCopy { .. } + | Error::File { .. } + | Error::FileRename { .. } + | Error::Join(_) => HttpError::for_internal_error(err.to_string()), + Error::HashMismatch { .. } => { + HttpError::for_bad_request(None, err.to_string()) + } + Error::NotFound { .. } => { + HttpError::for_not_found(None, err.to_string()) + } + } + } +} + +#[cfg(test)] +mod test { + use camino_tempfile::Utf8TempDir; + use futures::stream; + use hex_literal::hex; + use omicron_common::disk::{ + DatasetConfig, DatasetKind, DatasetName, DatasetsConfig, + }; + use omicron_common::update::ArtifactHash; + use omicron_common::zpool_name::ZpoolName; + use omicron_test_utils::dev::test_setup_log; + use omicron_uuid_kinds::{DatasetUuid, ZpoolUuid}; + use sled_storage::error::Error as StorageError; + use tokio::io::AsyncReadExt; + + use super::{ArtifactStore, DatasetsManager, Error}; + + struct TestBackend { + datasets: DatasetsConfig, + mountpoint_root: Utf8TempDir, + } + + impl TestBackend { + fn new(len: usize) -> TestBackend { + let mountpoint_root = camino_tempfile::tempdir().unwrap(); + + let mut datasets = DatasetsConfig::default(); + if len > 0 { + datasets.generation = datasets.generation.next(); + } + for _ in 0..len { + let dataset = DatasetConfig { + id: DatasetUuid::new_v4(), + name: DatasetName::new( + ZpoolName::new_external(ZpoolUuid::new_v4()), + DatasetKind::Update, + ), + compression: Default::default(), + quota: None, + reservation: None, + }; + let mountpoint = + dataset.name.mountpoint(mountpoint_root.path()); + std::fs::create_dir_all(mountpoint).unwrap(); + datasets.datasets.insert(dataset.id, dataset); + } + + TestBackend { datasets, mountpoint_root } + } + } + + impl DatasetsManager for TestBackend { + async fn artifact_storage_paths( + &self, + ) -> Result + '_, StorageError> + { + Ok(super::filter_dataset_mountpoints( + self.datasets.clone(), + self.mountpoint_root.path(), + )) + } + } + + const TEST_ARTIFACT: &[u8] = b"I'm an artifact!\n"; + const TEST_HASH: ArtifactHash = ArtifactHash(hex!( + "ab3581cd62f6645518f61a8e4391af6c062d5d60111edb0e51b37bd84827f5b4" + )); + + #[tokio::test] + async fn list_get_put_delete() { + let log = test_setup_log("get_put_delete"); + let backend = TestBackend::new(2); + let store = ArtifactStore::new(&log.log, backend); + + // list succeeds with an empty result + assert!(store.list().await.unwrap().is_empty()); + // get fails, because it doesn't exist yet + assert!(matches!( + store.get(TEST_HASH).await, + Err(Error::NotFound { .. }) + )); + // delete does not fail because we don't fail if the artifact is not + // present + assert!(matches!(store.delete(TEST_HASH).await, Ok(()))); + + // test several things here: + // 1. put succeeds + // 2. put is idempotent (we don't care if it clobbers a file as long as + // the hash is okay) + // 3. we don't fail trying to create TEMP_SUBDIR twice + for _ in 0..2 { + store + .writer(TEST_HASH) + .await + .unwrap() + .write_stream(stream::once(async { Ok(TEST_ARTIFACT) })) + .await + .unwrap(); + // list lists the file + assert!(store + .list() + .await + .unwrap() + .into_iter() + .eq([(TEST_HASH, 2)])); + // get succeeds, file reads back OK + let mut file = store.get(TEST_HASH).await.unwrap(); + let mut vec = Vec::new(); + file.read_to_end(&mut vec).await.unwrap(); + assert_eq!(vec, TEST_ARTIFACT); + } + + // all datasets should have the artifact + for mountpoint in store.storage.artifact_storage_paths().await.unwrap() + { + assert_eq!( + tokio::fs::read(mountpoint.join(TEST_HASH.to_string())) + .await + .unwrap(), + TEST_ARTIFACT + ); + } + + // delete succeeds and is idempotent + for _ in 0..2 { + store.delete(TEST_HASH).await.unwrap(); + // list succeeds with an empty result + assert!(store.list().await.unwrap().is_empty()); + // get now fails because it no longer exists + assert!(matches!( + store.get(TEST_HASH).await, + Err(Error::NotFound { .. }) + )); + } + + log.cleanup_successful(); + } + + #[tokio::test] + async fn no_dataset() { + // If there are no update datasets, all gets should fail with + // `Error::NotFound`, and all other operations should fail with + // `Error::NoUpdateDataset`. + + let log = test_setup_log("no_dataset"); + let backend = TestBackend::new(0); + let store = ArtifactStore::new(&log.log, backend); + + assert!(matches!( + store.writer(TEST_HASH).await, + Err(Error::NoUpdateDataset) + )); + assert!(matches!( + store.get(TEST_HASH).await, + Err(Error::NotFound { .. }) + )); + assert!(matches!(store.list().await, Err(Error::NoUpdateDataset))); + assert!(matches!( + store.delete(TEST_HASH).await, + Err(Error::NoUpdateDataset) + )); + + log.cleanup_successful(); + } + + #[tokio::test] + async fn wrong_hash() { + const ACTUAL: ArtifactHash = ArtifactHash(hex!( + "4d27a9d1ddb65e0f2350a400cf73157e42ae2ca687a4220aa0a73b9bb2d211f7" + )); + + let log = test_setup_log("wrong_hash"); + let backend = TestBackend::new(2); + let store = ArtifactStore::new(&log.log, backend); + let err = store + .writer(TEST_HASH) + .await + .unwrap() + .write_stream(stream::once(async { + Ok(b"This isn't right at all.") + })) + .await + .unwrap_err(); + match err { + Error::HashMismatch { expected, actual } => { + assert_eq!(expected, TEST_HASH); + assert_eq!(actual, ACTUAL); + } + err => panic!("wrong error: {err}"), + } + assert!(matches!( + store.get(TEST_HASH).await, + Err(Error::NotFound { .. }) + )); + + log.cleanup_successful(); + } +} diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 489fc9ab0d..cd3afc44ac 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -11,10 +11,10 @@ use bootstore::schemes::v0::NetworkConfig; use camino::Utf8PathBuf; use display_error_chain::DisplayErrorChain; use dropshot::{ - ApiDescription, Body, FreeformBody, HttpError, HttpResponseCreated, - HttpResponseDeleted, HttpResponseHeaders, HttpResponseOk, - HttpResponseUpdatedNoContent, Path, Query, RequestContext, StreamingBody, - TypedBody, + ApiDescription, Body, FreeformBody, HttpError, HttpResponseAccepted, + HttpResponseCreated, HttpResponseDeleted, HttpResponseHeaders, + HttpResponseOk, HttpResponseUpdatedNoContent, Path, Query, RequestContext, + StreamingBody, TypedBody, }; use nexus_sled_agent_shared::inventory::{ Inventory, OmicronZonesConfig, SledRole, @@ -31,6 +31,7 @@ use omicron_common::disk::{ DatasetsConfig, DatasetsManagementResult, DiskVariant, DisksManagementResult, M2Slot, OmicronPhysicalDisksConfig, }; +use omicron_common::update::ArtifactHash; use sled_agent_api::*; use sled_agent_types::boot_disk::{ BootDiskOsWriteStatus, BootDiskPathParams, BootDiskUpdatePathParams, @@ -401,6 +402,48 @@ impl SledAgentApi for SledAgentImpl { Ok(HttpResponseUpdatedNoContent()) } + async fn artifact_list( + rqctx: RequestContext, + ) -> Result>, HttpError> { + Ok(HttpResponseOk(rqctx.context().artifact_store().list().await?)) + } + + async fn artifact_copy_from_depot( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> + { + let sha256 = path_params.into_inner().sha256; + let depot_base_url = body.into_inner().depot_base_url; + rqctx + .context() + .artifact_store() + .copy_from_depot(sha256, &depot_base_url) + .await?; + Ok(HttpResponseAccepted(ArtifactCopyFromDepotResponse {})) + } + + async fn artifact_put( + rqctx: RequestContext, + path_params: Path, + body: StreamingBody, + ) -> Result, HttpError> { + let sha256 = path_params.into_inner().sha256; + Ok(HttpResponseOk( + rqctx.context().artifact_store().put_body(sha256, body).await?, + )) + } + + async fn artifact_delete( + rqctx: RequestContext, + path_params: Path, + ) -> Result { + let sha256 = path_params.into_inner().sha256; + rqctx.context().artifact_store().delete(sha256).await?; + Ok(HttpResponseDeleted()) + } + async fn vmm_issue_disk_snapshot_request( rqctx: RequestContext, path_params: Path, diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index a2421528e2..b2d78c4a5e 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -15,6 +15,7 @@ pub mod sim; pub mod common; // Modules for the non-simulated sled agent. +pub mod artifact_store; mod backing_fs; mod boot_disk_os_writer; pub mod bootstrap; diff --git a/sled-agent/src/sim/artifact_store.rs b/sled-agent/src/sim/artifact_store.rs new file mode 100644 index 0000000000..d73f5a2880 --- /dev/null +++ b/sled-agent/src/sim/artifact_store.rs @@ -0,0 +1,48 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Implementation of `crate::artifact_store::StorageBackend` for our simulated +//! storage. + +use std::sync::Arc; + +use camino_tempfile::Utf8TempDir; +use futures::lock::Mutex; +use sled_storage::error::Error as StorageError; + +use super::storage::Storage; +use crate::artifact_store::DatasetsManager; + +pub(super) struct SimArtifactStorage { + root: Utf8TempDir, + backend: Arc>, +} + +impl SimArtifactStorage { + pub(super) fn new(backend: Arc>) -> SimArtifactStorage { + SimArtifactStorage { + root: camino_tempfile::tempdir().unwrap(), + backend, + } + } +} + +impl DatasetsManager for SimArtifactStorage { + async fn artifact_storage_paths( + &self, + ) -> Result + '_, StorageError> + { + let config = self + .backend + .lock() + .await + .datasets_config_list() + .await + .map_err(|_| StorageError::LedgerNotFound)?; + Ok(crate::artifact_store::filter_dataset_mountpoints( + config, + self.root.path(), + )) + } +} diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index af9b016370..fdffb249cf 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -10,6 +10,7 @@ use dropshot::endpoint; use dropshot::ApiDescription; use dropshot::FreeformBody; use dropshot::HttpError; +use dropshot::HttpResponseAccepted; use dropshot::HttpResponseCreated; use dropshot::HttpResponseDeleted; use dropshot::HttpResponseHeaders; @@ -35,6 +36,7 @@ use omicron_common::disk::DatasetsConfig; use omicron_common::disk::DatasetsManagementResult; use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; +use omicron_common::update::ArtifactHash; use sled_agent_api::*; use sled_agent_types::boot_disk::BootDiskOsWriteStatus; use sled_agent_types::boot_disk::BootDiskPathParams; @@ -181,6 +183,48 @@ impl SledAgentApi for SledAgentSimImpl { Ok(HttpResponseUpdatedNoContent()) } + async fn artifact_list( + rqctx: RequestContext, + ) -> Result>, HttpError> { + Ok(HttpResponseOk(rqctx.context().artifact_store().list().await?)) + } + + async fn artifact_copy_from_depot( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> + { + let sha256 = path_params.into_inner().sha256; + let depot_base_url = body.into_inner().depot_base_url; + rqctx + .context() + .artifact_store() + .copy_from_depot(sha256, &depot_base_url) + .await?; + Ok(HttpResponseAccepted(ArtifactCopyFromDepotResponse {})) + } + + async fn artifact_put( + rqctx: RequestContext, + path_params: Path, + body: StreamingBody, + ) -> Result, HttpError> { + let sha256 = path_params.into_inner().sha256; + Ok(HttpResponseOk( + rqctx.context().artifact_store().put_body(sha256, body).await?, + )) + } + + async fn artifact_delete( + rqctx: RequestContext, + path_params: Path, + ) -> Result { + let sha256 = path_params.into_inner().sha256; + rqctx.context().artifact_store().delete(sha256).await?; + Ok(HttpResponseDeleted()) + } + async fn vmm_issue_disk_snapshot_request( rqctx: RequestContext, path_params: Path, diff --git a/sled-agent/src/sim/mod.rs b/sled-agent/src/sim/mod.rs index 14d980cf79..ab3b155b36 100644 --- a/sled-agent/src/sim/mod.rs +++ b/sled-agent/src/sim/mod.rs @@ -4,6 +4,7 @@ //! Simulated sled agent implementation +mod artifact_store; mod collection; mod config; mod disk; diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 4786e6839d..4403116bab 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -4,12 +4,14 @@ //! Simulated sled agent implementation +use super::artifact_store::SimArtifactStorage; use super::collection::{PokeMode, SimCollection}; use super::config::Config; use super::disk::SimDisk; use super::instance::{self, SimInstance}; use super::storage::CrucibleData; use super::storage::Storage; +use crate::artifact_store::ArtifactStore; use crate::nexus::NexusClient; use crate::sim::simulatable::Simulatable; use crate::updates::UpdateManager; @@ -72,7 +74,7 @@ pub struct SledAgent { vmms: Arc>, /// collection of simulated disks, indexed by disk uuid disks: Arc>, - storage: Mutex, + storage: Arc>, updates: UpdateManager, nexus_address: SocketAddr, pub nexus_client: Arc, @@ -88,6 +90,7 @@ pub struct SledAgent { fake_zones: Mutex, instance_ensure_state_error: Mutex>, pub bootstore_network_config: Mutex, + artifacts: ArtifactStore, pub log: Logger, } @@ -165,6 +168,14 @@ impl SledAgent { }, }); + let storage = Arc::new(Mutex::new(Storage::new( + id.into_untyped_uuid(), + config.storage.ip, + storage_log, + ))); + let artifacts = + ArtifactStore::new(&log, SimArtifactStorage::new(storage.clone())); + Arc::new(SledAgent { id, ip: config.dropshot.bind_address.ip(), @@ -178,11 +189,7 @@ impl SledAgent { disk_log, sim_mode, )), - storage: Mutex::new(Storage::new( - id.into_untyped_uuid(), - config.storage.ip, - storage_log, - )), + storage, updates: UpdateManager::new(config.updates.clone()), nexus_address, nexus_client, @@ -197,6 +204,7 @@ impl SledAgent { zones: vec![], }), instance_ensure_state_error: Mutex::new(None), + artifacts, log, bootstore_network_config, }) @@ -558,6 +566,10 @@ impl SledAgent { &self.updates } + pub(super) fn artifact_store(&self) -> &ArtifactStore { + &self.artifacts + } + pub async fn vmm_count(&self) -> usize { self.vmms.size().await } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 8a5b15aaaf..3106c3bb38 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -4,6 +4,7 @@ //! Sled agent implementation +use crate::artifact_store::ArtifactStore; use crate::boot_disk_os_writer::BootDiskOsWriter; use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; use crate::bootstrap::early_networking::EarlyNetworkSetupError; @@ -167,6 +168,9 @@ pub enum Error { #[error("Expected revision to fit in a u32, but found {0}")] UnexpectedRevision(i64), + + #[error(transparent)] + RepoDepotStart(#[from] crate::artifact_store::StartError), } impl From for omicron_common::api::external::Error { @@ -360,6 +364,9 @@ struct SledAgentInner { // Component of Sled Agent responsible for managing instrumentation probes. probes: ProbeManager, + + // Component of Sled Agent responsible for managing the artifact store. + repo_depot: dropshot::HttpServer>, } impl SledAgentInner { @@ -592,6 +599,10 @@ impl SledAgent { log.new(o!("component" => "ProbeManager")), ); + let repo_depot = ArtifactStore::new(&log, storage_manager.clone()) + .start(sled_address, &config.dropshot) + .await?; + let sled_agent = SledAgent { inner: Arc::new(SledAgentInner { id: request.body.id, @@ -614,6 +625,7 @@ impl SledAgent { bootstore: long_running_task_handles.bootstore.clone(), _metrics_manager: metrics_manager, boot_disk_os_writer: BootDiskOsWriter::new(&parent_log), + repo_depot, }), log: log.clone(), sprockets: config.sprockets.clone(), @@ -1089,6 +1101,8 @@ impl SledAgent { } /// Downloads and applies an artifact. + // TODO: This is being split into "download" (i.e. store an artifact in the + // artifact store) and "apply" (perform an update using an artifact). pub async fn update_artifact( &self, artifact: UpdateArtifactId, @@ -1100,6 +1114,10 @@ impl SledAgent { Ok(()) } + pub fn artifact_store(&self) -> &ArtifactStore { + &self.inner.repo_depot.app_private() + } + /// Issue a snapshot request for a Crucible disk attached to an instance pub async fn vmm_issue_disk_snapshot_request( &self, diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index a715a33a69..0f2b610c42 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -30,6 +30,7 @@ pub const CLUSTER_DATASET: &'static str = "cluster"; pub const CONFIG_DATASET: &'static str = "config"; pub const M2_DEBUG_DATASET: &'static str = "debug"; pub const M2_BACKING_DATASET: &'static str = "backing"; +pub const M2_ARTIFACT_DATASET: &'static str = "update"; pub const DEBUG_DATASET_QUOTA: ByteCount = if cfg!(any(test, feature = "testing")) { @@ -46,6 +47,10 @@ pub const DUMP_DATASET_QUOTA: ByteCount = ByteCount::from_gibibytes_u32(100); // passed to zfs create -o compression= pub const DUMP_DATASET_COMPRESSION: CompressionAlgorithm = CompressionAlgorithm::GzipN { level: GzipLevel::new::<9>() }; +// TODO-correctness: This value of 20 GiB is a wild guess -- given TUF repo +// sizes as of Oct 2024, it would be capable of storing about 10 distinct system +// versions. +pub const ARTIFACT_DATASET_QUOTA: ByteCount = ByteCount::from_gibibytes_u32(20); // U.2 datasets live under the encrypted dataset and inherit encryption pub const ZONE_DATASET: &'static str = "crypt/zone"; @@ -65,7 +70,7 @@ const U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [ .compression(DUMP_DATASET_COMPRESSION), ]; -const M2_EXPECTED_DATASET_COUNT: usize = 6; +const M2_EXPECTED_DATASET_COUNT: usize = 7; const M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ // Stores software images. // @@ -90,6 +95,9 @@ const M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ ExpectedDataset::new(CONFIG_DATASET), // Store debugging data, such as service bundles. ExpectedDataset::new(M2_DEBUG_DATASET).quota(DEBUG_DATASET_QUOTA), + // Stores software artifacts (zones, OS images, Hubris images, etc.) + // extracted from TUF repos by Nexus. + ExpectedDataset::new(M2_ARTIFACT_DATASET).quota(ARTIFACT_DATASET_QUOTA), ]; // Helper type for describing expected datasets and their optional quota.