From f9cdf1c68cad0d549602d5667feaeb913298cfad Mon Sep 17 00:00:00 2001 From: Will Chandler Date: Wed, 18 Dec 2024 14:44:43 -0500 Subject: [PATCH] Add task-dump endpoints to MGS This exposes the `faux-mgs dump` command, which shows the number of task dumps present, and enables downloading a dehydrated dump. Closes #7271 --- Cargo.lock | 24 +++- Cargo.toml | 4 +- gateway-api/src/lib.rs | 31 +++++ gateway-types/src/component_details.rs | 2 + gateway-types/src/lib.rs | 1 + gateway-types/src/task_dump.rs | 35 ++++++ gateway/src/http_entrypoints.rs | 61 +++++++++ gateway/src/metrics.rs | 14 +++ openapi/gateway.json | 165 +++++++++++++++++++++++++ sp-sim/src/gimlet.rs | 65 ++++++++++ sp-sim/src/sidecar.rs | 65 ++++++++++ workspace-hack/Cargo.toml | 4 +- 12 files changed, 463 insertions(+), 8 deletions(-) create mode 100644 gateway-types/src/task_dump.rs diff --git a/Cargo.lock b/Cargo.lock index 44ef4eccc5..bc51c56543 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3392,7 +3392,7 @@ dependencies = [ [[package]] name = "gateway-messages" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/management-gateway-service?rev=9bbac475dcaac88286c07a20b6bd3e94fc81d7f0#9bbac475dcaac88286c07a20b6bd3e94fc81d7f0" +source = "git+https://github.com/oxidecomputer/management-gateway-service?rev=97301243f0707708ae9e629e2b4cdea5ae3fd078#97301243f0707708ae9e629e2b4cdea5ae3fd078" dependencies = [ "bitflags 2.6.0", "hubpack", @@ -3409,7 +3409,7 @@ dependencies = [ [[package]] name = "gateway-sp-comms" version = "0.1.1" -source = "git+https://github.com/oxidecomputer/management-gateway-service?rev=9bbac475dcaac88286c07a20b6bd3e94fc81d7f0#9bbac475dcaac88286c07a20b6bd3e94fc81d7f0" +source = "git+https://github.com/oxidecomputer/management-gateway-service?rev=97301243f0707708ae9e629e2b4cdea5ae3fd078#97301243f0707708ae9e629e2b4cdea5ae3fd078" dependencies = [ "async-trait", "backoff", @@ -3420,6 +3420,7 @@ dependencies = [ "hubpack", "hubtools", "lru-cache", + "lzss", "nix 0.27.1", 
"once_cell", "paste", @@ -5363,6 +5364,15 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "lzss" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39e2b9a6124e5200c13bb5c5d1124bf93b451bff69b651912810039e36ca97eb" +dependencies = [ + "void", +] + [[package]] name = "macaddr" version = "1.0.1" @@ -11030,9 +11040,9 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" dependencies = [ "libc", "windows-sys 0.52.0", @@ -12770,6 +12780,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + [[package]] name = "vsss-rs" version = "3.3.4" diff --git a/Cargo.toml b/Cargo.toml index bb9ff0e80f..53eaa383e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -390,8 +390,8 @@ gateway-client = { path = "clients/gateway-client" } # is "fine", because SP/MGS communication maintains forwards and backwards # compatibility, but will mean that faux-mgs might be missing new # functionality.) 
-gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "9bbac475dcaac88286c07a20b6bd3e94fc81d7f0", default-features = false, features = ["std"] } -gateway-sp-comms = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "9bbac475dcaac88286c07a20b6bd3e94fc81d7f0" } +gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "97301243f0707708ae9e629e2b4cdea5ae3fd078", default-features = false, features = ["std"] } +gateway-sp-comms = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "97301243f0707708ae9e629e2b4cdea5ae3fd078" } gateway-test-utils = { path = "gateway-test-utils" } gateway-types = { path = "gateway-types" } gethostname = "0.5.0" diff --git a/gateway-api/src/lib.rs b/gateway-api/src/lib.rs index 30f262ec45..59e49f079d 100644 --- a/gateway-api/src/lib.rs +++ b/gateway-api/src/lib.rs @@ -20,6 +20,7 @@ use gateway_types::{ ignition::{IgnitionCommand, SpIgnitionInfo}, rot::{RotCfpa, RotCfpaSlot, RotCmpa, RotState}, sensor::SpSensorReading, + task_dump::TaskDump, update::{ HostPhase2Progress, HostPhase2RecoveryImageId, InstallinatorImageId, SpUpdateStatus, @@ -306,6 +307,26 @@ pub trait GatewayApi { params: TypedBody, ) -> Result, HttpError>; + /// Get the number of task dumps present on an SP + #[endpoint { + method = GET, + path = "/sp/{type}/{slot}/task-dump", + }] + async fn sp_host_task_dump_count( + rqctx: RequestContext, + path: Path, + ) -> Result, HttpError>; + + /// Read a single task dump from an SP + #[endpoint { + method = GET, + path = "/sp/{type}/{slot}/task-dump/{task_dump_index}", + }] + async fn sp_host_task_dump_get( + rqctx: RequestContext, + path: Path, + ) -> Result, HttpError>; + /// List SPs via Ignition /// /// Retreive information for all SPs via the Ignition controller. 
This is @@ -498,6 +519,16 @@ pub struct PathSpComponent { pub component: String, } +#[derive(Deserialize, JsonSchema)] +pub struct PathSpTaskDumpIndex { + /// ID for the SP that the gateway service translates into the appropriate + /// port for communicating with the given SP. + #[serde(flatten)] + pub sp: SpIdentifier, + /// The index of the task dump to be read. + pub task_dump_index: u32, +} + #[derive(Deserialize, JsonSchema)] pub struct ComponentCabooseSlot { /// The firmware slot to for which we want to request caboose information. diff --git a/gateway-types/src/component_details.rs b/gateway-types/src/component_details.rs index 2b41d4b6f7..2c0dad8c95 100644 --- a/gateway-types/src/component_details.rs +++ b/gateway-types/src/component_details.rs @@ -374,6 +374,7 @@ pub enum MeasurementKind { InputCurrent, InputVoltage, Speed, + CpuTctl, } impl From for MeasurementKind { @@ -387,6 +388,7 @@ impl From for MeasurementKind { MeasurementKind::InputCurrent => Self::InputCurrent, MeasurementKind::InputVoltage => Self::InputVoltage, MeasurementKind::Speed => Self::Speed, + MeasurementKind::CpuTctl => Self::CpuTctl, } } } diff --git a/gateway-types/src/lib.rs b/gateway-types/src/lib.rs index 61bc291510..149cd1d8f0 100644 --- a/gateway-types/src/lib.rs +++ b/gateway-types/src/lib.rs @@ -11,4 +11,5 @@ pub mod host; pub mod ignition; pub mod rot; pub mod sensor; +pub mod task_dump; pub mod update; diff --git a/gateway-types/src/task_dump.rs b/gateway-types/src/task_dump.rs new file mode 100644 index 0000000000..684a7c95bd --- /dev/null +++ b/gateway-types/src/task_dump.rs @@ -0,0 +1,35 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; + +#[derive( + Debug, + Clone, + PartialEq, + Eq, + PartialOrd, + Ord, + Deserialize, + Serialize, + JsonSchema, +)] +pub struct TaskDump { + /// Index of the crashed task. + pub task_index: u16, + /// Timestamp at which the task crash occurred. + pub timestamp: u64, + /// Hex-encoded Hubris archive ID. + pub archive_id: String, + /// `BORD` field from the caboose. + pub bord: String, + /// `GITC` field from the caboose. + pub gitc: String, + /// `VERS` field from the caboose, if present. + pub vers: Option, + /// Base64-encoded raw memory read from the SP. + pub base64_memory: BTreeMap, +} diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index 5746fbbf2e..81e6860311 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -43,6 +43,7 @@ use gateway_types::rot::RotCfpaSlot; use gateway_types::rot::RotCmpa; use gateway_types::rot::RotState; use gateway_types::sensor::SpSensorReading; +use gateway_types::task_dump::TaskDump; use gateway_types::update::HostPhase2Progress; use gateway_types::update::HostPhase2RecoveryImageId; use gateway_types::update::InstallinatorImageId; @@ -655,6 +656,66 @@ impl GatewayApi for GatewayImpl { apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } + async fn sp_host_task_dump_count( + rqctx: RequestContext, + path: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let sp_id = path.into_inner().sp.into(); + + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let ct = sp.task_dump_count().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseOk(ct)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await + } + + async fn sp_host_task_dump_get( + rqctx: RequestContext, + path: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let path = 
path.into_inner(); + let task_index = path.task_dump_index; + let sp_id = path.sp.into(); + + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let raw_dump = + sp.task_dump_read(task_index).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + let archive_id = hex::encode(raw_dump.archive_id); + let base64_memory = raw_dump + .memory + .into_iter() + .map(|(key, mem)| { + let base64_mem = + base64::engine::general_purpose::STANDARD.encode(mem); + (key, base64_mem) + }) + .collect(); + + let dump = TaskDump { + task_index: raw_dump.task_index, + timestamp: raw_dump.timestamp, + archive_id, + bord: raw_dump.bord, + gitc: raw_dump.gitc, + vers: raw_dump.vers, + base64_memory, + }; + + Ok(HttpResponseOk(dump)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await + } + async fn ignition_list( rqctx: RequestContext, ) -> Result>, HttpError> { diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index d389edf9e6..1bf70f56e9 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -804,6 +804,7 @@ impl SpPoller { MeasurementKind::InputCurrent => "input_current", MeasurementKind::InputVoltage => "input_voltage", MeasurementKind::Speed => "fan_speed", + MeasurementKind::CpuTctl => "cpu_tctl", }; let error = match error { MeasurementError::InvalidSensor => "invalid_sensor", @@ -858,6 +859,10 @@ impl SpPoller { &metric::AmdCpuTctl { sensor, datum }, ) } + (Ok(datum), MeasurementKind::CpuTctl) => Sample::new( + target, + &metric::AmdCpuTctl { sensor, datum }, + ), // Other measurements with the "temperature" measurement // kind are physical temperatures that actually exist in // reality (and are always in Celsius). 
@@ -873,6 +878,12 @@ impl SpPoller { &metric::AmdCpuTctl { sensor, datum: 0.0 }, ) } + (Err(_), MeasurementKind::CpuTctl) => { + Sample::new_missing( + target, + &metric::AmdCpuTctl { sensor, datum: 0.0 }, + ) + } (Err(_), MeasurementKind::Temperature) => { Sample::new_missing( target, @@ -1205,5 +1216,8 @@ fn comms_error_str(error: CommunicationError) -> &'static str { CommunicationError::BadTrailingDataSize { .. } => { "bad_trailing_data_size" } + CommunicationError::BadDecompressionSize { .. } => { + "bad_decompression_size" + } } } diff --git a/openapi/gateway.json b/openapi/gateway.json index b1a7adc96e..8eeecc138c 100644 --- a/openapi/gateway.json +++ b/openapi/gateway.json @@ -1510,6 +1510,108 @@ } } } + }, + "/sp/{type}/{slot}/task-dump": { + "get": { + "summary": "Get the number of task dumps present on an SP", + "operationId": "sp_host_task_dump_count", + "parameters": [ + { + "in": "path", + "name": "slot", + "required": true, + "schema": { + "type": "integer", + "format": "uint32", + "minimum": 0 + } + }, + { + "in": "path", + "name": "type", + "required": true, + "schema": { + "$ref": "#/components/schemas/SpType" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "uint32", + "type": "integer", + "format": "uint32", + "minimum": 0 + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/sp/{type}/{slot}/task-dump/{task_dump_index}": { + "get": { + "summary": "Read a single task dump from an SP", + "operationId": "sp_host_task_dump_get", + "parameters": [ + { + "in": "path", + "name": "slot", + "required": true, + "schema": { + "type": "integer", + "format": "uint32", + "minimum": 0 + } + }, + { + "in": "path", + "name": "task_dump_index", + "description": "The index of the task dump to be read.", + "required": true, + "schema": { + "type": "integer", + "format": 
"uint32", + "minimum": 0 + } + }, + { + "in": "path", + "name": "type", + "required": true, + "schema": { + "$ref": "#/components/schemas/SpType" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TaskDump" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } } }, "components": { @@ -1953,6 +2055,20 @@ "required": [ "kind" ] + }, + { + "type": "object", + "properties": { + "kind": { + "type": "string", + "enum": [ + "cpu_tctl" + ] + } + }, + "required": [ + "kind" + ] } ] }, @@ -3418,6 +3534,55 @@ } ] }, + "TaskDump": { + "type": "object", + "properties": { + "archive_id": { + "description": "Hex-encoded Hubris archive ID.", + "type": "string" + }, + "base64_memory": { + "description": "Base64-encoded raw memory read from the SP.", + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "bord": { + "description": "`BORD` field from the caboose.", + "type": "string" + }, + "gitc": { + "description": "`GITC` field from the caboose.", + "type": "string" + }, + "task_index": { + "description": "Index of the crashed task.", + "type": "integer", + "format": "uint16", + "minimum": 0 + }, + "timestamp": { + "description": "Timestamp at which the task crash occurred.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "vers": { + "nullable": true, + "description": "`VERS` field from the caboose, if present.", + "type": "string" + } + }, + "required": [ + "archive_id", + "base64_memory", + "bord", + "gitc", + "task_index", + "timestamp" + ] + }, "UpdateAbortBody": { "type": "object", "properties": { diff --git a/sp-sim/src/gimlet.rs b/sp-sim/src/gimlet.rs index de6c91b4d7..5a128de147 100644 --- a/sp-sim/src/gimlet.rs +++ b/sp-sim/src/gimlet.rs @@ -26,6 +26,10 @@ use gateway_messages::sp_impl::{BoundsChecked, DeviceDescription}; use 
gateway_messages::CfpaPage; use gateway_messages::ComponentAction; use gateway_messages::ComponentActionResponse; +use gateway_messages::DumpCompression; +use gateway_messages::DumpError; +use gateway_messages::DumpSegment; +use gateway_messages::DumpTask; use gateway_messages::Header; use gateway_messages::MgsRequest; use gateway_messages::MgsResponse; @@ -652,6 +656,7 @@ struct Handler { should_fail_to_respond_signal: Option>, no_stage0_caboose: bool, old_rot_state: bool, + inflight_dump_reads: HashMap<[u8; 16], u32>, } impl Handler { @@ -677,6 +682,7 @@ impl Handler { } let sensors = Sensors::from_component_configs(&components); + let inflight_dump_reads = HashMap::new(); Self { log, @@ -696,6 +702,7 @@ impl Handler { should_fail_to_respond_signal: None, old_rot_state, no_stage0_caboose, + inflight_dump_reads, } } @@ -1500,6 +1507,64 @@ impl SpHandler for Handler { } } } + + fn get_task_dump_count(&mut self) -> std::result::Result { + Ok(1) + } + + fn task_dump_read_start( + &mut self, + index: u32, + key: [u8; 16], + ) -> std::result::Result { + if index != 0 { + return Err(SpError::Dump(DumpError::BadIndex)); + } + + self.inflight_dump_reads.insert(key, 0); + + Ok(DumpTask { task: 0, time: 2, compression: DumpCompression::Lzss }) + } + + fn task_dump_read_continue( + &mut self, + key: [u8; 16], + seq: u32, + buf: &mut [u8], + ) -> std::result::Result, SpError> + { + const DATA: &'static [u8] = b"CAFECAFECAFECAFE"; + const MAX_SEQ: u32 = 2; + + let Some(current_seq) = self.inflight_dump_reads.get_mut(&key) else { + return Err(SpError::Dump(DumpError::BadKey)); + }; + + let expected_seq = *current_seq; + if seq != expected_seq { + return Err(SpError::Dump(DumpError::BadSequenceNumber)); + } + + // Dump read complete. 
+ if seq > MAX_SEQ { + self.inflight_dump_reads.remove(&key); + return Ok(None); + } + + if buf.len() < DATA.len() { + return Err(SpError::Dump(DumpError::SegmentTooLong)); + } + + *current_seq += 1; + buf[..DATA.len()].copy_from_slice(DATA); // buf may be longer than DATA + + Ok(Some(DumpSegment { + address: expected_seq, + compressed_length: DATA.len() as u16, + uncompressed_length: DATA.len() as u16, + seq: expected_seq, + })) + } } impl SimSpHandler for Handler { diff --git a/sp-sim/src/sidecar.rs b/sp-sim/src/sidecar.rs index 286dd50638..e526a2ac9a 100644 --- a/sp-sim/src/sidecar.rs +++ b/sp-sim/src/sidecar.rs @@ -34,6 +34,10 @@ use gateway_messages::ComponentAction; use gateway_messages::ComponentActionResponse; use gateway_messages::ComponentDetails; use gateway_messages::DiscoverResponse; +use gateway_messages::DumpCompression; +use gateway_messages::DumpError; +use gateway_messages::DumpSegment; +use gateway_messages::DumpTask; use gateway_messages::IgnitionCommand; use gateway_messages::IgnitionState; use gateway_messages::MgsError; @@ -54,6 +58,7 @@ use slog::debug; use slog::info; use slog::warn; use slog::Logger; +use std::collections::HashMap; use std::iter; use std::net::SocketAddrV6; use std::pin::Pin; @@ -400,6 +405,7 @@ struct Handler { should_fail_to_respond_signal: Option>, no_stage0_caboose: bool, old_rot_state: bool, + inflight_dump_reads: HashMap<[u8; 16], u32>, } impl Handler { @@ -440,6 +446,7 @@ impl Handler { should_fail_to_respond_signal: None, old_rot_state, no_stage0_caboose, + inflight_dump_reads: HashMap::new(), } } @@ -1210,6 +1217,64 @@ impl SpHandler for Handler { } } } + + fn get_task_dump_count(&mut self) -> std::result::Result { + Ok(1) + } + + fn task_dump_read_start( + &mut self, + index: u32, + key: [u8; 16], + ) -> std::result::Result { + if index != 0 { + return Err(SpError::Dump(DumpError::BadIndex)); + } + + self.inflight_dump_reads.insert(key, 0); + + Ok(DumpTask { task: 0, time: 2, compression: DumpCompression::Lzss }) + } + + fn task_dump_read_continue( + &mut 
self, + key: [u8; 16], + seq: u32, + buf: &mut [u8], + ) -> std::result::Result, SpError> + { + const DATA: &'static [u8] = b"CAFECAFECAFECAFE"; + const MAX_SEQ: u32 = 2; + + let Some(current_seq) = self.inflight_dump_reads.get_mut(&key) else { + return Err(SpError::Dump(DumpError::BadKey)); + }; + + let expected_seq = *current_seq; + if seq != expected_seq { + return Err(SpError::Dump(DumpError::BadSequenceNumber)); + } + + // Dump read complete. + if seq > MAX_SEQ { + self.inflight_dump_reads.remove(&key); + return Ok(None); + } + + if buf.len() < DATA.len() { + return Err(SpError::Dump(DumpError::SegmentTooLong)); + } + + *current_seq += 1; + buf[..DATA.len()].copy_from_slice(DATA); // buf may be longer than DATA + + Ok(Some(DumpSegment { + address: expected_seq, + compressed_length: DATA.len() as u16, + uncompressed_length: DATA.len() as u16, + seq: expected_seq, + })) + } } impl SimSpHandler for Handler { diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 678170b25e..1e2d77fd7f 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -58,7 +58,7 @@ futures-io = { version = "0.3.31" } futures-sink = { version = "0.3.31" } futures-task = { version = "0.3.31", default-features = false, features = ["std"] } futures-util = { version = "0.3.31", features = ["channel", "io", "sink"] } -gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "9bbac475dcaac88286c07a20b6bd3e94fc81d7f0", features = ["std"] } +gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "97301243f0707708ae9e629e2b4cdea5ae3fd078", features = ["std"] } generic-array = { version = "0.14.7", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2.15", default-features = false, features = ["js", "rdrand", "std"] } group = { version = "0.13.0", default-features = false, features = ["alloc"] } @@ -178,7 +178,7 @@ futures-io = { version = "0.3.31" } futures-sink = { version = "0.3.31" } 
futures-task = { version = "0.3.31", default-features = false, features = ["std"] } futures-util = { version = "0.3.31", features = ["channel", "io", "sink"] } -gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "9bbac475dcaac88286c07a20b6bd3e94fc81d7f0", features = ["std"] } +gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "97301243f0707708ae9e629e2b4cdea5ae3fd078", features = ["std"] } generic-array = { version = "0.14.7", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2.15", default-features = false, features = ["js", "rdrand", "std"] } group = { version = "0.13.0", default-features = false, features = ["alloc"] }