diff --git a/Cargo.lock b/Cargo.lock index 3516f72facf..9d4251ef7da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3804,6 +3804,16 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "kstat-rs" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc713c7902f757cf0c04012dbad3864ea505f2660467b704847ea7ea2ff6d67" +dependencies = [ + "libc", + "thiserror", +] + [[package]] name = "lalrpop" version = "0.19.12" @@ -5344,6 +5354,7 @@ dependencies = [ "openapiv3", "opte-ioctl", "oximeter 0.1.0", + "oximeter-instruments", "oximeter-producer 0.1.0", "percent-encoding", "pretty_assertions", @@ -5870,12 +5881,19 @@ dependencies = [ name = "oximeter-instruments" version = "0.1.0" dependencies = [ + "cfg-if 1.0.0", "chrono", "dropshot", "futures", "http", + "kstat-rs", "omicron-workspace-hack", "oximeter 0.1.0", + "rand 0.8.5", + "slog", + "slog-async", + "slog-term", + "thiserror", "tokio", "uuid", ] diff --git a/Cargo.toml b/Cargo.toml index e915a2d1132..83d63693aee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -224,6 +224,7 @@ ipcc-key-value = { path = "ipcc-key-value" } ipnetwork = { version = "0.20", features = ["schemars"] } itertools = "0.11.0" key-manager = { path = "key-manager" } +kstat-rs = "0.2.3" lazy_static = "1.4.0" libc = "0.2.150" linear-map = "1.2.0" diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 20cadd30542..a14103601bd 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -289,6 +289,44 @@ } } }, + "/metrics/collect/{producer_id}": { + "get": { + "operationId": "metrics_collect", + "parameters": [ + { + "in": "path", + "name": "producer_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_ProducerResultsItem", + "type": "array", + "items": { + "$ref": "#/components/schemas/ProducerResultsItem" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/network-bootstore-config": { "get": { "summary": "This API endpoint is only reading the local sled agent's view of the", @@ -1096,817 +1134,3023 @@ "port" ] }, - "BundleUtilization": { - "description": "The portion of a debug dataset used for zone bundles.", - "type": "object", - "properties": { - "bytes_available": { - "description": "The total number of bytes available for zone bundles.\n\nThis is `dataset_quota` multiplied by the context's storage limit.", - "type": "integer", - "format": "uint64", - "minimum": 0 - }, - "bytes_used": { - "description": "Total bundle usage, in bytes.", - "type": "integer", - "format": "uint64", - "minimum": 0 - }, - "dataset_quota": { - "description": "The total dataset quota, in bytes.", - "type": "integer", - "format": "uint64", - "minimum": 0 - } - }, - "required": [ - "bytes_available", - "bytes_used", - "dataset_quota" - ] - }, - "ByteCount": { - "description": "Byte count to express memory or storage capacity.", - "type": "integer", - "format": "uint64", - "minimum": 0 - }, - "CleanupContext": { - "description": "Context provided for the zone bundle cleanup task.", - "type": "object", - "properties": { - "period": { - "description": "The period on which automatic checks and cleanup is performed.", - "allOf": [ - { - "$ref": "#/components/schemas/CleanupPeriod" - } - ] - }, - "priority": { - "description": "The priority ordering for keeping old 
bundles.", - "allOf": [ - { - "$ref": "#/components/schemas/PriorityOrder" - } - ] - }, - "storage_limit": { - "description": "The limit on the dataset quota available for zone bundles.", - "allOf": [ - { - "$ref": "#/components/schemas/StorageLimit" - } - ] - } - }, - "required": [ - "period", - "priority", - "storage_limit" - ] - }, - "CleanupContextUpdate": { - "description": "Parameters used to update the zone bundle cleanup context.", - "type": "object", - "properties": { - "period": { - "nullable": true, - "description": "The new period on which automatic cleanups are run.", - "allOf": [ - { - "$ref": "#/components/schemas/Duration" - } - ] - }, - "priority": { - "nullable": true, - "description": "The priority ordering for preserving old zone bundles.", - "allOf": [ - { - "$ref": "#/components/schemas/PriorityOrder" - } - ] - }, - "storage_limit": { - "nullable": true, - "description": "The new limit on the underlying dataset quota allowed for bundles.", - "type": "integer", - "format": "uint8", - "minimum": 0 - } - } - }, - "CleanupCount": { - "description": "The count of bundles / bytes removed during a cleanup operation.", - "type": "object", - "properties": { - "bundles": { - "description": "The number of bundles removed.", - "type": "integer", - "format": "uint64", - "minimum": 0 - }, - "bytes": { - "description": "The number of bytes removed.", - "type": "integer", - "format": "uint64", - "minimum": 0 - } - }, - "required": [ - "bundles", - "bytes" - ] - }, - "CleanupPeriod": { - "description": "A period on which bundles are automatically cleaned up.", - "allOf": [ - { - "$ref": "#/components/schemas/Duration" - } - ] - }, - "CrucibleOpts": { - "type": "object", - "properties": { - "cert_pem": { - "nullable": true, - "type": "string" - }, - "control": { - "nullable": true, - "type": "string" - }, - "flush_timeout": { - "nullable": true, - "type": "number", - "format": "float" - }, - "id": { - "type": "string", - "format": "uuid" - }, - "key": { - "nullable": true, - "type": "string" - }, - "key_pem": { - "nullable": true, - "type": "string" - }, - "lossy": { - "type": "boolean" - }, - "read_only": { - "type": "boolean" - }, - "root_cert_pem": { - "nullable": true, - "type": "string" - }, - "target": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "required": [ - "id", - "lossy", - "read_only", - "target" - ] - }, - "DatasetKind": { - "description": "The type of a dataset, and an auxiliary information necessary to successfully launch a zone managing the associated data.", + "BinRangedouble": { + "description": "A type storing a range over `T`.\n\nThis type supports ranges similar to the `RangeTo`, `Range` and `RangeFrom` types in the standard library. 
Those cover `(..end)`, `(start..end)`, and `(start..)` respectively.", "oneOf": [ { + "description": "A range unbounded below and exclusively above, `..end`.", "type": "object", "properties": { + "end": { + "type": "number", + "format": "double" + }, "type": { "type": "string", "enum": [ - "cockroach_db" + "range_to" ] } }, "required": [ + "end", "type" ] }, { + "description": "A range bounded inclusively below and exclusively above, `start..end`.", "type": "object", "properties": { + "end": { + "type": "number", + "format": "double" + }, + "start": { + "type": "number", + "format": "double" + }, "type": { "type": "string", "enum": [ - "crucible" + "range" ] } }, "required": [ + "end", + "start", "type" ] }, { + "description": "A range bounded inclusively below and unbounded above, `start..`.", "type": "object", "properties": { + "start": { + "type": "number", + "format": "double" + }, "type": { "type": "string", "enum": [ - "clickhouse" + "range_from" ] } }, "required": [ + "start", "type" ] - }, + } + ] + }, + "BinRangefloat": { + "description": "A type storing a range over `T`.\n\nThis type supports ranges similar to the `RangeTo`, `Range` and `RangeFrom` types in the standard library. Those cover `(..end)`, `(start..end)`, and `(start..)` respectively.", + "oneOf": [ { + "description": "A range unbounded below and exclusively above, `..end`.", "type": "object", "properties": { + "end": { + "type": "number", + "format": "float" + }, "type": { "type": "string", "enum": [ - "clickhouse_keeper" + "range_to" ] } }, "required": [ + "end", "type" ] }, { + "description": "A range bounded inclusively below and exclusively above, `start..end`.", "type": "object", "properties": { + "end": { + "type": "number", + "format": "float" + }, + "start": { + "type": "number", + "format": "float" + }, "type": { "type": "string", "enum": [ - "external_dns" + "range" ] } }, "required": [ + "end", + "start", "type" ] }, { + "description": "A range bounded inclusively below and unbounded above, `start..`.", "type": "object", "properties": { + "start": { + "type": "number", + "format": "float" + }, "type": { "type": "string", "enum": [ - "internal_dns" + "range_from" ] } }, "required": [ + "start", "type" ] } ] }, - "DatasetName": { - "type": "object", - "properties": { - "kind": { - "$ref": "#/components/schemas/DatasetKind" - }, - "pool_name": { - "$ref": "#/components/schemas/ZpoolName" - } - }, - "required": [ - "kind", - "pool_name" - ] - }, - "DatasetRequest": { - "description": "Describes a request to provision a specific dataset", - "type": "object", - "properties": { + "BinRangeint16": { + "description": "A type storing a range over `T`.\n\nThis type supports ranges similar to the `RangeTo`, `Range` and `RangeFrom` types in the standard library. 
Those cover `(..end)`, `(start..end)`, and `(start..)` respectively.", + "oneOf": [ + { + "description": "A range unbounded below and exclusively above, `..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "int16" + }, + "type": { + "type": "string", + "enum": [ + "range_to" + ] + } + }, + "required": [ + "end", + "type" + ] + }, + { + "description": "A range bounded inclusively below and exclusively above, `start..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "int16" + }, + "start": { + "type": "integer", + "format": "int16" + }, + "type": { + "type": "string", + "enum": [ + "range" + ] + } + }, + "required": [ + "end", + "start", + "type" + ] + }, + { + "description": "A range bounded inclusively below and unbounded above, `start..`.", + "type": "object", + "properties": { + "start": { + "type": "integer", + "format": "int16" + }, + "type": { + "type": "string", + "enum": [ + "range_from" + ] + } + }, + "required": [ + "start", + "type" + ] + } + ] + }, + "BinRangeint32": { + "description": "A type storing a range over `T`.\n\nThis type supports ranges similar to the `RangeTo`, `Range` and `RangeFrom` types in the standard library. Those cover `(..end)`, `(start..end)`, and `(start..)` respectively.", + "oneOf": [ + { + "description": "A range unbounded below and exclusively above, `..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "int32" + }, + "type": { + "type": "string", + "enum": [ + "range_to" + ] + } + }, + "required": [ + "end", + "type" + ] + }, + { + "description": "A range bounded inclusively below and exclusively above, `start..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "int32" + }, + "start": { + "type": "integer", + "format": "int32" + }, + "type": { + "type": "string", + "enum": [ + "range" + ] + } + }, + "required": [ + "end", + "start", + "type" + ] + }, + { + "description": "A range bounded inclusively below and unbounded above, `start..`.", + "type": "object", + "properties": { + "start": { + "type": "integer", + "format": "int32" + }, + "type": { + "type": "string", + "enum": [ + "range_from" + ] + } + }, + "required": [ + "start", + "type" + ] + } + ] + }, + "BinRangeint64": { + "description": "A type storing a range over `T`.\n\nThis type supports ranges similar to the `RangeTo`, `Range` and `RangeFrom` types in the standard library. 
Those cover `(..end)`, `(start..end)`, and `(start..)` respectively.", + "oneOf": [ + { + "description": "A range unbounded below and exclusively above, `..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "int64" + }, + "type": { + "type": "string", + "enum": [ + "range_to" + ] + } + }, + "required": [ + "end", + "type" + ] + }, + { + "description": "A range bounded inclusively below and exclusively above, `start..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "int64" + }, + "start": { + "type": "integer", + "format": "int64" + }, + "type": { + "type": "string", + "enum": [ + "range" + ] + } + }, + "required": [ + "end", + "start", + "type" + ] + }, + { + "description": "A range bounded inclusively below and unbounded above, `start..`.", + "type": "object", + "properties": { + "start": { + "type": "integer", + "format": "int64" + }, + "type": { + "type": "string", + "enum": [ + "range_from" + ] + } + }, + "required": [ + "start", + "type" + ] + } + ] + }, + "BinRangeint8": { + "description": "A type storing a range over `T`.\n\nThis type supports ranges similar to the `RangeTo`, `Range` and `RangeFrom` types in the standard library. Those cover `(..end)`, `(start..end)`, and `(start..)` respectively.", + "oneOf": [ + { + "description": "A range unbounded below and exclusively above, `..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "int8" + }, + "type": { + "type": "string", + "enum": [ + "range_to" + ] + } + }, + "required": [ + "end", + "type" + ] + }, + { + "description": "A range bounded inclusively below and exclusively above, `start..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "int8" + }, + "start": { + "type": "integer", + "format": "int8" + }, + "type": { + "type": "string", + "enum": [ + "range" + ] + } + }, + "required": [ + "end", + "start", + "type" + ] + }, + { + "description": "A range bounded inclusively below and unbounded above, `start..`.", + "type": "object", + "properties": { + "start": { + "type": "integer", + "format": "int8" + }, + "type": { + "type": "string", + "enum": [ + "range_from" + ] + } + }, + "required": [ + "start", + "type" + ] + } + ] + }, + "BinRangeuint16": { + "description": "A type storing a range over `T`.\n\nThis type supports ranges similar to the `RangeTo`, `Range` and `RangeFrom` types in the standard library. 
Those cover `(..end)`, `(start..end)`, and `(start..)` respectively.", + "oneOf": [ + { + "description": "A range unbounded below and exclusively above, `..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "uint16", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range_to" + ] + } + }, + "required": [ + "end", + "type" + ] + }, + { + "description": "A range bounded inclusively below and exclusively above, `start..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "uint16", + "minimum": 0 + }, + "start": { + "type": "integer", + "format": "uint16", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range" + ] + } + }, + "required": [ + "end", + "start", + "type" + ] + }, + { + "description": "A range bounded inclusively below and unbounded above, `start..`.", + "type": "object", + "properties": { + "start": { + "type": "integer", + "format": "uint16", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range_from" + ] + } + }, + "required": [ + "start", + "type" + ] + } + ] + }, + "BinRangeuint32": { + "description": "A type storing a range over `T`.\n\nThis type supports ranges similar to the `RangeTo`, `Range` and `RangeFrom` types in the standard library. Those cover `(..end)`, `(start..end)`, and `(start..)` respectively.", + "oneOf": [ + { + "description": "A range unbounded below and exclusively above, `..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "uint32", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range_to" + ] + } + }, + "required": [ + "end", + "type" + ] + }, + { + "description": "A range bounded inclusively below and exclusively above, `start..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "uint32", + "minimum": 0 + }, + "start": { + "type": "integer", + "format": "uint32", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range" + ] + } + }, + "required": [ + "end", + "start", + "type" + ] + }, + { + "description": "A range bounded inclusively below and unbounded above, `start..`.", + "type": "object", + "properties": { + "start": { + "type": "integer", + "format": "uint32", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range_from" + ] + } + }, + "required": [ + "start", + "type" + ] + } + ] + }, + "BinRangeuint64": { + "description": "A type storing a range over `T`.\n\nThis type supports ranges similar to the `RangeTo`, `Range` and `RangeFrom` types in the standard library. 
Those cover `(..end)`, `(start..end)`, and `(start..)` respectively.", + "oneOf": [ + { + "description": "A range unbounded below and exclusively above, `..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range_to" + ] + } + }, + "required": [ + "end", + "type" + ] + }, + { + "description": "A range bounded inclusively below and exclusively above, `start..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "start": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range" + ] + } + }, + "required": [ + "end", + "start", + "type" + ] + }, + { + "description": "A range bounded inclusively below and unbounded above, `start..`.", + "type": "object", + "properties": { + "start": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range_from" + ] + } + }, + "required": [ + "start", + "type" + ] + } + ] + }, + "BinRangeuint8": { + "description": "A type storing a range over `T`.\n\nThis type supports ranges similar to the `RangeTo`, `Range` and `RangeFrom` types in the standard library. Those cover `(..end)`, `(start..end)`, and `(start..)` respectively.", + "oneOf": [ + { + "description": "A range unbounded below and exclusively above, `..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "uint8", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range_to" + ] + } + }, + "required": [ + "end", + "type" + ] + }, + { + "description": "A range bounded inclusively below and exclusively above, `start..end`.", + "type": "object", + "properties": { + "end": { + "type": "integer", + "format": "uint8", + "minimum": 0 + }, + "start": { + "type": "integer", + "format": "uint8", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range" + ] + } + }, + "required": [ + "end", + "start", + "type" + ] + }, + { + "description": "A range bounded inclusively below and unbounded above, `start..`.", + "type": "object", + "properties": { + "start": { + "type": "integer", + "format": "uint8", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "range_from" + ] + } + }, + "required": [ + "start", + "type" + ] + } + ] + }, + "Bindouble": { + "description": "Type storing bin edges and a count of samples within it.", + "type": "object", + "properties": { + "count": { + "description": "The total count of samples in this bin.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "range": { + "description": "The range of the support covered by this bin.", + "allOf": [ + { + "$ref": "#/components/schemas/BinRangedouble" + } + ] + } + }, + "required": [ + "count", + "range" + ] + }, + "Binfloat": { + "description": "Type storing bin edges and a count of samples within it.", + "type": "object", + "properties": { + "count": { + "description": "The total count of samples in this bin.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "range": { + "description": "The range of the support covered by this bin.", + "allOf": [ + { + "$ref": "#/components/schemas/BinRangefloat" + } + ] + } + }, + "required": [ + "count", + "range" + ] + }, + "Binint16": { + "description": "Type storing bin edges and a count of samples within it.", + "type": "object", + "properties": { + "count": { + "description": "The total count of 
samples in this bin.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "range": { + "description": "The range of the support covered by this bin.", + "allOf": [ + { + "$ref": "#/components/schemas/BinRangeint16" + } + ] + } + }, + "required": [ + "count", + "range" + ] + }, + "Binint32": { + "description": "Type storing bin edges and a count of samples within it.", + "type": "object", + "properties": { + "count": { + "description": "The total count of samples in this bin.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "range": { + "description": "The range of the support covered by this bin.", + "allOf": [ + { + "$ref": "#/components/schemas/BinRangeint32" + } + ] + } + }, + "required": [ + "count", + "range" + ] + }, + "Binint64": { + "description": "Type storing bin edges and a count of samples within it.", + "type": "object", + "properties": { + "count": { + "description": "The total count of samples in this bin.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "range": { + "description": "The range of the support covered by this bin.", + "allOf": [ + { + "$ref": "#/components/schemas/BinRangeint64" + } + ] + } + }, + "required": [ + "count", + "range" + ] + }, + "Binint8": { + "description": "Type storing bin edges and a count of samples within it.", + "type": "object", + "properties": { + "count": { + "description": "The total count of samples in this bin.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "range": { + "description": "The range of the support covered by this bin.", + "allOf": [ + { + "$ref": "#/components/schemas/BinRangeint8" + } + ] + } + }, + "required": [ + "count", + "range" + ] + }, + "Binuint16": { + "description": "Type storing bin edges and a count of samples within it.", + "type": "object", + "properties": { + "count": { + "description": "The total count of samples in this bin.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "range": { + "description": "The range of the support covered by this bin.", + "allOf": [ + { + "$ref": "#/components/schemas/BinRangeuint16" + } + ] + } + }, + "required": [ + "count", + "range" + ] + }, + "Binuint32": { + "description": "Type storing bin edges and a count of samples within it.", + "type": "object", + "properties": { + "count": { + "description": "The total count of samples in this bin.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "range": { + "description": "The range of the support covered by this bin.", + "allOf": [ + { + "$ref": "#/components/schemas/BinRangeuint32" + } + ] + } + }, + "required": [ + "count", + "range" + ] + }, + "Binuint64": { + "description": "Type storing bin edges and a count of samples within it.", + "type": "object", + "properties": { + "count": { + "description": "The total count of samples in this bin.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "range": { + "description": "The range of the support covered by this bin.", + "allOf": [ + { + "$ref": "#/components/schemas/BinRangeuint64" + } + ] + } + }, + "required": [ + "count", + "range" + ] + }, + "Binuint8": { + "description": "Type storing bin edges and a count of samples within it.", + "type": "object", + "properties": { + "count": { + "description": "The total count of samples in this bin.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "range": { + "description": "The range of the support covered by this bin.", + "allOf": [ + { + "$ref": "#/components/schemas/BinRangeuint8" + } + ] + 
} + }, + "required": [ + "count", + "range" + ] + }, + "BundleUtilization": { + "description": "The portion of a debug dataset used for zone bundles.", + "type": "object", + "properties": { + "bytes_available": { + "description": "The total number of bytes available for zone bundles.\n\nThis is `dataset_quota` multiplied by the context's storage limit.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "bytes_used": { + "description": "Total bundle usage, in bytes.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "dataset_quota": { + "description": "The total dataset quota, in bytes.", + "type": "integer", + "format": "uint64", + "minimum": 0 + } + }, + "required": [ + "bytes_available", + "bytes_used", + "dataset_quota" + ] + }, + "ByteCount": { + "description": "Byte count to express memory or storage capacity.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "CleanupContext": { + "description": "Context provided for the zone bundle cleanup task.", + "type": "object", + "properties": { + "period": { + "description": "The period on which automatic checks and cleanup is performed.", + "allOf": [ + { + "$ref": "#/components/schemas/CleanupPeriod" + } + ] + }, + "priority": { + "description": "The priority ordering for keeping old bundles.", + "allOf": [ + { + "$ref": "#/components/schemas/PriorityOrder" + } + ] + }, + "storage_limit": { + "description": "The limit on the dataset quota available for zone bundles.", + "allOf": [ + { + "$ref": "#/components/schemas/StorageLimit" + } + ] + } + }, + "required": [ + "period", + "priority", + "storage_limit" + ] + }, + "CleanupContextUpdate": { + "description": "Parameters used to update the zone bundle cleanup context.", + "type": "object", + "properties": { + "period": { + "nullable": true, + "description": "The new period on which automatic cleanups are run.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "priority": { + "nullable": true, + "description": "The priority ordering for preserving old zone bundles.", + "allOf": [ + { + "$ref": "#/components/schemas/PriorityOrder" + } + ] + }, + "storage_limit": { + "nullable": true, + "description": "The new limit on the underlying dataset quota allowed for bundles.", + "type": "integer", + "format": "uint8", + "minimum": 0 + } + } + }, + "CleanupCount": { + "description": "The count of bundles / bytes removed during a cleanup operation.", + "type": "object", + "properties": { + "bundles": { + "description": "The number of bundles removed.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "bytes": { + "description": "The number of bytes removed.", + "type": "integer", + "format": "uint64", + "minimum": 0 + } + }, + "required": [ + "bundles", + "bytes" + ] + }, + "CleanupPeriod": { + "description": "A period on which bundles are automatically cleaned up.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "CrucibleOpts": { + "type": "object", + "properties": { + "cert_pem": { + "nullable": true, + "type": "string" + }, + "control": { + "nullable": true, + "type": "string" + }, + "flush_timeout": { + "nullable": true, + "type": "number", + "format": "float" + }, + "id": { + "type": "string", + "format": "uuid" + }, + "key": { + "nullable": true, + "type": "string" + }, + "key_pem": { + "nullable": true, + "type": "string" + }, + "lossy": { + "type": "boolean" + }, + "read_only": { + "type": "boolean" + }, + "root_cert_pem": { + "nullable": true, + "type": "string" + }, + "target": { 
+ "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "id", + "lossy", + "read_only", + "target" + ] + }, + "Cumulativedouble": { + "description": "A cumulative or counter data type.", + "type": "object", + "properties": { + "start_time": { + "type": "string", + "format": "date-time" + }, + "value": { + "type": "number", + "format": "double" + } + }, + "required": [ + "start_time", + "value" + ] + }, + "Cumulativefloat": { + "description": "A cumulative or counter data type.", + "type": "object", + "properties": { + "start_time": { + "type": "string", + "format": "date-time" + }, + "value": { + "type": "number", + "format": "float" + } + }, + "required": [ + "start_time", + "value" + ] + }, + "Cumulativeint64": { + "description": "A cumulative or counter data type.", + "type": "object", + "properties": { + "start_time": { + "type": "string", + "format": "date-time" + }, + "value": { + "type": "integer", + "format": "int64" + } + }, + "required": [ + "start_time", + "value" + ] + }, + "Cumulativeuint64": { + "description": "A cumulative or counter data type.", + "type": "object", + "properties": { + "start_time": { + "type": "string", + "format": "date-time" + }, + "value": { + "type": "integer", + "format": "uint64", + "minimum": 0 + } + }, + "required": [ + "start_time", + "value" + ] + }, + "DatasetKind": { + "description": "The type of a dataset, and an auxiliary information necessary to successfully launch a zone managing the associated data.", + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "cockroach_db" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "crucible" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "clickhouse" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "clickhouse_keeper" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "external_dns" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "internal_dns" + ] + } + }, + "required": [ + "type" + ] + } + ] + }, + "DatasetName": { + "type": "object", + "properties": { + "kind": { + "$ref": "#/components/schemas/DatasetKind" + }, + "pool_name": { + "$ref": "#/components/schemas/ZpoolName" + } + }, + "required": [ + "kind", + "pool_name" + ] + }, + "DatasetRequest": { + "description": "Describes a request to provision a specific dataset", + "type": "object", + "properties": { "id": { "type": "string", - "format": "uuid" + "format": "uuid" + }, + "name": { + "$ref": "#/components/schemas/DatasetName" + }, + "service_address": { + "type": "string" + } + }, + "required": [ + "id", + "name", + "service_address" + ] + }, + "Datum": { + "description": "A `Datum` is a single sampled data point from a metric.", + "oneOf": [ + { + "type": "object", + "properties": { + "datum": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "bool" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "type": "integer", + "format": "int8" + }, + "type": { + "type": "string", + "enum": [ + "i8" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": 
"object", + "properties": { + "datum": { + "type": "integer", + "format": "uint8", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "u8" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "type": "integer", + "format": "int16" + }, + "type": { + "type": "string", + "enum": [ + "i16" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "type": "integer", + "format": "uint16", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "u16" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "type": "integer", + "format": "int32" + }, + "type": { + "type": "string", + "enum": [ + "i32" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "type": "integer", + "format": "uint32", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "u32" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "type": "integer", + "format": "int64" + }, + "type": { + "type": "string", + "enum": [ + "i64" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "type": { + "type": "string", + "enum": [ + "u64" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "type": "number", + "format": "float" + }, + "type": { + "type": "string", + "enum": [ + "f32" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "type": "number", + "format": "double" + }, + "type": { + "type": "string", + "enum": [ + "f64" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "string" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "type": "array", + "items": { + "type": "integer", + "format": "uint8", + "minimum": 0 + } + }, + "type": { + "type": "string", + "enum": [ + "bytes" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Cumulativeint64" + }, + "type": { + "type": "string", + "enum": [ + "cumulative_i64" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Cumulativeuint64" + }, + "type": { + "type": "string", + "enum": [ + "cumulative_u64" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Cumulativefloat" + }, + "type": { + "type": "string", + "enum": [ + "cumulative_f32" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Cumulativedouble" + }, + "type": { + "type": "string", + "enum": [ + "cumulative_f64" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Histogramint8" + }, + "type": { + "type": "string", + "enum": [ + "histogram_i8" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, 
+ { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Histogramuint8" + }, + "type": { + "type": "string", + "enum": [ + "histogram_u8" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Histogramint16" + }, + "type": { + "type": "string", + "enum": [ + "histogram_i16" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Histogramuint16" + }, + "type": { + "type": "string", + "enum": [ + "histogram_u16" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Histogramint32" + }, + "type": { + "type": "string", + "enum": [ + "histogram_i32" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Histogramuint32" + }, + "type": { + "type": "string", + "enum": [ + "histogram_u32" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Histogramint64" + }, + "type": { + "type": "string", + "enum": [ + "histogram_i64" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Histogramuint64" + }, + "type": { + "type": "string", + "enum": [ + "histogram_u64" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Histogramfloat" + }, + "type": { + "type": "string", + "enum": [ + "histogram_f32" + ] + } + }, + "required": [ + "datum", + "type" + ] + }, + { + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Histogramdouble" + }, + "type": { + "type": "string", + "enum": [ + "histogram_f64" + ] + } + }, + "required": [ + "datum", + "type" + ] + } + ] + }, + "DeleteVirtualNetworkInterfaceHost": { + "description": "The data needed to identify a virtual IP for which a sled maintains an OPTE virtual-to-physical mapping such that that mapping can be deleted.", + "type": "object", + "properties": { + "virtual_ip": { + "description": "The virtual IP whose mapping should be deleted.", + "type": "string", + "format": "ip" + }, + "vni": { + "description": "The VNI for the network containing the virtual IP whose mapping should be deleted.", + "allOf": [ + { + "$ref": "#/components/schemas/Vni" + } + ] + } + }, + "required": [ + "virtual_ip", + "vni" + ] + }, + "DhcpConfig": { + "description": "DHCP configuration for a port\n\nNot present here: Hostname (DHCPv4 option 12; used in DHCPv6 option 39); we use `InstanceRuntimeState::hostname` for this value.", + "type": "object", + "properties": { + "dns_servers": { + "description": "DNS servers to send to the instance\n\n(DHCPv4 option 6; DHCPv6 option 23)", + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "host_domain": { + "nullable": true, + "description": "DNS zone this instance's hostname belongs to (e.g. 
the `project.example` part of `instance1.project.example`)\n\n(DHCPv4 option 15; used in DHCPv6 option 39)", + "type": "string" + }, + "search_domains": { + "description": "DNS search domains\n\n(DHCPv4 option 119; DHCPv6 option 24)", + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "dns_servers", + "search_domains" + ] + }, + "DiskEnsureBody": { + "description": "Sent from to a sled agent to establish the runtime state of a Disk", + "type": "object", + "properties": { + "initial_runtime": { + "description": "Last runtime state of the Disk known to Nexus (used if the agent has never seen this Disk before).", + "allOf": [ + { + "$ref": "#/components/schemas/DiskRuntimeState" + } + ] + }, + "target": { + "description": "requested runtime state of the Disk", + "allOf": [ + { + "$ref": "#/components/schemas/DiskStateRequested" + } + ] + } + }, + "required": [ + "initial_runtime", + "target" + ] + }, + "DiskRequest": { + "type": "object", + "properties": { + "device": { + "type": "string" + }, + "name": { + "type": "string" + }, + "read_only": { + "type": "boolean" + }, + "slot": { + "$ref": "#/components/schemas/Slot" + }, + "volume_construction_request": { + "$ref": "#/components/schemas/VolumeConstructionRequest" + } + }, + "required": [ + "device", + "name", + "read_only", + "slot", + "volume_construction_request" + ] + }, + "DiskRuntimeState": { + "description": "Runtime state of the Disk, which includes its attach state and some minimal metadata", + "type": "object", + "properties": { + "disk_state": { + "description": "runtime state of the Disk", + "allOf": [ + { + "$ref": "#/components/schemas/DiskState" + } + ] + }, + "gen": { + "description": "generation number for this state", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, + "time_updated": { + "description": "timestamp for this information", + "type": "string", + "format": "date-time" + } + }, + "required": [ + "disk_state", + "gen", + "time_updated" + ] + }, + "DiskState": { + "description": "State of a Disk", + "oneOf": [ + { + "description": "Disk is being initialized", + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "creating" + ] + } + }, + "required": [ + "state" + ] }, - "name": { - "$ref": "#/components/schemas/DatasetName" + { + "description": "Disk is ready but detached from any Instance", + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "detached" + ] + } + }, + "required": [ + "state" + ] + }, + { + "description": "Disk is ready to receive blocks from an external source", + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "import_ready" + ] + } + }, + "required": [ + "state" + ] + }, + { + "description": "Disk is importing blocks from a URL", + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "importing_from_url" + ] + } + }, + "required": [ + "state" + ] + }, + { + "description": "Disk is importing blocks from bulk writes", + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "importing_from_bulk_writes" + ] + } + }, + "required": [ + "state" + ] + }, + { + "description": "Disk is being finalized to state Detached", + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "finalizing" + ] + } + }, + "required": [ + "state" + ] + }, + { + "description": "Disk is undergoing maintenance", + "type": "object", + "properties": { + "state": { + "type": "string", + 
"enum": [ + "maintenance" + ] + } + }, + "required": [ + "state" + ] + }, + { + "description": "Disk is being attached to the given Instance", + "type": "object", + "properties": { + "instance": { + "type": "string", + "format": "uuid" + }, + "state": { + "type": "string", + "enum": [ + "attaching" + ] + } + }, + "required": [ + "instance", + "state" + ] + }, + { + "description": "Disk is attached to the given Instance", + "type": "object", + "properties": { + "instance": { + "type": "string", + "format": "uuid" + }, + "state": { + "type": "string", + "enum": [ + "attached" + ] + } + }, + "required": [ + "instance", + "state" + ] + }, + { + "description": "Disk is being detached from the given Instance", + "type": "object", + "properties": { + "instance": { + "type": "string", + "format": "uuid" + }, + "state": { + "type": "string", + "enum": [ + "detaching" + ] + } + }, + "required": [ + "instance", + "state" + ] + }, + { + "description": "Disk has been destroyed", + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "destroyed" + ] + } + }, + "required": [ + "state" + ] + }, + { + "description": "Disk is unavailable", + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "faulted" + ] + } + }, + "required": [ + "state" + ] + } + ] + }, + "DiskStateRequested": { + "description": "Used to request a Disk state change", + "oneOf": [ + { + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "detached" + ] + } + }, + "required": [ + "state" + ] + }, + { + "type": "object", + "properties": { + "instance": { + "type": "string", + "format": "uuid" + }, + "state": { + "type": "string", + "enum": [ + "attached" + ] + } + }, + "required": [ + "instance", + "state" + ] + }, + { + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "destroyed" + ] + } + }, + "required": [ + "state" + ] + }, + { + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "faulted" + ] + } + }, + "required": [ + "state" + ] + } + ] + }, + "DiskType": { + "type": "string", + "enum": [ + "U2", + "M2" + ] + }, + "Duration": { + "type": "object", + "properties": { + "nanos": { + "type": "integer", + "format": "uint32", + "minimum": 0 }, - "service_address": { - "type": "string" + "secs": { + "type": "integer", + "format": "uint64", + "minimum": 0 } }, "required": [ - "id", - "name", - "service_address" + "nanos", + "secs" ] }, - "DeleteVirtualNetworkInterfaceHost": { - "description": "The data needed to identify a virtual IP for which a sled maintains an OPTE virtual-to-physical mapping such that that mapping can be deleted.", + "EarlyNetworkConfig": { + "description": "Network configuration required to bring up the control plane\n\nThe fields in this structure are those from [`super::params::RackInitializeRequest`] necessary for use beyond RSS. This is just for the initial rack configuration and cold boot purposes. 
Updates come from Nexus.", "type": "object", "properties": { - "virtual_ip": { - "description": "The virtual IP whose mapping should be deleted.", - "type": "string", - "format": "ip" + "body": { + "$ref": "#/components/schemas/EarlyNetworkConfigBody" }, - "vni": { - "description": "The VNI for the network containing the virtual IP whose mapping should be deleted.", - "allOf": [ - { - "$ref": "#/components/schemas/Vni" - } - ] + "generation": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "schema_version": { + "type": "integer", + "format": "uint32", + "minimum": 0 } }, "required": [ - "virtual_ip", - "vni" + "body", + "generation", + "schema_version" ] }, - "DhcpConfig": { - "description": "DHCP configuration for a port\n\nNot present here: Hostname (DHCPv4 option 12; used in DHCPv6 option 39); we use `InstanceRuntimeState::hostname` for this value.", + "EarlyNetworkConfigBody": { + "description": "This is the actual configuration of EarlyNetworking.\n\nWe nest it below the \"header\" of `generation` and `schema_version` so that we can perform partial deserialization of `EarlyNetworkConfig` to only read the header and defer deserialization of the body once we know the schema version. This is possible via the use of [`serde_json::value::RawValue`] in future (post-v1) deserialization paths.", "type": "object", "properties": { - "dns_servers": { - "description": "DNS servers to send to the instance\n\n(DHCPv4 option 6; DHCPv6 option 23)", + "ntp_servers": { + "description": "The external NTP server addresses.", "type": "array", "items": { - "type": "string", - "format": "ip" + "type": "string" } }, - "host_domain": { + "rack_network_config": { "nullable": true, - "description": "DNS zone this instance's hostname belongs to (e.g. the `project.example` part of `instance1.project.example`)\n\n(DHCPv4 option 15; used in DHCPv6 option 39)", - "type": "string" - }, - "search_domains": { - "description": "DNS search domains\n\n(DHCPv4 option 119; DHCPv6 option 24)", - "type": "array", - "items": { - "type": "string" - } + "allOf": [ + { + "$ref": "#/components/schemas/RackNetworkConfigV1" + } + ] } }, "required": [ - "dns_servers", - "search_domains" + "ntp_servers" ] }, - "DiskEnsureBody": { - "description": "Sent from to a sled agent to establish the runtime state of a Disk", + "Error": { + "description": "Error information from a response.", "type": "object", "properties": { - "initial_runtime": { - "description": "Last runtime state of the Disk known to Nexus (used if the agent has never seen this Disk before).", - "allOf": [ - { - "$ref": "#/components/schemas/DiskRuntimeState" - } - ] + "error_code": { + "type": "string" }, - "target": { - "description": "requested runtime state of the Disk", - "allOf": [ - { - "$ref": "#/components/schemas/DiskStateRequested" - } - ] + "message": { + "type": "string" + }, + "request_id": { + "type": "string" } }, "required": [ - "initial_runtime", - "target" + "message", + "request_id" ] }, - "DiskRequest": { + "Field": { + "description": "A `Field` is a named aspect of a target or metric.", "type": "object", "properties": { - "device": { - "type": "string" - }, "name": { "type": "string" }, - "read_only": { - "type": "boolean" - }, - "slot": { - "$ref": "#/components/schemas/Slot" - }, - "volume_construction_request": { - "$ref": "#/components/schemas/VolumeConstructionRequest" + "value": { + "$ref": "#/components/schemas/FieldValue" } }, "required": [ - "device", "name", - "read_only", - "slot", - "volume_construction_request" + "value" 
] }, - "DiskRuntimeState": { - "description": "Runtime state of the Disk, which includes its attach state and some minimal metadata", + "FieldSet": { "type": "object", "properties": { - "disk_state": { - "description": "runtime state of the Disk", - "allOf": [ - { - "$ref": "#/components/schemas/DiskState" - } - ] - }, - "gen": { - "description": "generation number for this state", - "allOf": [ - { - "$ref": "#/components/schemas/Generation" - } - ] + "fields": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/Field" + } }, - "time_updated": { - "description": "timestamp for this information", - "type": "string", - "format": "date-time" + "name": { + "type": "string" } }, "required": [ - "disk_state", - "gen", - "time_updated" + "fields", + "name" ] }, - "DiskState": { - "description": "State of a Disk", + "FieldValue": { + "description": "The `FieldValue` contains the value of a target or metric field.", "oneOf": [ { - "description": "Disk is being initialized", "type": "object", "properties": { - "state": { + "type": { "type": "string", "enum": [ - "creating" + "string" ] + }, + "value": { + "type": "string" } }, "required": [ - "state" + "type", + "value" ] }, { - "description": "Disk is ready but detached from any Instance", "type": "object", "properties": { - "state": { + "type": { "type": "string", "enum": [ - "detached" + "i8" ] + }, + "value": { + "type": "integer", + "format": "int8" } }, "required": [ - "state" + "type", + "value" ] }, { - "description": "Disk is ready to receive blocks from an external source", "type": "object", "properties": { - "state": { + "type": { "type": "string", "enum": [ - "import_ready" + "u8" ] + }, + "value": { + "type": "integer", + "format": "uint8", + "minimum": 0 } }, "required": [ - "state" + "type", + "value" ] }, { - "description": "Disk is importing blocks from a URL", "type": "object", "properties": { - "state": { + "type": { "type": "string", "enum": [ - "importing_from_url" + "i16" + ] + }, + "value": { + "type": "integer", + "format": "int16" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "u16" ] + }, + "value": { + "type": "integer", + "format": "uint16", + "minimum": 0 } }, "required": [ - "state" + "type", + "value" ] }, { - "description": "Disk is importing blocks from bulk writes", "type": "object", "properties": { - "state": { + "type": { "type": "string", "enum": [ - "importing_from_bulk_writes" + "i32" ] + }, + "value": { + "type": "integer", + "format": "int32" } }, "required": [ - "state" + "type", + "value" ] }, { - "description": "Disk is being finalized to state Detached", "type": "object", "properties": { - "state": { + "type": { "type": "string", "enum": [ - "finalizing" + "u32" ] + }, + "value": { + "type": "integer", + "format": "uint32", + "minimum": 0 } }, "required": [ - "state" + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "i64" + ] + }, + "value": { + "type": "integer", + "format": "int64" + } + }, + "required": [ + "type", + "value" ] }, { - "description": "Disk is undergoing maintenance", "type": "object", "properties": { - "state": { + "type": { "type": "string", "enum": [ - "maintenance" + "u64" ] + }, + "value": { + "type": "integer", + "format": "uint64", + "minimum": 0 } }, "required": [ - "state" + "type", + "value" ] }, { - "description": "Disk is being attached to the given Instance", "type": "object", 
"properties": { - "instance": { - "type": "string", - "format": "uuid" - }, - "state": { + "type": { "type": "string", "enum": [ - "attaching" + "ip_addr" ] + }, + "value": { + "type": "string", + "format": "ip" } }, "required": [ - "instance", - "state" + "type", + "value" ] }, { - "description": "Disk is attached to the given Instance", "type": "object", "properties": { - "instance": { - "type": "string", - "format": "uuid" - }, - "state": { + "type": { "type": "string", "enum": [ - "attached" + "uuid" ] + }, + "value": { + "type": "string", + "format": "uuid" } }, "required": [ - "instance", - "state" + "type", + "value" ] }, { - "description": "Disk is being detached from the given Instance", "type": "object", "properties": { - "instance": { - "type": "string", - "format": "uuid" - }, - "state": { + "type": { "type": "string", "enum": [ - "detaching" + "bool" ] + }, + "value": { + "type": "boolean" } }, "required": [ - "instance", - "state" + "type", + "value" ] - }, + } + ] + }, + "Generation": { + "description": "Generation numbers stored in the database, used for optimistic concurrency control", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "HistogramError": { + "description": "Errors related to constructing histograms or adding samples into them.", + "oneOf": [ { - "description": "Disk has been destroyed", + "description": "An attempt to construct a histogram with an empty set of bins.", "type": "object", "properties": { - "state": { + "type": { "type": "string", "enum": [ - "destroyed" + "empty_bins" ] } }, "required": [ - "state" + "type" ] }, { - "description": "Disk is unavailable", + "description": "An attempt to construct a histogram with non-monotonic bins.", "type": "object", "properties": { - "state": { + "type": { "type": "string", "enum": [ - "faulted" + "nonmonotonic_bins" ] } }, "required": [ - "state" + "type" ] - } - ] - }, - "DiskStateRequested": { - "description": "Used to request a Disk state change", - "oneOf": [ + }, { + "description": "A non-finite was encountered, either as a bin edge or a sample.", "type": "object", "properties": { - "state": { + "content": { + "type": "string" + }, + "type": { "type": "string", "enum": [ - "detached" + "non_finite_value" ] } }, "required": [ - "state" + "content", + "type" ] }, { + "description": "Error returned when two neighboring bins are not adjoining (there's space between them)", "type": "object", "properties": { - "instance": { - "type": "string", - "format": "uuid" + "content": { + "type": "object", + "properties": { + "left": { + "type": "string" + }, + "right": { + "type": "string" + } + }, + "required": [ + "left", + "right" + ] }, - "state": { + "type": { "type": "string", "enum": [ - "attached" + "non_adjoining_bins" ] } }, "required": [ - "instance", - "state" + "content", + "type" ] }, { + "description": "Bin and count arrays are of different sizes.", "type": "object", "properties": { - "state": { + "content": { + "type": "object", + "properties": { + "n_bins": { + "type": "integer", + "format": "uint", + "minimum": 0 + }, + "n_counts": { + "type": "integer", + "format": "uint", + "minimum": 0 + } + }, + "required": [ + "n_bins", + "n_counts" + ] + }, + "type": { "type": "string", "enum": [ - "destroyed" + "array_size_mismatch" ] } }, "required": [ - "state" + "content", + "type" ] }, { "type": "object", "properties": { - "state": { + "content": { + "$ref": "#/components/schemas/QuantizationError" + }, + "type": { "type": "string", "enum": [ - "faulted" + "quantization" ] } }, "required": [ 
- "state" + "content", + "type" ] } ] }, - "DiskType": { - "type": "string", - "enum": [ - "U2", - "M2" + "Histogramdouble": { + "description": "Histogram metric\n\nA histogram maintains the count of any number of samples, over a set of bins. Bins are specified on construction via their _left_ edges, inclusive. There can't be any \"gaps\" in the bins, and an additional bin may be added to the left, right, or both so that the bins extend to the entire range of the support.\n\nNote that any gaps, unsorted bins, or non-finite values will result in an error.", + "type": "object", + "properties": { + "bins": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Bindouble" + } + }, + "n_samples": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "start_time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "bins", + "n_samples", + "start_time" ] }, - "Duration": { + "Histogramfloat": { + "description": "Histogram metric\n\nA histogram maintains the count of any number of samples, over a set of bins. Bins are specified on construction via their _left_ edges, inclusive. There can't be any \"gaps\" in the bins, and an additional bin may be added to the left, right, or both so that the bins extend to the entire range of the support.\n\nNote that any gaps, unsorted bins, or non-finite values will result in an error.", "type": "object", "properties": { - "nanos": { + "bins": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Binfloat" + } + }, + "n_samples": { "type": "integer", - "format": "uint32", + "format": "uint64", "minimum": 0 }, - "secs": { + "start_time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "bins", + "n_samples", + "start_time" + ] + }, + "Histogramint16": { + "description": "Histogram metric\n\nA histogram maintains the count of any number of samples, over a set of bins. Bins are specified on construction via their _left_ edges, inclusive. There can't be any \"gaps\" in the bins, and an additional bin may be added to the left, right, or both so that the bins extend to the entire range of the support.\n\nNote that any gaps, unsorted bins, or non-finite values will result in an error.", + "type": "object", + "properties": { + "bins": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Binint16" + } + }, + "n_samples": { "type": "integer", "format": "uint64", "minimum": 0 + }, + "start_time": { + "type": "string", + "format": "date-time" } }, "required": [ - "nanos", - "secs" + "bins", + "n_samples", + "start_time" ] }, - "EarlyNetworkConfig": { - "description": "Network configuration required to bring up the control plane\n\nThe fields in this structure are those from [`super::params::RackInitializeRequest`] necessary for use beyond RSS. This is just for the initial rack configuration and cold boot purposes. Updates come from Nexus.", + "Histogramint32": { + "description": "Histogram metric\n\nA histogram maintains the count of any number of samples, over a set of bins. Bins are specified on construction via their _left_ edges, inclusive. 
There can't be any \"gaps\" in the bins, and an additional bin may be added to the left, right, or both so that the bins extend to the entire range of the support.\n\nNote that any gaps, unsorted bins, or non-finite values will result in an error.", "type": "object", "properties": { - "body": { - "$ref": "#/components/schemas/EarlyNetworkConfigBody" + "bins": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Binint32" + } }, - "generation": { + "n_samples": { "type": "integer", "format": "uint64", "minimum": 0 }, - "schema_version": { + "start_time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "bins", + "n_samples", + "start_time" + ] + }, + "Histogramint64": { + "description": "Histogram metric\n\nA histogram maintains the count of any number of samples, over a set of bins. Bins are specified on construction via their _left_ edges, inclusive. There can't be any \"gaps\" in the bins, and an additional bin may be added to the left, right, or both so that the bins extend to the entire range of the support.\n\nNote that any gaps, unsorted bins, or non-finite values will result in an error.", + "type": "object", + "properties": { + "bins": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Binint64" + } + }, + "n_samples": { "type": "integer", - "format": "uint32", + "format": "uint64", "minimum": 0 + }, + "start_time": { + "type": "string", + "format": "date-time" } }, "required": [ - "body", - "generation", - "schema_version" + "bins", + "n_samples", + "start_time" ] }, - "EarlyNetworkConfigBody": { - "description": "This is the actual configuration of EarlyNetworking.\n\nWe nest it below the \"header\" of `generation` and `schema_version` so that we can perform partial deserialization of `EarlyNetworkConfig` to only read the header and defer deserialization of the body once we know the schema version. This is possible via the use of [`serde_json::value::RawValue`] in future (post-v1) deserialization paths.", + "Histogramint8": { + "description": "Histogram metric\n\nA histogram maintains the count of any number of samples, over a set of bins. Bins are specified on construction via their _left_ edges, inclusive. There can't be any \"gaps\" in the bins, and an additional bin may be added to the left, right, or both so that the bins extend to the entire range of the support.\n\nNote that any gaps, unsorted bins, or non-finite values will result in an error.", "type": "object", "properties": { - "ntp_servers": { - "description": "The external NTP server addresses.", + "bins": { "type": "array", "items": { - "type": "string" + "$ref": "#/components/schemas/Binint8" + } + }, + "n_samples": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "start_time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "bins", + "n_samples", + "start_time" + ] + }, + "Histogramuint16": { + "description": "Histogram metric\n\nA histogram maintains the count of any number of samples, over a set of bins. Bins are specified on construction via their _left_ edges, inclusive. 
There can't be any \"gaps\" in the bins, and an additional bin may be added to the left, right, or both so that the bins extend to the entire range of the support.\n\nNote that any gaps, unsorted bins, or non-finite values will result in an error.", + "type": "object", + "properties": { + "bins": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Binuint16" + } + }, + "n_samples": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "start_time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "bins", + "n_samples", + "start_time" + ] + }, + "Histogramuint32": { + "description": "Histogram metric\n\nA histogram maintains the count of any number of samples, over a set of bins. Bins are specified on construction via their _left_ edges, inclusive. There can't be any \"gaps\" in the bins, and an additional bin may be added to the left, right, or both so that the bins extend to the entire range of the support.\n\nNote that any gaps, unsorted bins, or non-finite values will result in an error.", + "type": "object", + "properties": { + "bins": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Binuint32" + } + }, + "n_samples": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "start_time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "bins", + "n_samples", + "start_time" + ] + }, + "Histogramuint64": { + "description": "Histogram metric\n\nA histogram maintains the count of any number of samples, over a set of bins. Bins are specified on construction via their _left_ edges, inclusive. There can't be any \"gaps\" in the bins, and an additional bin may be added to the left, right, or both so that the bins extend to the entire range of the support.\n\nNote that any gaps, unsorted bins, or non-finite values will result in an error.", + "type": "object", + "properties": { + "bins": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Binuint64" } }, - "rack_network_config": { - "nullable": true, - "allOf": [ - { - "$ref": "#/components/schemas/RackNetworkConfigV1" - } - ] + "n_samples": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "start_time": { + "type": "string", + "format": "date-time" } }, "required": [ - "ntp_servers" + "bins", + "n_samples", + "start_time" ] }, - "Error": { - "description": "Error information from a response.", + "Histogramuint8": { + "description": "Histogram metric\n\nA histogram maintains the count of any number of samples, over a set of bins. Bins are specified on construction via their _left_ edges, inclusive. 
There can't be any \"gaps\" in the bins, and an additional bin may be added to the left, right, or both so that the bins extend to the entire range of the support.\n\nNote that any gaps, unsorted bins, or non-finite values will result in an error.", "type": "object", "properties": { - "error_code": { - "type": "string" + "bins": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Binuint8" + } }, - "message": { - "type": "string" + "n_samples": { + "type": "integer", + "format": "uint64", + "minimum": 0 }, - "request_id": { - "type": "string" + "start_time": { + "type": "string", + "format": "date-time" } }, "required": [ - "message", - "request_id" + "bins", + "n_samples", + "start_time" ] }, - "Generation": { - "description": "Generation numbers stored in the database, used for optimistic concurrency control", - "type": "integer", - "format": "uint64", - "minimum": 0 - }, "HostIdentifier": { "description": "A `HostIdentifier` represents either an IP host or network (v4 or v6), or an entire VPC (identified by its VNI). It is used in firewall rule host filters.", "oneOf": [ @@ -2521,6 +4765,143 @@ "minLength": 5, "maxLength": 17 }, + "Measurement": { + "description": "A `Measurement` is a timestamped datum from a single metric", + "type": "object", + "properties": { + "datum": { + "$ref": "#/components/schemas/Datum" + }, + "timestamp": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "datum", + "timestamp" + ] + }, + "MetricsError": { + "description": "Errors related to the generation or collection of metrics.", + "oneOf": [ + { + "description": "An error related to generating metric data points", + "type": "object", + "properties": { + "content": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "datum_error" + ] + } + }, + "required": [ + "content", + "type" + ] + }, + { + "description": "An error running an `Oximeter` server", + "type": "object", + "properties": { + "content": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "oximeter_server" + ] + } + }, + "required": [ + "content", + "type" + ] + }, + { + "description": "An error related to creating or sampling a [`histogram::Histogram`] metric.", + "type": "object", + "properties": { + "content": { + "$ref": "#/components/schemas/HistogramError" + }, + "type": { + "type": "string", + "enum": [ + "histogram_error" + ] + } + }, + "required": [ + "content", + "type" + ] + }, + { + "description": "An error parsing a field or measurement from a string.", + "type": "object", + "properties": { + "content": { + "type": "object", + "properties": { + "src": { + "type": "string" + }, + "typ": { + "type": "string" + } + }, + "required": [ + "src", + "typ" + ] + }, + "type": { + "type": "string", + "enum": [ + "parse_error" + ] + } + }, + "required": [ + "content", + "type" + ] + }, + { + "description": "A field name is duplicated between the target and metric.", + "type": "object", + "properties": { + "content": { + "type": "object", + "properties": { + "name": { + "type": "string" + } + }, + "required": [ + "name" + ] + }, + "type": { + "type": "string", + "enum": [ + "duplicate_field_name" + ] + } + }, + "required": [ + "content", + "type" + ] + } + ] + }, "Name": { "title": "A name unique within the parent collection", "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. 
Names cannot be a UUID though they may contain a UUID.", @@ -2737,6 +5118,138 @@ "minItems": 2, "maxItems": 2 }, + "ProducerResultsItem": { + "oneOf": [ + { + "type": "object", + "properties": { + "info": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Sample" + } + }, + "status": { + "type": "string", + "enum": [ + "ok" + ] + } + }, + "required": [ + "info", + "status" + ] + }, + { + "type": "object", + "properties": { + "info": { + "$ref": "#/components/schemas/MetricsError" + }, + "status": { + "type": "string", + "enum": [ + "err" + ] + } + }, + "required": [ + "info", + "status" + ] + } + ] + }, + "QuantizationError": { + "description": "Errors occurring during quantizated bin generation.", + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "overflow" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "precision" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "invalid_base" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "invalid_steps" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "uneven_steps_for_base" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "powers_out_of_order" + ] + } + }, + "required": [ + "type" + ] + } + ] + }, "RackNetworkConfigV1": { "description": "Initial network configuration", "type": "object", @@ -2799,6 +5312,36 @@ "nexthop" ] }, + "Sample": { + "description": "A concrete type representing a single, timestamped measurement from a timeseries.", + "type": "object", + "properties": { + "measurement": { + "description": "The measured value of the metric at this sample", + "allOf": [ + { + "$ref": "#/components/schemas/Measurement" + } + ] + }, + "metric": { + "$ref": "#/components/schemas/FieldSet" + }, + "target": { + "$ref": "#/components/schemas/FieldSet" + }, + "timeseries_name": { + "description": "The name of the timeseries this sample belongs to", + "type": "string" + } + }, + "required": [ + "measurement", + "metric", + "target", + "timeseries_name" + ] + }, "SemverVersion": { "type": "string", "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml index 3653ab80112..8372b7c5602 100644 --- a/oximeter/instruments/Cargo.toml +++ b/oximeter/instruments/Cargo.toml @@ -5,15 +5,27 @@ edition = "2021" license = "MPL-2.0" [dependencies] +cfg-if.workspace = true chrono.workspace = true dropshot.workspace = true futures.workspace = true +http = { workspace = true, optional = true } oximeter.workspace = true +slog.workspace = true tokio.workspace = true -http = { workspace = true, optional = true } +thiserror.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true [features] -default = ["http-instruments"] +default = ["http-instruments", "kstat"] http-instruments = ["http"] +kstat = ["kstat-rs"] + +[dev-dependencies] +rand.workspace = true +slog-async.workspace = true +slog-term.workspace = true + +[target.'cfg(target_os = 
"illumos")'.dependencies] +kstat-rs = { workspace = true, optional = true } diff --git a/oximeter/instruments/src/kstat/link.rs b/oximeter/instruments/src/kstat/link.rs new file mode 100644 index 00000000000..d22ac60378c --- /dev/null +++ b/oximeter/instruments/src/kstat/link.rs @@ -0,0 +1,653 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Report metrics about Ethernet data links on the host system + +use crate::kstat::hrtime_to_utc; +use crate::kstat::ConvertNamedData; +use crate::kstat::Error; +use crate::kstat::KstatList; +use crate::kstat::KstatTarget; +use chrono::DateTime; +use chrono::Utc; +use kstat_rs::Data; +use kstat_rs::Kstat; +use kstat_rs::Named; +use oximeter::types::Cumulative; +use oximeter::Metric; +use oximeter::Sample; +use oximeter::Target; +use uuid::Uuid; + +/// Information about a single physical Ethernet link on a host. +#[derive(Clone, Debug, Target)] +pub struct PhysicalDataLink { + /// The ID of the rack (cluster) containing this host. + pub rack_id: Uuid, + /// The ID of the sled itself. + pub sled_id: Uuid, + /// The serial number of the hosting sled. + pub serial: String, + /// The name of the host. + pub hostname: String, + /// The name of the link. + pub link_name: String, +} + +/// Information about a virtual Ethernet link on a host. +/// +/// Note that this is specifically for a VNIC in on the host system, not a guest +/// data link. +#[derive(Clone, Debug, Target)] +pub struct VirtualDataLink { + /// The ID of the rack (cluster) containing this host. + pub rack_id: Uuid, + /// The ID of the sled itself. + pub sled_id: Uuid, + /// The serial number of the hosting sled. + pub serial: String, + /// The name of the host, or the zone name for links in a zone. + pub hostname: String, + /// The name of the link. + pub link_name: String, +} + +/// Information about a guest virtual Ethernet link. +#[derive(Clone, Debug, Target)] +pub struct GuestDataLink { + /// The ID of the rack (cluster) containing this host. + pub rack_id: Uuid, + /// The ID of the sled itself. + pub sled_id: Uuid, + /// The serial number of the hosting sled. + pub serial: String, + /// The name of the host, or the zone name for links in a zone. + pub hostname: String, + /// The ID of the project containing the instance. + pub project_id: Uuid, + /// The ID of the instance. + pub instance_id: Uuid, + /// The name of the link. + pub link_name: String, +} + +/// The number of packets received on the link. +#[derive(Clone, Copy, Metric)] +pub struct PacketsReceived { + pub datum: Cumulative, +} + +/// The number of packets sent on the link. +#[derive(Clone, Copy, Metric)] +pub struct PacketsSent { + pub datum: Cumulative, +} + +/// The number of bytes sent on the link. +#[derive(Clone, Copy, Metric)] +pub struct BytesSent { + pub datum: Cumulative, +} + +/// The number of bytes received on the link. +#[derive(Clone, Copy, Metric)] +pub struct BytesReceived { + pub datum: Cumulative, +} + +/// The number of errors received on the link. +#[derive(Clone, Copy, Metric)] +pub struct ErrorsReceived { + pub datum: Cumulative, +} + +/// The number of errors sent on the link. +#[derive(Clone, Copy, Metric)] +pub struct ErrorsSent { + pub datum: Cumulative, +} + +// Helper function to extract the same kstat metrics from all link targets. 
+fn extract_link_kstats<T>( + target: &T, + named_data: &Named, + creation_time: DateTime<Utc>, + snapshot_time: DateTime<Utc>, +) -> Option<Result<Sample, Error>> +where + T: KstatTarget, +{ + let Named { name, value } = named_data; + if *name == "rbytes64" { + Some(value.as_u64().and_then(|x| { + let metric = BytesReceived { + datum: Cumulative::with_start_time(creation_time, x), + }; + Sample::new_with_timestamp(snapshot_time, target, &metric) + .map_err(Error::Sample) + })) + } else if *name == "obytes64" { + Some(value.as_u64().and_then(|x| { + let metric = BytesSent { + datum: Cumulative::with_start_time(creation_time, x), + }; + Sample::new_with_timestamp(snapshot_time, target, &metric) + .map_err(Error::Sample) + })) + } else if *name == "ipackets64" { + Some(value.as_u64().and_then(|x| { + let metric = PacketsReceived { + datum: Cumulative::with_start_time(creation_time, x), + }; + Sample::new_with_timestamp(snapshot_time, target, &metric) + .map_err(Error::Sample) + })) + } else if *name == "opackets64" { + Some(value.as_u64().and_then(|x| { + let metric = PacketsSent { + datum: Cumulative::with_start_time(creation_time, x), + }; + Sample::new_with_timestamp(snapshot_time, target, &metric) + .map_err(Error::Sample) + })) + } else if *name == "ierrors" { + Some(value.as_u32().and_then(|x| { + let metric = ErrorsReceived { + datum: Cumulative::with_start_time(creation_time, x.into()), + }; + Sample::new_with_timestamp(snapshot_time, target, &metric) + .map_err(Error::Sample) + })) + } else if *name == "oerrors" { + Some(value.as_u32().and_then(|x| { + let metric = ErrorsSent { + datum: Cumulative::with_start_time(creation_time, x.into()), + }; + Sample::new_with_timestamp(snapshot_time, target, &metric) + .map_err(Error::Sample) + })) + } else { + None + } +} + +// Helper trait for defining `KstatTarget` for all the link-based stats.
+trait LinkKstatTarget: KstatTarget { + fn link_name(&self) -> &str; +} + +impl LinkKstatTarget for PhysicalDataLink { + fn link_name(&self) -> &str { + &self.link_name + } +} + +impl LinkKstatTarget for VirtualDataLink { + fn link_name(&self) -> &str { + &self.link_name + } +} + +impl LinkKstatTarget for GuestDataLink { + fn link_name(&self) -> &str { + &self.link_name + } +} + +impl<T> KstatTarget for T +where + T: LinkKstatTarget, +{ + fn interested(&self, kstat: &Kstat<'_>) -> bool { + kstat.ks_module == "link" + && kstat.ks_instance == 0 + && kstat.ks_name == self.link_name() + } + + fn to_samples( + &self, + kstats: KstatList<'_, '_>, + ) -> Result<Vec<Sample>, Error> { + let Some((creation_time, kstat, data)) = kstats.first() else { + return Ok(vec![]); + }; + let snapshot_time = hrtime_to_utc(kstat.ks_snaptime)?; + let Data::Named(named) = data else { + return Err(Error::ExpectedNamedKstat); + }; + named + .iter() + .filter_map(|nd| { + extract_link_kstats(self, nd, *creation_time, snapshot_time) + }) + .collect() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::kstat::sampler::KstatPath; + use crate::kstat::sampler::CREATION_TIME_PRUNE_INTERVAL; + use crate::kstat::CollectionDetails; + use crate::kstat::KstatSampler; + use crate::kstat::TargetStatus; + use kstat_rs::Ctl; + use oximeter::Producer; + use rand::distributions::Uniform; + use rand::Rng; + use slog::info; + use slog::Drain; + use slog::Logger; + use std::time::Duration; + use tokio::time::Instant; + use uuid::uuid; + use uuid::Uuid; + + fn test_logger() -> Logger { + let dec = + slog_term::PlainSyncDecorator::new(slog_term::TestStdoutWriter); + let drain = slog_term::FullFormat::new(dec).build().fuse(); + let log = + Logger::root(drain, slog::o!("component" => "fake-cleanup-task")); + log + } + + const RACK_ID: Uuid = uuid!("de784702-cafb-41a9-b3e5-93af189def29"); + const SLED_ID: Uuid = uuid!("88240343-5262-45f4-86f1-3c82fe383f2a"); + + // An etherstub we can use for testing. + // + // This is not meant to produce real data. It is simply a data link that + // shows up with the `link:::` kstat scheme, and which doesn't require us to + // decide over which physical link to create something like a VNIC.
+ #[derive(Debug)] + struct TestEtherstub { + name: String, + } + + impl TestEtherstub { + const PFEXEC: &str = "/usr/bin/pfexec"; + const DLADM: &str = "/usr/sbin/dladm"; + fn new() -> Self { + let name = format!( + "kstest{}0", + rand::thread_rng() + .sample_iter(Uniform::new('a', 'z')) + .take(5) + .map(char::from) + .collect::<String>(), + ); + Self::create(&name); + Self { name } + } + + fn create(name: &str) { + let output = std::process::Command::new(Self::PFEXEC) + .env_clear() + .arg(Self::DLADM) + .arg("create-etherstub") + .arg("-t") + .arg(name) + .output() + .expect("failed to spawn dladm"); + assert!( + output.status.success(), + "failed to create test etherstub:\n{}", + String::from_utf8_lossy(&output.stderr) + ); + } + } + + impl Drop for TestEtherstub { + fn drop(&mut self) { + let output = std::process::Command::new(Self::PFEXEC) + .env_clear() + .arg(Self::DLADM) + .arg("delete-etherstub") + .arg(&self.name) + .output() + .expect("failed to spawn dladm"); + if !output.status.success() { + eprintln!( + "Failed to delete etherstub '{}'.\n\ + Delete manually with `dladm delete-etherstub {}`:\n{}", + &self.name, + &self.name, + String::from_utf8_lossy(&output.stderr), + ); + } + } + } + + #[test] + fn test_physical_datalink() { + let link = TestEtherstub::new(); + let sn = String::from("BRM000001"); + let dl = PhysicalDataLink { + rack_id: RACK_ID, + sled_id: SLED_ID, + serial: sn.clone(), + hostname: sn, + link_name: link.name.to_string(), + }; + let ctl = Ctl::new().unwrap(); + let ctl = ctl.update().unwrap(); + let mut kstat = ctl + .filter(Some("link"), Some(0), Some(dl.link_name.as_str())) + .next() + .unwrap(); + let creation_time = hrtime_to_utc(kstat.ks_crtime).unwrap(); + let data = ctl.read(&mut kstat).unwrap(); + let samples = dl.to_samples(&[(creation_time, kstat, data)]).unwrap(); + println!("{samples:#?}"); + } + + #[tokio::test] + async fn test_kstat_sampler() { + let mut sampler = KstatSampler::new(&test_logger()).unwrap(); + let sn = String::from("BRM000001"); + let link = TestEtherstub::new(); + let dl = PhysicalDataLink { + rack_id: RACK_ID, + sled_id: SLED_ID, + serial: sn.clone(), + hostname: sn, + link_name: link.name.to_string(), + }; + let details = CollectionDetails::never(Duration::from_secs(1)); + let id = sampler.add_target(dl, details).await.unwrap(); + let samples: Vec<_> = sampler.produce().unwrap().collect(); + assert!(samples.is_empty()); + + // Pause time, and advance until we're notified of new samples. + tokio::time::pause(); + const MAX_DURATION: Duration = Duration::from_secs(3); + const STEP_DURATION: Duration = Duration::from_secs(1); + let now = Instant::now(); + let expected_counts = loop { + tokio::time::advance(STEP_DURATION).await; + if now.elapsed() > MAX_DURATION { + panic!("Waited too long for samples"); + } + if let Some(counts) = sampler.sample_counts() { + break counts; + } + }; + let samples: Vec<_> = sampler.produce().unwrap().collect(); + println!("{samples:#?}"); + assert_eq!(samples.len(), expected_counts.total); + assert_eq!(expected_counts.overflow, 0); + + // Test status and remove behavior. + tokio::time::resume(); + assert!(matches!( + sampler.target_status(id).await.unwrap(), + TargetStatus::Ok { ..
}, + )); + sampler.remove_target(id).await.unwrap(); + assert!(sampler.target_status(id).await.is_err()); + } + + #[tokio::test] + async fn test_kstat_sampler_with_overflow() { + let limit = 2; + let mut sampler = + KstatSampler::with_sample_limit(&test_logger(), limit).unwrap(); + let sn = String::from("BRM000001"); + let link = TestEtherstub::new(); + let dl = PhysicalDataLink { + rack_id: RACK_ID, + sled_id: SLED_ID, + serial: sn.clone(), + hostname: sn, + link_name: link.name.to_string(), + }; + let details = CollectionDetails::never(Duration::from_secs(1)); + sampler.add_target(dl, details).await.unwrap(); + let samples: Vec<_> = sampler.produce().unwrap().collect(); + assert!(samples.is_empty()); + + // Pause time, and advance until we're notified of new samples. + tokio::time::pause(); + const MAX_DURATION: Duration = Duration::from_secs(3); + const STEP_DURATION: Duration = Duration::from_secs(1); + let now = Instant::now(); + let expected_counts = loop { + tokio::time::advance(STEP_DURATION).await; + if now.elapsed() > MAX_DURATION { + panic!("Waited too long for samples"); + } + if let Some(counts) = sampler.sample_counts() { + break counts; + } + }; + + // We should have produced 2 samples from the actual target, plus one + // from the counter indicating we've dropped some samples! + let samples: Vec<_> = sampler.produce().unwrap().collect(); + let (link_samples, dropped_samples): (Vec<_>, Vec<_>) = samples + .iter() + .partition(|s| s.timeseries_name.contains("physical_data_link")); + println!("{link_samples:#?}"); + assert_eq!(link_samples.len(), limit); + + // The total number of samples less overflow should match the number + // of samples for the link we've produced. + assert_eq!( + link_samples.len(), + expected_counts.total - expected_counts.overflow + ); + + // The worker must have produced one sample representing the number of + // overflows. + println!("{dropped_samples:#?}"); + assert_eq!(dropped_samples.len(), 1); + + // Verify that we actually counted the correct number of dropped + // samples. + let oximeter::Datum::CumulativeU64(overflow) = + dropped_samples[0].measurement.datum() + else { + unreachable!(); + }; + assert_eq!(overflow.value(), expected_counts.overflow as u64); + } + + #[tokio::test] + async fn test_kstat_with_expiration() { + // Create a VNIC, which we'll start tracking from, then delete it and + // make sure we expire after the expected period. + let log = test_logger(); + let mut sampler = KstatSampler::new(&log).unwrap(); + let sn = String::from("BRM000001"); + let link = TestEtherstub::new(); + info!(log, "created test etherstub"; "name" => &link.name); + let dl = PhysicalDataLink { + rack_id: RACK_ID, + sled_id: SLED_ID, + serial: sn.clone(), + hostname: sn, + link_name: link.name.to_string(), + }; + let collection_interval = Duration::from_secs(1); + let expiry = Duration::from_secs(1); + let details = CollectionDetails::duration(collection_interval, expiry); + let id = sampler.add_target(dl, details).await.unwrap(); + info!(log, "target added"; "id" => ?id); + assert!(matches!( + sampler.target_status(id).await.unwrap(), + TargetStatus::Ok { .. }, + )); + + // Delete the link right away. + drop(link); + info!(log, "dropped test etherstub"); + + // Pause time, and advance until we should have expired the target.
+ tokio::time::pause(); + const MAX_DURATION: Duration = Duration::from_secs(3); + let now = Instant::now(); + let is_expired = loop { + tokio::time::advance(expiry).await; + if now.elapsed() > MAX_DURATION { + panic!("Waited too long for samples"); + } + if let TargetStatus::Expired { .. } = + sampler.target_status(id).await.unwrap() + { + break true; + } + }; + assert!(is_expired, "Target should have expired by now"); + + // We should have some self-stat expiration samples now. + let samples = sampler.produce().unwrap(); + let expiration_samples: Vec<_> = samples + .filter(|sample| { + sample.timeseries_name == "kstat_sampler:expired_targets" + }) + .collect(); + assert_eq!(expiration_samples.len(), 1); + } + + // A sanity check that a cumulative start time does not change over time, + // since we've fixed the time reference at the time it was added. + #[tokio::test] + async fn test_kstat_start_time_is_equal() { + let log = test_logger(); + let mut sampler = KstatSampler::new(&log).unwrap(); + let sn = String::from("BRM000001"); + let link = TestEtherstub::new(); + info!(log, "created test etherstub"; "name" => &link.name); + let dl = PhysicalDataLink { + rack_id: RACK_ID, + sled_id: SLED_ID, + serial: sn.clone(), + hostname: sn, + link_name: link.name.to_string(), + }; + let collection_interval = Duration::from_secs(1); + let expiry = Duration::from_secs(1); + let details = CollectionDetails::duration(collection_interval, expiry); + let id = sampler.add_target(dl, details).await.unwrap(); + info!(log, "target added"; "id" => ?id); + assert!(matches!( + sampler.target_status(id).await.unwrap(), + TargetStatus::Ok { .. }, + )); + tokio::time::pause(); + let now = Instant::now(); + while now.elapsed() < (expiry * 10) { + tokio::time::advance(expiry).await; + } + let samples = sampler.produce().unwrap(); + let mut start_times = samples + .filter(|sample| { + sample.timeseries_name.as_str().starts_with("physical") + }) + .map(|sample| sample.measurement.start_time().unwrap()); + let first = start_times.next().unwrap(); + println!("{first}"); + assert!(start_times.all(|t| { + println!("{t}"); + t == first + })); + } + + #[tokio::test] + async fn test_prune_creation_times_when_kstat_is_gone() { + // Create a VNIC, which we'll start tracking from, then delete it and + // make sure the creation times are pruned. + let log = test_logger(); + let sampler = KstatSampler::new(&log).unwrap(); + let sn = String::from("BRM000001"); + let link = TestEtherstub::new(); + let path = KstatPath { + module: "link".to_string(), + instance: 0, + name: link.name.clone(), + }; + info!(log, "created test etherstub"; "name" => &link.name); + let dl = PhysicalDataLink { + rack_id: RACK_ID, + sled_id: SLED_ID, + serial: sn.clone(), + hostname: sn, + link_name: link.name.to_string(), + }; + let collection_interval = Duration::from_secs(1); + let expiry = Duration::from_secs(1); + let details = CollectionDetails::duration(collection_interval, expiry); + let id = sampler.add_target(dl, details).await.unwrap(); + info!(log, "target added"; "id" => ?id); + assert!(matches!( + sampler.target_status(id).await.unwrap(), + TargetStatus::Ok { .. }, + )); + + // Delete the link right away. + drop(link); + info!(log, "dropped test etherstub"); + + // Advance time through the prune interval. + tokio::time::pause(); + let now = Instant::now(); + while now.elapsed() < CREATION_TIME_PRUNE_INTERVAL + expiry { + tokio::time::advance(expiry).await; + } + + // Now check that the creation times are pruned. 
+ let times = sampler.creation_times().await; + assert!(!times.contains_key(&path)); + } + + #[tokio::test] + async fn test_prune_creation_times_when_target_is_removed() { + // Create a VNIC, which we'll start tracking from, then delete it and + // make sure the creation times are pruned. + let log = test_logger(); + let sampler = KstatSampler::new(&log).unwrap(); + let sn = String::from("BRM000001"); + let link = TestEtherstub::new(); + let path = KstatPath { + module: "link".to_string(), + instance: 0, + name: link.name.clone(), + }; + info!(log, "created test etherstub"; "name" => &link.name); + let dl = PhysicalDataLink { + rack_id: RACK_ID, + sled_id: SLED_ID, + serial: sn.clone(), + hostname: sn, + link_name: link.name.to_string(), + }; + let collection_interval = Duration::from_secs(1); + let expiry = Duration::from_secs(1); + let details = CollectionDetails::duration(collection_interval, expiry); + let id = sampler.add_target(dl, details).await.unwrap(); + info!(log, "target added"; "id" => ?id); + assert!(matches!( + sampler.target_status(id).await.unwrap(), + TargetStatus::Ok { .. }, + )); + + // Remove the target, but do not drop the link. This will mean that the + // underlying kstat is still around, even though there's no target + // that's interested in it. We should keep it, in this case. + sampler.remove_target(id).await.unwrap(); + + // Advance time through the prune interval. + tokio::time::pause(); + let now = Instant::now(); + while now.elapsed() < CREATION_TIME_PRUNE_INTERVAL + expiry { + tokio::time::advance(expiry).await; + } + + // Now check that the creation time is still around. + let times = sampler.creation_times().await; + assert!(times.contains_key(&path)); + } +} diff --git a/oximeter/instruments/src/kstat/mod.rs b/oximeter/instruments/src/kstat/mod.rs new file mode 100644 index 00000000000..90f34acae85 --- /dev/null +++ b/oximeter/instruments/src/kstat/mod.rs @@ -0,0 +1,267 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2023 Oxide Computer Company + +//! Types for publishing kernel statistics via oximeter. +//! +//! # illumos kernel statistics +//! +//! illumos defines a generic framework for tracking statistics throughout the +//! OS, called `kstat`. These statistics are generally defined by the kernel or +//! device drivers, and made available to userspace for reading through +//! `libkstat`. +//! +//! Kernel statistics are defined by a 4-tuple of names, canonically separated +//! with a ":". For example, the `cpu:0:vm:pgin` kstat tracks the number of +//! memory pages paged in for CPU 0. In this case, the kstat is tracked by a +//! u64, though several data types are supported. +//! +//! This module uses the `kstat-rs` crate, which is a fairly low-level wrapper +//! around `libkstat`. For the purposes of this module, most folks will be +//! interested in the types [`kstat_rs::Kstat`], [`kstat_rs::Data`], and +//! [`kstat_rs::NamedData`]. +//! +//! # Oximeter +//! +//! Oximeter is the Oxide control plane component which collects telemetry from +//! other parts of the system, called _producers_. Statistics are defined using +//! a _target_, which is an object about which statistics are collected, and +//! _metrics_, which are the actual measurements about a target. As an example, +//! a target might be an NVMe drive on the rack, and a metric might be its +//! 
temperature, or the estimated remaining drive lifetime. +//! +//! Targets and metrics are encapsulated by traits of the same name. Using +//! these, producers can generate timestamped [`Sample`]s, which the `oximeter` +//! collector program pulls at regular intervals for storage in the timeseries +//! database. +//! +//! # What does this mod do? +//! +//! This module is intended to connect illumos kstats with oximeter. Developers +//! can use this to define a mapping between one or more kstats and an +//! oximeter `Target` and `Metric`. This mapping is encapsulated in the +//! [`KstatTarget`] trait, which extends the `Target` trait itself. +//! +//! To implement the trait, developers register their interest in a particular +//! [`Kstat`] through the [`KstatTarget::interested()`] method. They then +//! describe how to generate any number of [`Sample`]s from that set of kstats, +//! through the [`KstatTarget::to_samples()`] method. +//! +//! # The [`KstatSampler`] +//! +//! Most folks will instantiate a [`KstatSampler`], which manages any number of +//! tracked `KstatTarget`s. Users can register their implementation of +//! `KstatTarget` with the sampler, and it will periodically generate samples +//! from it, converting the "interesting" kstats into `Sample`s. +//! +//! # Intervals and expiration +//! +//! When users register a target for sampling, they are required to include +//! details about how often their target should be sampled, and what to do if we +//! cannot produce samples due to an error, or if there are _no kstats_ that the +//! target is interested in. These details are captured in the +//! [`CollectionDetails`] type. +//! +//! After a configurable period of errors (expressed in either consecutive error +//! counts or a duration of consecutive errors), a target is _expired_, and will +//! no longer be collected from. A target's status may be queried with the +//! [`KstatSampler::target_status()`] method, which will inform the caller if +//! the target has expired. In this case, users can re-register a target, which +//! will replace the expired one, generating new samples (assuming the error +//! condition has been resolved). + +use chrono::DateTime; +use chrono::Utc; +use kstat_rs::Data; +use kstat_rs::Error as KstatError; +use kstat_rs::Kstat; +use kstat_rs::NamedData; +use kstat_rs::NamedType; +use oximeter::FieldValue; +use oximeter::MetricsError; +use oximeter::Sample; +use oximeter::Target; +use std::cmp::Ordering; +use std::collections::BTreeMap; +use std::time::Duration; + +pub mod link; +mod sampler; + +pub use sampler::CollectionDetails; +pub use sampler::ExpirationBehavior; +pub use sampler::KstatSampler; +pub use sampler::TargetId; +pub use sampler::TargetStatus; + +/// The reason a kstat target was expired and removed from a sampler. +#[derive(Clone, Copy, Debug)] +pub enum ExpirationReason { + /// Expired after too many failed attempts. + Attempts(usize), + /// Expired after a defined interval of consistent failures. + Duration(Duration), +} + +/// An error describing why a kstat target was expired. +#[derive(Debug)] +pub struct Expiration { + /// The reason for expiration. + pub reason: ExpirationReason, + /// The last error before expiration. + pub error: Box<Error>, + /// The time at which the expiration occurred. + #[cfg(test)] + pub expired_at: tokio::time::Instant, + #[cfg(not(test))] + pub expired_at: DateTime<Utc>, +}
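
As a concrete illustration of the registration flow described in the module documentation, the sketch below shows how a consumer crate might register one of the `link` targets with a `KstatSampler` and later inspect its status. It is a minimal sketch only: the logger, the UUIDs, the serial number, and the `igb0` link name are placeholder assumptions, and error handling is reduced to `Box<dyn Error>`.

use std::time::Duration;
use oximeter_instruments::kstat::link::PhysicalDataLink;
use oximeter_instruments::kstat::{CollectionDetails, KstatSampler, TargetStatus};
use uuid::Uuid;

async fn track_physical_link(
    log: &slog::Logger,
) -> Result<(), Box<dyn std::error::Error>> {
    // The sampler owns the kstat chain and samples registered targets from
    // a background tokio task.
    let sampler = KstatSampler::new(log)?;

    // Placeholder identifiers; a real caller would use the sled's values.
    let target = PhysicalDataLink {
        rack_id: Uuid::new_v4(),
        sled_id: Uuid::new_v4(),
        serial: String::from("BRM000001"),
        hostname: String::from("myhost"),
        link_name: String::from("igb0"),
    };

    // Collect every 5 seconds; expire after a minute of consecutive failures.
    let details = CollectionDetails::duration(
        Duration::from_secs(5),
        Duration::from_secs(60),
    );
    let id = sampler.add_target(target, details).await?;

    // The opaque ID can later be used to query status or remove the target.
    if let TargetStatus::Expired { reason, error, .. } =
        sampler.target_status(id).await?
    {
        slog::warn!(log, "link target expired"; "reason" => ?reason, "error" => error);
    }
    Ok(())
}

The sampler itself implements `oximeter::Producer`, so the accumulated samples are drained whenever the oximeter collector polls the producer.
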
+/// Errors resulting from reporting kernel statistics. +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("Could not find kstat with the expected name")] + NoSuchKstat, + + #[error("Kstat does not have the expected data type")] + UnexpectedDataType { expected: NamedType, found: NamedType }, + + #[error("Expected a named kstat")] + ExpectedNamedKstat, + + #[error("Duplicate target instance")] + DuplicateTarget { + target_name: String, + fields: BTreeMap<String, FieldValue>, + }, + + #[error("No such kstat target exists")] + NoSuchTarget, + + #[error("Error generating sample")] + Sample(#[from] MetricsError), + + #[error("Kstat library error")] + Kstat(#[from] KstatError), + + #[error("Kstat control handle is not available")] + NoKstatCtl, + + #[error("Overflow computing kstat creation or snapshot time")] + TimestampOverflow, + + #[error("Failed to send message to background sampling task")] + SendError, + + #[error("Failed to receive message from background sampling task")] + RecvError, + + #[error("Expired following too many unsuccessful collection attempts")] + Expired(Expiration), + + #[error("Expired after unsuccessful collections for {duration:?}")] + ExpiredAfterDuration { duration: Duration, error: Box<Error> }, +} + +/// Type alias for a list of kstats. +/// +/// This includes the kstat's creation time, the kstat itself, and its data. +pub type KstatList<'a, 'k> = &'a [(DateTime<Utc>, Kstat<'k>, Data<'k>)]; + +/// A trait for generating oximeter samples from a kstat. +/// +/// This trait is used to describe the kernel statistics that are relevant for +/// an `oximeter::Target`, and how to generate samples from them. +pub trait KstatTarget: + Target + Send + Sync + 'static + std::fmt::Debug +{ + /// Return true for any kstat you're interested in. + fn interested(&self, kstat: &Kstat<'_>) -> bool; + + /// Convert from a kstat and its data to a list of samples. + fn to_samples( + &self, + kstats: KstatList<'_, '_>, + ) -> Result<Vec<Sample>, Error>; +} + +/// Convert from a high-res timestamp into UTC, if possible. +pub fn hrtime_to_utc(hrtime: i64) -> Result<DateTime<Utc>, Error> { + let utc_now = Utc::now(); + let hrtime_now = unsafe { gethrtime() }; + match hrtime_now.cmp(&hrtime) { + Ordering::Equal => Ok(utc_now), + Ordering::Less => { + let offset = u64::try_from(hrtime - hrtime_now) + .map_err(|_| Error::TimestampOverflow)?; + Ok(utc_now + Duration::from_nanos(offset)) + } + Ordering::Greater => { + let offset = u64::try_from(hrtime_now - hrtime) + .map_err(|_| Error::TimestampOverflow)?; + Ok(utc_now - Duration::from_nanos(offset)) + } + } +}
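
To make the `KstatTarget` contract above concrete, here is a hedged sketch of an implementation for an invented target. The `mymod:0:mystat` kstat, the `MyDevice` target, the `IoCount` metric, and the `io_count` statistic name are all hypothetical. Note that it leans on the crate-private `ConvertNamedData` helper defined just below, so a sketch like this only compiles inside this module tree.

use kstat_rs::Named;

/// A hypothetical target: some device identified by a UUID.
#[derive(Clone, Debug, oximeter::Target)]
pub struct MyDevice {
    pub device_id: uuid::Uuid,
}

/// A hypothetical metric: a cumulative I/O count read from the kstat.
#[derive(Clone, Copy, Debug, oximeter::Metric)]
pub struct IoCount {
    pub datum: oximeter::types::Cumulative<u64>,
}

impl KstatTarget for MyDevice {
    // Signal interest only in the hypothetical `mymod:0:mystat` kstat.
    fn interested(&self, kstat: &Kstat<'_>) -> bool {
        kstat.ks_module == "mymod"
            && kstat.ks_instance == 0
            && kstat.ks_name == "mystat"
    }

    // Produce at most one sample, from the `io_count` named statistic.
    fn to_samples(
        &self,
        kstats: KstatList<'_, '_>,
    ) -> Result<Vec<Sample>, Error> {
        let Some((creation_time, kstat, data)) = kstats.first() else {
            return Ok(vec![]);
        };
        let snapshot_time = hrtime_to_utc(kstat.ks_snaptime)?;
        let Data::Named(named) = data else {
            return Err(Error::ExpectedNamedKstat);
        };
        let Some(nd) = named.iter().find(|nd| nd.name == "io_count") else {
            return Ok(vec![]);
        };
        // Anchor the cumulative counter at the kstat's creation time, so the
        // start time is stable across collections.
        let metric = IoCount {
            datum: oximeter::types::Cumulative::with_start_time(
                *creation_time,
                nd.value.as_u64()?,
            ),
        };
        Sample::new_with_timestamp(snapshot_time, self, &metric)
            .map(|sample| vec![sample])
            .map_err(Error::Sample)
    }
}
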
+// Helper trait for converting a `NamedData` item into a specific contained data +// type, if possible. +pub(crate) trait ConvertNamedData { + fn as_i32(&self) -> Result<i32, Error>; + fn as_u32(&self) -> Result<u32, Error>; + fn as_i64(&self) -> Result<i64, Error>; + fn as_u64(&self) -> Result<u64, Error>; +} + +impl<'a> ConvertNamedData for NamedData<'a> { + fn as_i32(&self) -> Result<i32, Error> { + if let NamedData::Int32(x) = self { + Ok(*x) + } else { + Err(Error::UnexpectedDataType { + expected: NamedType::Int32, + found: self.data_type(), + }) + } + } + + fn as_u32(&self) -> Result<u32, Error> { + if let NamedData::UInt32(x) = self { + Ok(*x) + } else { + Err(Error::UnexpectedDataType { + expected: NamedType::UInt32, + found: self.data_type(), + }) + } + } + + fn as_i64(&self) -> Result<i64, Error> { + if let NamedData::Int64(x) = self { + Ok(*x) + } else { + Err(Error::UnexpectedDataType { + expected: NamedType::Int64, + found: self.data_type(), + }) + } + } + + fn as_u64(&self) -> Result<u64, Error> { + if let NamedData::UInt64(x) = self { + Ok(*x) + } else { + Err(Error::UnexpectedDataType { + expected: NamedType::UInt64, + found: self.data_type(), + }) + } + } +} + +#[link(name = "c")] +extern "C" { + fn gethrtime() -> i64; +} diff --git a/oximeter/instruments/src/kstat/sampler.rs b/oximeter/instruments/src/kstat/sampler.rs new file mode 100644 index 00000000000..bab8ad0ba51 --- /dev/null +++ b/oximeter/instruments/src/kstat/sampler.rs @@ -0,0 +1,1225 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Generate oximeter samples from kernel statistics. + +use crate::kstat::hrtime_to_utc; +use crate::kstat::Error; +use crate::kstat::Expiration; +use crate::kstat::ExpirationReason; +use crate::kstat::KstatTarget; +use chrono::DateTime; +use chrono::Utc; +use futures::stream::FuturesUnordered; +use futures::StreamExt; +use kstat_rs::Ctl; +use kstat_rs::Kstat; +use oximeter::types::Cumulative; +use oximeter::Metric; +use oximeter::MetricsError; +use oximeter::Sample; +use slog::debug; +use slog::error; +use slog::o; +use slog::trace; +use slog::warn; +use slog::Logger; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::fmt; +use std::pin::Pin; +use std::sync::Arc; +use std::sync::Mutex; +use std::task::Context; +use std::task::Poll; +use std::time::Duration; +use tokio::sync::mpsc; +use tokio::sync::oneshot; +use tokio::time::interval; +use tokio::time::sleep; +use tokio::time::Sleep; + +#[cfg(test)] +use tokio::time::Instant; + +// The `KstatSampler` generates some statistics about its own operation, mostly +// for surfacing failures to collect and dropped samples. +mod self_stats { + use super::BTreeMap; + use super::Cumulative; + use super::TargetId; + + /// Information identifying this kstat sampler. + #[derive(Debug, oximeter::Target)] + pub struct KstatSampler { + /// The hostname (or zonename) of the host machine. + pub hostname: String, + } + + /// The total number of samples dropped for a single target. + #[derive(Debug, oximeter::Metric)] + pub struct SamplesDropped { + /// The ID of the target being tracked. + pub target_id: u64, + /// The name of the target being tracked. + pub target_name: String, + pub datum: Cumulative<u64>, + } + + /// The cumulative number of expired targets.
+ #[derive(Debug, oximeter::Metric)] + pub struct ExpiredTargets { + pub datum: Cumulative<u64>, + } + + #[derive(Debug)] + pub struct SelfStats { + pub target: KstatSampler, + // We'll store it this way for quick lookups, and build the type as we + // need it when publishing the samples, from the key and value. + pub drops: BTreeMap<(TargetId, String), Cumulative<u64>>, + pub expired: ExpiredTargets, + } + + impl SelfStats { + pub fn new(hostname: String) -> Self { + Self { + target: KstatSampler { hostname }, + drops: BTreeMap::new(), + expired: ExpiredTargets { datum: Cumulative::new(0) }, + } + } + } +} + +/// An identifier for a single tracked kstat target. +/// +/// This opaque identifier can be used to unregister targets from the sampler. +/// If not removed, data from the targets will be produced according to the +/// [`ExpirationBehavior`] configured for the target. +#[derive(Clone, Copy, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct TargetId(u64); + +impl fmt::Debug for TargetId { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +impl fmt::Display for TargetId { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +/// When to expire a kstat which can no longer be collected from. +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum ExpirationBehavior { + /// Never stop attempting to produce data from this target. + Never, + /// Expire after a number of sequential failed collections. + /// + /// If the payload is 0, expire after the first failure. + Attempts(usize), + /// Expire after a specified period of failing to collect. + Duration(Duration), +} + +/// Details about the collection and expiration intervals for a target. +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct CollectionDetails { + /// The interval on which data from the target is collected. + pub interval: Duration, + /// The expiration behavior, specifying how to handle situations in which we + /// cannot collect kstat samples. + /// + /// Note that this includes both errors during an attempt to collect, and + /// situations in which the target doesn't signal interest in any kstats. + /// The latter can occur if the physical resource (such as a datalink) that + /// underlies the kstat has disappeared, for example. + pub expiration: ExpirationBehavior, +} + +impl CollectionDetails { + /// Return collection details with no expiration. + pub fn never(interval: Duration) -> Self { + Self { interval, expiration: ExpirationBehavior::Never } + } + + /// Return collection details that expire after a number of attempts. + pub fn attempts(interval: Duration, count: usize) -> Self { + Self { interval, expiration: ExpirationBehavior::Attempts(count) } + } + + /// Return collection details that expire after a duration has elapsed. + pub fn duration(interval: Duration, duration: Duration) -> Self { + Self { interval, expiration: ExpirationBehavior::Duration(duration) } + } +}
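
For orientation, the three constructors above map onto the expiration behaviors as follows; the intervals here are arbitrary example values:

// Collect every 10 seconds; keep trying forever, even across failures.
let unbounded = CollectionDetails::never(Duration::from_secs(10));

// Collect every 10 seconds; expire after 5 consecutive failed attempts.
let by_attempts = CollectionDetails::attempts(Duration::from_secs(10), 5);

// Collect every 10 seconds; expire once collection has failed
// continuously for five minutes.
let by_duration = CollectionDetails::duration(
    Duration::from_secs(10),
    Duration::from_secs(300),
);
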
+/// The status of a sampled kstat-based target. +#[derive(Clone, Debug)] +pub enum TargetStatus { + /// The target is currently being collected from normally. + /// + /// The timestamp of the last collection is included. + Ok { + #[cfg(test)] + last_collection: Option<Instant>, + #[cfg(not(test))] + last_collection: Option<DateTime<Utc>>, + }, + /// The target has been expired. + /// + /// The details about the expiration are included. + Expired { + reason: ExpirationReason, + // NOTE: The error is a string, because it's not cloneable. + error: String, + #[cfg(test)] + expired_at: Instant, + #[cfg(not(test))] + expired_at: DateTime<Utc>, + }, +} + +/// A request sent from `KstatSampler` to the worker task. +// NOTE: The docstrings here are public for ease of consumption by IDEs and +// other tooling. +#[derive(Debug)] +enum Request { + /// Add a new target for sampling. + AddTarget { + target: Box<dyn KstatTarget>, + details: CollectionDetails, + reply_tx: oneshot::Sender<Result<TargetId, Error>>, + }, + /// Request the status for a target. + TargetStatus { + id: TargetId, + reply_tx: oneshot::Sender<Result<TargetStatus, Error>>, + }, + /// Remove a target. + RemoveTarget { id: TargetId, reply_tx: oneshot::Sender<Result<(), Error>> }, + /// Return the creation times of all tracked / extant kstats. + #[cfg(test)] + CreationTimes { + reply_tx: oneshot::Sender<BTreeMap<KstatPath, DateTime<Utc>>>, + }, +} + +/// Data about a single kstat target. +#[derive(Debug)] +struct SampledKstat { + /// The target from which to collect. + target: Box<dyn KstatTarget>, + /// The details around collection and expiration behavior. + details: CollectionDetails, + /// The time at which we _added_ this target to the sampler. + #[cfg(test)] + time_added: Instant, + #[cfg(not(test))] + time_added: DateTime<Utc>, + /// The last time we successfully collected from the target. + #[cfg(test)] + time_of_last_collection: Option<Instant>, + #[cfg(not(test))] + time_of_last_collection: Option<DateTime<Utc>>, + /// Attempts since we last successfully collected from the target. + attempts_since_last_collection: usize, +} + +/// Represents the current state of a registered target. +/// +/// We use this to report the status of a kstat target, such as reporting +/// whether it has been expired. +#[derive(Debug)] +enum SampledObject { + Kstat(SampledKstat), + Expired(Expiration), +} + +/// Helper to hash a target, used to create a unique ID for it. +fn hash_target(t: &dyn KstatTarget) -> TargetId { + use std::hash::Hash; + use std::hash::Hasher; + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + t.name().hash(&mut hasher); + for f in t.fields() { + f.hash(&mut hasher); + } + TargetId(hasher.finish()) +} + +/// Small future that yields an ID for a target to be sampled after its +/// predefined interval expires. +struct YieldIdAfter { + /// Future to which we delegate to awake us after our interval. + sleep: Pin<Box<Sleep>>, + /// The next interval to yield when we complete. + interval: Duration, + /// The ID of the target to yield when we complete. + id: TargetId, +} + +impl std::fmt::Debug for YieldIdAfter { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("YieldIdAfter") + .field("sleep", &"_") + .field("interval", &self.interval) + .field("id", &self.id) + .finish() + } +} + +impl YieldIdAfter { + fn new(id: TargetId, interval: Duration) -> Self { + Self { sleep: Box::pin(sleep(interval)), interval, id } + } +} + +impl core::future::Future for YieldIdAfter { + type Output = (TargetId, Duration); + + fn poll( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll<Self::Output> { + match self.sleep.as_mut().poll(cx) { + Poll::Pending => Poll::Pending, + Poll::Ready(_) => Poll::Ready((self.id, self.interval)), + } + } +}
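
The worker task described below keeps one pending `YieldIdAfter` per target in a `FuturesUnordered`, sampling a target whenever its future fires and then re-arming it. A stripped-down model of that loop, which only compiles inside this module since `TargetId` and `YieldIdAfter` are private, might look like this:

async fn scheduling_model() {
    let mut timeouts = FuturesUnordered::new();
    timeouts.push(YieldIdAfter::new(TargetId(1), Duration::from_secs(1)));
    timeouts.push(YieldIdAfter::new(TargetId(2), Duration::from_secs(5)));
    // Each completed future yields the target's ID and interval; after
    // sampling, a fresh timeout is pushed so the same target fires again.
    // The stream only ends when the set is empty, which here is never.
    while let Some((id, interval)) = timeouts.next().await {
        // ... read the kstats for `id` and enqueue samples here ...
        timeouts.push(YieldIdAfter::new(id, interval));
    }
}
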
+/// An owned type used to keep track of the creation time for each kstat in +/// which interest has been signaled. +#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] +pub(crate) struct KstatPath { + pub module: String, + pub instance: i32, + pub name: String, +} + +impl<'a> From<Kstat<'a>> for KstatPath { + fn from(k: Kstat<'a>) -> Self { + Self { + module: k.ks_module.to_string(), + instance: k.ks_instance, + name: k.ks_name.to_string(), + } + } +} + +/// The interval on which we prune creation times. +/// +/// As targets are added and kstats are tracked, we store their creation times +/// in the `KstatSamplerWorker::creation_times` mapping. This lets us keep a +/// consistent start time, which is important for cumulative metrics. In order +/// to keep these consistent, we don't necessarily prune them as a target is +/// removed or interest in the kstat changes, since the target may be added +/// again. Instead, we prune the creation times only when the _kstat itself_ is +/// removed from the kstat chain. +pub(crate) const CREATION_TIME_PRUNE_INTERVAL: Duration = + Duration::from_secs(60); + +/// Type which owns the `kstat` chain and samples each target on an interval. +/// +/// This type runs in a separate tokio task. As targets are added, it schedules +/// futures which expire after the target's sampling interval, yielding the +/// target's ID. (This is the `YieldIdAfter` type.) When those futures complete, +/// this type then samples the kstats they indicate, and pushes those onto a +/// per-target queue of samples. +#[derive(Debug)] +struct KstatSamplerWorker { + log: Logger, + + /// The kstat chain. + ctl: Option<Ctl>, + + /// The set of registered targets to collect kstats from, ordered by their + /// IDs. + targets: BTreeMap<TargetId, SampledObject>, + + /// The set of creation times for all tracked kstats. + /// + /// As interest in kstats is noted, we add a creation time for those kstats + /// here. It is removed only when the kstat itself no longer appears on the + /// kstat chain, even if a caller removes the target or no longer signals + /// interest in it. + creation_times: BTreeMap<KstatPath, DateTime<Utc>>, + + /// The per-target queue of samples, pulled by the main `KstatSampler` type + /// when producing metrics. + samples: Arc<Mutex<BTreeMap<TargetId, Vec<Sample>>>>, + + /// Inbox channel on which the `KstatSampler` sends messages. + inbox: mpsc::Receiver<Request>, + + /// The maximum number of samples we allow in each per-target buffer, to + /// avoid huge allocations when we produce data faster than it's collected. + sample_limit: usize, + + /// Outbound queue on which to publish self statistics, which are expected to + /// be low-volume. + self_stat_queue: mpsc::Sender<Sample>, + + /// The statistics we maintain about ourselves. + /// + /// This is an option, since it's possible we fail to extract the hostname + /// at construction time. In that case, we'll try again the next time we + /// need it. + self_stats: Option<self_stats::SelfStats>, +} + +fn hostname() -> Option<String> { + let out = + std::process::Command::new("hostname").env_clear().output().ok()?; + if !out.status.success() { + return None; + } + Some(String::from_utf8_lossy(&out.stdout).trim().to_string()) +} + +/// Stores the number of samples taken, used for testing. +#[cfg(test)] +pub(crate) struct SampleCounts { + pub total: usize, + pub overflow: usize, +}
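
The `samples` map above is the hand-off point between this worker and the `KstatSampler` producer: both sides hold the same `Arc`, the worker appends under the mutex, and the producer drains when polled. A self-contained model of that sharing, using stand-in types rather than the real `TargetId` and `Sample`, is sketched below:

use std::collections::BTreeMap;
use std::sync::{Arc, Mutex};

// Stand-ins for TargetId and Sample, for illustration only.
type SharedQueues = Arc<Mutex<BTreeMap<u64, Vec<String>>>>;

// The worker side: append newly generated samples for one target.
fn worker_append(queues: &SharedQueues, id: u64, new: Vec<String>) {
    queues.lock().unwrap().entry(id).or_default().extend(new);
}

// The producer side: drain every per-target queue in one pass.
fn producer_drain(queues: &SharedQueues) -> Vec<String> {
    let mut queues = queues.lock().unwrap();
    queues.values_mut().flat_map(std::mem::take).collect()
}
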
+impl KstatSamplerWorker { + /// Create a new sampler worker. + fn new( + log: Logger, + inbox: mpsc::Receiver<Request>, + self_stat_queue: mpsc::Sender<Sample>, + samples: Arc<Mutex<BTreeMap<TargetId, Vec<Sample>>>>, + sample_limit: usize, + ) -> Result<Self, Error> { + let ctl = Some(Ctl::new().map_err(Error::Kstat)?); + let self_stats = hostname().map(self_stats::SelfStats::new); + Ok(Self { + ctl, + log, + targets: BTreeMap::new(), + creation_times: BTreeMap::new(), + samples, + inbox, + sample_limit, + self_stat_queue, + self_stats, + }) + } + + /// Consume self and run its main polling loop. + /// + /// This will accept messages on its inbox, and also sample the registered + /// kstats at their intervals. Samples will be pushed onto the queue. + async fn run( + mut self, + #[cfg(test)] sample_count_tx: mpsc::UnboundedSender<SampleCounts>, + ) { + let mut sample_timeouts = FuturesUnordered::new(); + let mut creation_prune_interval = + interval(CREATION_TIME_PRUNE_INTERVAL); + creation_prune_interval.tick().await; // Completes immediately. + loop { + tokio::select! { + _ = creation_prune_interval.tick() => { + if let Err(e) = self.prune_creation_times() { + error!( + self.log, + "failed to prune creation times"; + "error" => ?e, + ); + } + } + maybe_id = sample_timeouts.next(), if !sample_timeouts.is_empty() => { + let Some((id, interval)) = maybe_id else { + unreachable!(); + }; + match self.sample_one(id) { + Ok(Some(samples)) => { + if samples.is_empty() { + debug!( + self.log, + "no new samples from target, requeueing"; + "id" => ?id, + ); + sample_timeouts.push(YieldIdAfter::new(id, interval)); + continue; + } + let n_samples = samples.len(); + debug!( + self.log, + "pulled samples from target"; + "id" => ?id, + "n_samples" => n_samples, + ); + + // Append any samples to the per-target queues. + // + // This returns None if there was no queue, and logs + // the error internally. We'll just go round the + // loop again in that case. + let Some(n_overflow_samples) = + self.append_per_target_samples(id, samples) else { + continue; + }; + + // Safety: We only get here if the `sample_one()` + // method works, which means we have a record of the + // target, and it's not expired. + if n_overflow_samples > 0 { + let SampledObject::Kstat(ks) = self.targets.get(&id).unwrap() else { + unreachable!(); + }; + self.increment_dropped_sample_counter( + id, + ks.target.name().to_string(), + n_overflow_samples, + ).await; + } + + // Send the total number of samples we've actually + // taken and the number we've appended over to any + // testing code which might be listening.
+ #[cfg(test)] + sample_count_tx.send(SampleCounts { + total: n_samples, + overflow: n_overflow_samples, + }).unwrap(); + + trace!( + self.log, + "re-queueing target for sampling"; + "id" => ?id, + "interval" => ?interval, + ); + sample_timeouts.push(YieldIdAfter::new(id, interval)); + } + Ok(None) => { + debug!( + self.log, + "sample timeout triggered for non-existent target"; + "id" => ?id, + ); + } + Err(Error::Expired(expiration)) => { + error!( + self.log, + "expiring kstat after too many failures"; + "id" => ?id, + "reason" => ?expiration.reason, + "error" => ?expiration.error, + ); + let _ = self.targets.insert(id, SampledObject::Expired(expiration)); + self.increment_expired_target_counter().await; + } + Err(e) => { + error!( + self.log, + "failed to sample kstat target, requeueing"; + "id" => ?id, + "error" => ?e, + ); + sample_timeouts.push(YieldIdAfter::new(id, interval)); + } + } + } + maybe_request = self.inbox.recv() => { + let Some(request) = maybe_request else { + debug!(self.log, "inbox returned None, exiting"); + return; + }; + trace!( + self.log, + "received request on inbox"; + "request" => ?request, + ); + match request { + Request::AddTarget { + target, + details, + reply_tx, + } => { + match self.add_target(target, details) { + Ok(id) => { + let timeout = YieldIdAfter::new(id, details.interval); + sample_timeouts.push(timeout); + trace!( + self.log, + "added target with timeout"; + "id" => ?id, + "details" => ?details, + ); + match reply_tx.send(Ok(id)) { + Ok(_) => trace!(self.log, "sent reply"), + Err(e) => error!( + self.log, + "failed to send reply"; + "id" => ?id, + "error" => ?e, + ) + } + } + Err(e) => { + error!( + self.log, + "failed to add target"; + "error" => ?e, + ); + match reply_tx.send(Err(e)) { + Ok(_) => trace!(self.log, "sent reply"), + Err(e) => error!( + self.log, + "failed to send reply"; + "error" => ?e, + ) + } + } + } + } + Request::RemoveTarget { id, reply_tx } => { + self.targets.remove(&id); + if let Some(remaining_samples) = self.samples.lock().unwrap().remove(&id) { + if !remaining_samples.is_empty() { + warn!( + self.log, + "target removed with queued samples"; + "id" => ?id, + "n_samples" => remaining_samples.len(), + ); + } + } + match reply_tx.send(Ok(())) { + Ok(_) => trace!(self.log, "sent reply"), + Err(e) => error!( + self.log, + "failed to send reply"; + "error" => ?e, + ) + } + } + Request::TargetStatus { id, reply_tx } => { + trace!( + self.log, + "request for target status"; + "id" => ?id, + ); + let response = match self.targets.get(&id) { + None => Err(Error::NoSuchTarget), + Some(SampledObject::Kstat(k)) => { + Ok(TargetStatus::Ok { + last_collection: k.time_of_last_collection, + }) + } + Some(SampledObject::Expired(e)) => { + Ok(TargetStatus::Expired { + reason: e.reason, + error: e.error.to_string(), + expired_at: e.expired_at, + }) + } + }; + match reply_tx.send(response) { + Ok(_) => trace!(self.log, "sent reply"), + Err(e) => error!( + self.log, + "failed to send reply"; + "id" => ?id, + "error" => ?e, + ), + } + } + #[cfg(test)] + Request::CreationTimes { reply_tx } => { + debug!(self.log, "request for creation times"); + reply_tx.send(self.creation_times.clone()).unwrap(); + debug!(self.log, "sent reply for creation times"); + } + } + } + } + } + }
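
The queue-limiting arithmetic in `append_per_target_samples` below is easiest to follow with numbers; here is a worked sketch, assuming a per-target limit of 500:

// Suppose the queue already holds 490 samples and a collection adds 20.
let sample_limit = 500_usize;
let n_current_samples = 490_usize;
let n_new_samples = 20_usize;

// 510 total, so 10 samples overflow the limit. (The `checked_sub` plus
// `unwrap_or(0)` used below is equivalent to this saturating subtraction.)
let n_overflow_samples =
    (n_new_samples + n_current_samples).saturating_sub(sample_limit);
assert_eq!(n_overflow_samples, 10);

// And since 10 < 490, the 10 oldest samples already in the queue are
// drained before the 20 new ones are appended, leaving exactly 500.
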
+        let mut all_samples = self.samples.lock().unwrap();
+        let Some(current_samples) = all_samples.get_mut(&id) else {
+            error!(
+                self.log,
+                "per-target sample queue not found!";
+                "id" => ?id,
+            );
+            return None;
+        };
+        let n_new_samples = samples.len();
+        let n_current_samples = current_samples.len();
+        let n_total_samples = n_new_samples + n_current_samples;
+        let n_overflow_samples =
+            n_total_samples.checked_sub(self.sample_limit).unwrap_or(0);
+        if n_overflow_samples > 0 {
+            warn!(
+                self.log,
+                "sample queue is too full, dropping oldest samples";
+                "n_new_samples" => n_new_samples,
+                "n_current_samples" => n_current_samples,
+                "n_overflow_samples" => n_overflow_samples,
+            );
+            // It's possible that the number of new samples is big enough to
+            // overflow the current capacity, and also require removing new
+            // samples.
+            if n_overflow_samples < n_current_samples {
+                let _ = current_samples.drain(..n_overflow_samples);
+            } else {
+                // Clear all the current samples, and some of the new ones.
+                // The subtraction below cannot panic, because
+                // `n_overflow_samples` is computed above by adding
+                // `n_current_samples`.
+                current_samples.clear();
+                let _ =
+                    samples.drain(..(n_overflow_samples - n_current_samples));
+            }
+        }
+        current_samples.extend(samples);
+        Some(n_overflow_samples)
+    }
+
+    /// Add samples for one target to the internal queue.
+    ///
+    /// Note that this updates the kstat chain.
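+    ///
+    /// (The kstat chain is the kernel's global list of statistics; it must be
+    /// refreshed before reading, so that kstats added or removed since the
+    /// last read are visible.)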
+    fn sample_one(
+        &mut self,
+        id: TargetId,
+    ) -> Result<Option<Vec<Sample>>, Error> {
+        self.update_chain()?;
+        let ctl = self.ctl.as_ref().unwrap();
+        let Some(maybe_kstat) = self.targets.get_mut(&id) else {
+            return Ok(None);
+        };
+        let SampledObject::Kstat(sampled_kstat) = maybe_kstat else {
+            panic!("Should not be sampling an expired kstat");
+        };
+
+        // Fetch each interested kstat, and include the data and creation
+        // times for each of them.
+        let kstats = ctl
+            .iter()
+            .filter(|kstat| sampled_kstat.target.interested(kstat))
+            .map(|mut kstat| {
+                let data = ctl.read(&mut kstat)?;
+                let creation_time = Self::ensure_kstat_creation_time(
+                    &self.log,
+                    kstat,
+                    &mut self.creation_times,
+                )?;
+                Ok((creation_time, kstat, data))
+            })
+            .collect::<Result<Vec<_>, _>>();
+        match kstats {
+            Ok(k) if !k.is_empty() => {
+                cfg_if::cfg_if! {
+                    if #[cfg(test)] {
+                        sampled_kstat.time_of_last_collection = Some(Instant::now());
+                    } else {
+                        sampled_kstat.time_of_last_collection = Some(Utc::now());
+                    }
+                }
+                sampled_kstat.attempts_since_last_collection = 0;
+                sampled_kstat.target.to_samples(&k).map(Option::Some)
+            }
+            other => {
+                // Convert a list of zero interested kstats into an error.
+                let e = match other {
+                    Err(e) => e,
+                    Ok(k) if k.is_empty() => {
+                        trace!(
+                            self.log,
+                            "no matching samples for target, converting \
+                            to sampling error";
+                            "id" => ?id,
+                        );
+                        Error::NoSuchKstat
+                    }
+                    _ => unreachable!(),
+                };
+                sampled_kstat.attempts_since_last_collection += 1;
+
+                // Check if this kstat should be expired, based on the
+                // attempts we've previously made and the expiration policy.
+                match sampled_kstat.details.expiration {
+                    ExpirationBehavior::Never => {}
+                    ExpirationBehavior::Attempts(n_attempts) => {
+                        if sampled_kstat.attempts_since_last_collection
+                            >= n_attempts
+                        {
+                            cfg_if::cfg_if! {
+                                if #[cfg(test)] {
+                                    let expired_at = Instant::now();
+                                } else {
+                                    let expired_at = Utc::now();
+                                }
+                            }
+                            return Err(Error::Expired(Expiration {
+                                reason: ExpirationReason::Attempts(n_attempts),
+                                error: Box::new(e),
+                                expired_at,
+                            }));
+                        }
+                    }
+                    ExpirationBehavior::Duration(duration) => {
+                        // Use the time of the last collection, if one exists,
+                        // or the time we added the kstat if not.
+                        let start = sampled_kstat
+                            .time_of_last_collection
+                            .unwrap_or_else(|| sampled_kstat.time_added);
+                        let expire_at = start + duration;
+                        cfg_if::cfg_if! {
+                            if #[cfg(test)] {
+                                let now = Instant::now();
+                            } else {
+                                let now = Utc::now();
+                            }
+                        }
+                        if now >= expire_at {
+                            return Err(Error::Expired(Expiration {
+                                reason: ExpirationReason::Duration(duration),
+                                error: Box::new(e),
+                                expired_at: now,
+                            }));
+                        }
+                    }
+                }
+
+                // Do not expire the kstat, simply fail this collection.
+                Err(e)
+            }
+        }
+    }
+
+    async fn increment_dropped_sample_counter(
+        &mut self,
+        target_id: TargetId,
+        target_name: String,
+        n_overflow_samples: usize,
+    ) {
+        assert!(n_overflow_samples > 0);
+        if let Some(stats) = self.get_or_try_build_self_stats() {
+            // Get the entry for this target, or build a counter starting at
+            // 0. We'll always add the number of overflow samples afterwards.
+            let drops = stats
+                .drops
+                .entry((target_id, target_name.clone()))
+                .or_default();
+            *drops += n_overflow_samples as u64;
+            let metric = self_stats::SamplesDropped {
+                target_id: target_id.0,
+                target_name,
+                datum: *drops,
+            };
+            let sample = match Sample::new(&stats.target, &metric) {
+                Ok(s) => s,
+                Err(e) => {
+                    error!(
+                        self.log,
+                        "could not generate sample for dropped sample counter";
+                        "error" => ?e,
+                    );
+                    return;
+                }
+            };
+            match self.self_stat_queue.send(sample).await {
+                Ok(_) => trace!(self.log, "sent dropped sample counter stat"),
+                Err(e) => error!(
+                    self.log,
+                    "failed to send dropped sample counter to self stat queue";
+                    "error" => ?e,
+                ),
+            }
+        } else {
+            warn!(
+                self.log,
+                "cannot record dropped sample statistic, failed to get hostname"
+            );
+        }
+    }
+
+    async fn increment_expired_target_counter(&mut self) {
+        if let Some(stats) = self.get_or_try_build_self_stats() {
+            stats.expired.datum_mut().increment();
+            let sample = match Sample::new(&stats.target, &stats.expired) {
+                Ok(s) => s,
+                Err(e) => {
+                    error!(
+                        self.log,
+                        "could not generate sample for expired target counter";
+                        "error" => ?e,
+                    );
+                    return;
+                }
+            };
+            match self.self_stat_queue.send(sample).await {
+                Ok(_) => trace!(self.log, "sent expired target counter stat"),
+                Err(e) => error!(
+                    self.log,
+                    "failed to send target counter to self stat queue";
+                    "error" => ?e,
+                ),
+            }
+        } else {
+            warn!(
+                self.log,
+                "cannot record expiration statistic, failed to get hostname"
+            );
+        }
+    }
+
+    /// If we have an actual `SelfStats` struct, return it, or try to create
+    /// one. We'll still return `None` in that latter case, if we fail to
+    /// make one.
+    fn get_or_try_build_self_stats(
+        &mut self,
+    ) -> Option<&mut self_stats::SelfStats> {
+        if self.self_stats.is_none() {
+            self.self_stats = hostname().map(self_stats::SelfStats::new);
+        }
+        self.self_stats.as_mut()
+    }
+
+    /// Ensure that we have recorded the creation time for all interested
+    /// kstats for a new target.
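+    ///
+    /// Creation times are used as the start times for the cumulative counters
+    /// built from these kstats; see `ensure_kstat_creation_time`, which
+    /// converts the kernel's `ks_crtime` high-resolution timestamp into UTC.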
+    fn ensure_creation_times_for_target(
+        &mut self,
+        target: &dyn KstatTarget,
+    ) -> Result<(), Error> {
+        self.update_chain()?;
+        let ctl = self.ctl.as_ref().unwrap();
+        for kstat in ctl.iter().filter(|k| target.interested(k)) {
+            Self::ensure_kstat_creation_time(
+                &self.log,
+                kstat,
+                &mut self.creation_times,
+            )?;
+        }
+        Ok(())
+    }
+
+    /// Ensure that we store the creation time for the provided kstat.
+    fn ensure_kstat_creation_time(
+        log: &Logger,
+        kstat: Kstat<'_>,
+        creation_times: &mut BTreeMap<KstatPath, DateTime<Utc>>,
+    ) -> Result<DateTime<Utc>, Error> {
+        let path = KstatPath::from(kstat);
+        match creation_times.entry(path.clone()) {
+            Entry::Occupied(entry) => {
+                trace!(
+                    log,
+                    "creation time already exists for tracked target";
+                    "path" => ?path,
+                );
+                Ok(*entry.get())
+            }
+            Entry::Vacant(entry) => {
+                let creation_time = hrtime_to_utc(kstat.ks_crtime)?;
+                debug!(
+                    log,
+                    "storing new creation time for tracked target";
+                    "path" => ?path,
+                    "creation_time" => ?creation_time,
+                );
+                entry.insert(creation_time);
+                Ok(creation_time)
+            }
+        }
+    }
+
+    /// Prune the stored creation times, removing any that no longer have
+    /// corresponding kstats on the chain.
+    fn prune_creation_times(&mut self) -> Result<(), Error> {
+        if self.creation_times.is_empty() {
+            trace!(self.log, "no creation times to prune");
+            return Ok(());
+        }
+        // We'll create a list of all the creation times to prune, by
+        // progressively removing any _extant_ kstats from the set of keys we
+        // currently have. If something is _not_ on the chain, it'll remain in
+        // this map at the end of the loop below, and thus we know we need to
+        // remove it.
+        let mut to_remove: BTreeSet<_> =
+            self.creation_times.keys().cloned().collect();
+
+        // Iterate the chain, and remove any current keys that do _not_ appear
+        // on the chain.
+        self.update_chain()?;
+        let ctl = self.ctl.as_ref().unwrap();
+        for kstat in ctl.iter() {
+            let path = KstatPath::from(kstat);
+            let _ = to_remove.remove(&path);
+        }
+
+        if to_remove.is_empty() {
+            trace!(self.log, "kstat creation times are already pruned");
+        } else {
+            debug!(
+                self.log,
+                "pruning creation times for kstats that are gone";
+                "to_remove" => ?to_remove,
+                "n_to_remove" => to_remove.len(),
+            );
+            self.creation_times.retain(|key, _value| !to_remove.contains(key));
+        }
+        Ok(())
+    }
+
+    /// Start tracking a single KstatTarget object.
+    fn add_target(
+        &mut self,
+        target: Box<dyn KstatTarget>,
+        details: CollectionDetails,
+    ) -> Result<TargetId, Error> {
+        let id = hash_target(&*target);
+        match self.targets.get(&id) {
+            Some(SampledObject::Kstat(_)) => {
+                return Err(Error::DuplicateTarget {
+                    target_name: target.name().to_string(),
+                    fields: target
+                        .field_names()
+                        .iter()
+                        .map(ToString::to_string)
+                        .zip(target.field_values())
+                        .collect(),
+                });
+            }
+            Some(SampledObject::Expired(e)) => {
+                warn!(
+                    self.log,
+                    "replacing expired kstat target";
+                    "id" => ?id,
+                    "expiration_reason" => ?e.reason,
+                    "error" => ?e.error,
+                    "expired_at" => ?e.expired_at,
+                );
+            }
+            None => {}
+        }
+        self.ensure_creation_times_for_target(&*target)?;
+
+        cfg_if::cfg_if! {
+            if #[cfg(test)] {
+                let time_added = Instant::now();
+            } else {
+                let time_added = Utc::now();
+            }
+        }
+        let item = SampledKstat {
+            target,
+            details,
+            time_added,
+            time_of_last_collection: None,
+            attempts_since_last_collection: 0,
+        };
+        let _ = self.targets.insert(id, SampledObject::Kstat(item));
+
+        // Add to the per-target queues, making sure to keep any samples that
+        // were already there previously. This would be a bit odd, since it
+        // means that the target expired, but we hadn't been polled by
+        // oximeter. Nonetheless, keep these samples.
+        let n_samples =
+            self.samples.lock().unwrap().entry(id).or_default().len();
+        match n_samples {
+            0 => debug!(
+                self.log,
+                "inserted empty per-target sample queue";
+                "id" => ?id,
+            ),
+            n => debug!(
+                self.log,
+                "per-target queue appears to have old samples";
+                "id" => ?id,
+                "n_samples" => n,
+            ),
+        }
+        Ok(id)
+    }
+
+    fn update_chain(&mut self) -> Result<(), Error> {
+        let new_ctl = match self.ctl.take() {
+            None => Ctl::new(),
+            Some(old) => old.update(),
+        }
+        .map_err(Error::Kstat)?;
+        let _ = self.ctl.insert(new_ctl);
+        Ok(())
+    }
+}
+
+/// A type for reporting kernel statistics as oximeter samples.
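+///
+/// A rough usage sketch (`MyTarget` stands in for some `KstatTarget`
+/// implementation, and the registry setup is elided):
+///
+/// ```ignore
+/// let sampler = KstatSampler::new(&log)?;
+/// registry.register_producer(sampler.clone())?;
+/// let details = CollectionDetails::never(Duration::from_secs(10));
+/// let id = sampler.add_target(MyTarget::default(), details).await?;
+/// ```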
+#[derive(Clone, Debug)]
+pub struct KstatSampler {
+    samples: Arc<Mutex<BTreeMap<TargetId, Vec<Sample>>>>,
+    outbox: mpsc::Sender<Request>,
+    self_stat_rx: Arc<Mutex<mpsc::Receiver<Sample>>>,
+    _worker_task: Arc<tokio::task::JoinHandle<()>>,
+    #[cfg(test)]
+    sample_count_rx: Arc<Mutex<mpsc::UnboundedReceiver<SampleCounts>>>,
+}
+
+impl KstatSampler {
+    /// The maximum number of samples allowed in the internal buffer, before
+    /// oldest samples are dropped.
+    ///
+    /// This is to avoid unbounded allocations in situations where data is
+    /// produced faster than it is collected.
+    ///
+    /// Note that this is a _per-target_ sample limit!
+    pub const DEFAULT_SAMPLE_LIMIT: usize = 500;
+
+    /// Create a new sampler.
+    pub fn new(log: &Logger) -> Result<Self, Error> {
+        Self::with_sample_limit(log, Self::DEFAULT_SAMPLE_LIMIT)
+    }
+
+    /// Create a new sampler with a sample limit.
+    pub fn with_sample_limit(
+        log: &Logger,
+        limit: usize,
+    ) -> Result<Self, Error> {
+        let samples = Arc::new(Mutex::new(BTreeMap::new()));
+        let (self_stat_tx, self_stat_rx) = mpsc::channel(4096);
+        let (outbox, inbox) = mpsc::channel(1);
+        let worker = KstatSamplerWorker::new(
+            log.new(o!("component" => "kstat-sampler-worker")),
+            inbox,
+            self_stat_tx,
+            samples.clone(),
+            limit,
+        )?;
+        #[cfg(test)]
+        let (sample_count_rx, _worker_task) = {
+            let (sample_count_tx, sample_count_rx) = mpsc::unbounded_channel();
+            (
+                Arc::new(Mutex::new(sample_count_rx)),
+                Arc::new(tokio::task::spawn(worker.run(sample_count_tx))),
+            )
+        };
+        #[cfg(not(test))]
+        let _worker_task = Arc::new(tokio::task::spawn(worker.run()));
+        Ok(Self {
+            samples,
+            outbox,
+            self_stat_rx: Arc::new(Mutex::new(self_stat_rx)),
+            _worker_task,
+            #[cfg(test)]
+            sample_count_rx,
+        })
+    }
+
+    /// Add a target, which can be used to produce zero or more samples.
+    ///
+    /// Note that adding a target which has previously expired is _not_ an
+    /// error, and instead replaces the expired target.
+    pub async fn add_target(
+        &self,
+        target: impl KstatTarget,
+        details: CollectionDetails,
+    ) -> Result<TargetId, Error> {
+        let (reply_tx, reply_rx) = oneshot::channel();
+        let request =
+            Request::AddTarget { target: Box::new(target), details, reply_tx };
+        self.outbox.send(request).await.map_err(|_| Error::SendError)?;
+        reply_rx.await.map_err(|_| Error::RecvError)?
+    }
+
+    /// Remove a tracked target.
+    pub async fn remove_target(&self, id: TargetId) -> Result<(), Error> {
+        let (reply_tx, reply_rx) = oneshot::channel();
+        let request = Request::RemoveTarget { id, reply_tx };
+        self.outbox.send(request).await.map_err(|_| Error::SendError)?;
+        reply_rx.await.map_err(|_| Error::RecvError)?
+    }
+
+    /// Fetch the status for a target.
+    ///
+    /// If the target is being collected normally, then `TargetStatus::Ok` is
+    /// returned, which contains the time of the last collection, if any.
+    ///
+    /// If the target exists, but has been expired, then the details about the
+    /// expiration are returned in `TargetStatus::Expired`.
+    ///
+    /// If the target doesn't exist at all, then an error is returned.
+    pub async fn target_status(
+        &self,
+        id: TargetId,
+    ) -> Result<TargetStatus, Error> {
+        let (reply_tx, reply_rx) = oneshot::channel();
+        let request = Request::TargetStatus { id, reply_tx };
+        self.outbox.send(request).await.map_err(|_| Error::SendError)?;
+        reply_rx.await.map_err(|_| Error::RecvError)?
+    }
+
+    /// Return the number of samples pushed by the sampling task, if any.
+    #[cfg(test)]
+    pub(crate) fn sample_counts(&self) -> Option<SampleCounts> {
+        match self.sample_count_rx.lock().unwrap().try_recv() {
+            Ok(c) => Some(c),
+            Err(mpsc::error::TryRecvError::Empty) => None,
+            _ => panic!("sample_tx disconnected"),
+        }
+    }
+
+    /// Return the creation times for all tracked kstats.
+    #[cfg(test)]
+    pub(crate) async fn creation_times(
+        &self,
+    ) -> BTreeMap<KstatPath, DateTime<Utc>> {
+        let (reply_tx, reply_rx) = oneshot::channel();
+        let request = Request::CreationTimes { reply_tx };
+        self.outbox.send(request).await.map_err(|_| Error::SendError).unwrap();
+        reply_rx.await.map_err(|_| Error::RecvError).unwrap()
+    }
+}
+
+impl oximeter::Producer for KstatSampler {
+    fn produce(
+        &mut self,
+    ) -> Result<Box<(dyn Iterator<Item = Sample> + 'static)>, MetricsError> {
+        // Swap the _entries_ of all the existing per-target sample queues,
+        // but we need to leave empty queues in their place. I.e., we can't
+        // remove keys.
+        let mut samples = Vec::new();
+        for (_id, queue) in self.samples.lock().unwrap().iter_mut() {
+            samples.append(queue);
+        }
+
+        // Append any self-stat samples as well.
+        let mut rx = self.self_stat_rx.lock().unwrap();
+        loop {
+            match rx.try_recv() {
+                Ok(sample) => samples.push(sample),
+                Err(mpsc::error::TryRecvError::Empty) => break,
+                Err(mpsc::error::TryRecvError::Disconnected) => {
+                    panic!("kstat sampler self-stat queue tx disconnected");
+                }
+            }
+        }
+        drop(rx);
+
+        Ok(Box::new(samples.into_iter()))
+    }
+}
diff --git a/oximeter/instruments/src/lib.rs b/oximeter/instruments/src/lib.rs
index d5c53fd05c0..d003e717395 100644
--- a/oximeter/instruments/src/lib.rs
+++ b/oximeter/instruments/src/lib.rs
@@ -4,7 +4,10 @@
 //! General-purpose types for instrumenting code to produce oximeter metrics.
-// Copyright 2021 Oxide Computer Company
+// Copyright 2023 Oxide Computer Company
 
 #[cfg(feature = "http-instruments")]
 pub mod http;
+
+#[cfg(all(feature = "kstat", target_os = "illumos"))]
+pub mod kstat;
diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml
index 3889be5eff2..06cc8092cea 100644
--- a/sled-agent/Cargo.toml
+++ b/sled-agent/Cargo.toml
@@ -45,6 +45,7 @@ nexus-client.workspace = true
 omicron-common.workspace = true
 once_cell.workspace = true
 oximeter.workspace = true
+oximeter-instruments.workspace = true
 oximeter-producer.workspace = true
 percent-encoding.workspace = true
 progenitor.workspace = true
diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs
index 92149b6475c..ab107f9a63e 100644
--- a/sled-agent/src/http_entrypoints.rs
+++ b/sled-agent/src/http_entrypoints.rs
@@ -31,6 +31,9 @@ use omicron_common::api::internal::nexus::{
     DiskRuntimeState, SledInstanceState, UpdateArtifactId,
 };
 use omicron_common::api::internal::shared::SwitchPorts;
+use oximeter::types::ProducerResults;
+use oximeter_producer::collect;
+use oximeter_producer::ProducerIdPathParams;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
@@ -70,6 +73,7 @@ pub fn api() -> SledApiDescription {
         api.register(read_network_bootstore_config_cache)?;
         api.register(write_network_bootstore_config)?;
         api.register(add_sled_to_initialized_rack)?;
+        api.register(metrics_collect)?;
 
         Ok(())
     }
@@ -749,3 +753,17 @@ async fn add_sled_to_initialized_rack(
     })?;
     Ok(HttpResponseUpdatedNoContent())
 }
+
+/// Collect oximeter samples from the sled agent.
+#[endpoint {
+    method = GET,
+    path = "/metrics/collect/{producer_id}",
+}]
+async fn metrics_collect(
+    request_context: RequestContext<SledAgent>,
+    path_params: Path<ProducerIdPathParams>,
+) -> Result<HttpResponseOk<ProducerResults>, HttpError> {
+    let sa = request_context.context();
+    let producer_id = path_params.into_inner().producer_id;
+    collect(&sa.metrics_registry(), producer_id).await
+}
diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs
index 4e7921c605d..db89b17b5a4 100644
--- a/sled-agent/src/lib.rs
+++ b/sled-agent/src/lib.rs
@@ -23,6 +23,7 @@ pub mod config;
 mod http_entrypoints;
 mod instance;
 mod instance_manager;
+mod metrics;
 mod nexus;
 pub mod params;
 mod profile;
diff --git a/sled-agent/src/metrics.rs b/sled-agent/src/metrics.rs
new file mode 100644
index 00000000000..6c3383c88f0
--- /dev/null
+++ b/sled-agent/src/metrics.rs
@@ -0,0 +1,260 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Metrics produced by the sled-agent for collection by oximeter.
+
+use oximeter::types::MetricsError;
+use oximeter::types::ProducerRegistry;
+use sled_hardware::Baseboard;
+use slog::Logger;
+use std::time::Duration;
+use uuid::Uuid;
+
+cfg_if::cfg_if! {
+    if #[cfg(target_os = "illumos")] {
+        use oximeter_instruments::kstat::link;
+        use oximeter_instruments::kstat::CollectionDetails;
+        use oximeter_instruments::kstat::Error as KstatError;
+        use oximeter_instruments::kstat::KstatSampler;
+        use oximeter_instruments::kstat::TargetId;
+        use std::collections::BTreeMap;
+    } else {
+        use anyhow::anyhow;
+    }
+}
+
+/// The interval on which we ask `oximeter` to poll us for metric data.
+pub(crate) const METRIC_COLLECTION_INTERVAL: Duration = Duration::from_secs(30);
+
+/// The interval on which we sample link metrics.
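+///
+/// This is shorter than `METRIC_COLLECTION_INTERVAL` above, so several link
+/// samples will normally accumulate in the sampler between successive
+/// oximeter collections.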
+pub(crate) const LINK_SAMPLE_INTERVAL: Duration = Duration::from_secs(10);
+
+/// An error during sled-agent metric production.
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    #[cfg(target_os = "illumos")]
+    #[error("Kstat-based metric failure")]
+    Kstat(#[source] KstatError),
+
+    #[cfg(not(target_os = "illumos"))]
+    #[error("Kstat-based metric failure")]
+    Kstat(#[source] anyhow::Error),
+
+    #[error("Failed to insert metric producer into registry")]
+    Registry(#[source] MetricsError),
+
+    #[error("Failed to fetch hostname")]
+    Hostname(#[source] std::io::Error),
+}
+
+/// Type managing all oximeter metrics produced by the sled-agent.
+//
+// TODO-completeness: We probably want to get kstats or other metrics into
+// this type from other parts of the code, possibly before the `SledAgent`
+// itself exists. This is similar to the storage resources or other objects,
+// most of which are essentially an `Arc<Mutex<_>>`. It would be nice to avoid
+// that pattern, but until we have more statistics, it's not clear whether
+// that's worth it right now.
+#[derive(Clone, Debug)]
+// NOTE: The ID fields aren't used on non-illumos systems, so we allow dead
+// code rather than renaming fields that are not yet used.
+#[cfg_attr(not(target_os = "illumos"), allow(dead_code))]
+pub struct MetricsManager {
+    sled_id: Uuid,
+    rack_id: Uuid,
+    baseboard: Baseboard,
+    hostname: Option<String>,
+    _log: Logger,
+    #[cfg(target_os = "illumos")]
+    kstat_sampler: KstatSampler,
+    // TODO-scalability: We may want to generalize this to store any kind of
+    // tracked target, and use a naming scheme that allows us to pick out
+    // which target we're interested in from the arguments.
+    //
+    // For example, we can use the link name to do this, for any physical or
+    // virtual link, because they need to be unique. We could also do the same
+    // for disks or memory. If we wanted to guarantee uniqueness, we could
+    // namespace them internally, e.g., `"datalink:{link_name}"` would be the
+    // real key.
+    #[cfg(target_os = "illumos")]
+    tracked_links: BTreeMap<String, TargetId>,
+    registry: ProducerRegistry,
+}
+
+impl MetricsManager {
+    /// Construct a new metrics manager.
+    ///
+    /// This takes a few key pieces of identifying information that are used
+    /// when reporting sled-specific metrics.
+    pub fn new(
+        sled_id: Uuid,
+        rack_id: Uuid,
+        baseboard: Baseboard,
+        log: Logger,
+    ) -> Result<Self, Error> {
+        let registry = ProducerRegistry::with_id(sled_id);
+
+        cfg_if::cfg_if! {
+            if #[cfg(target_os = "illumos")] {
+                let kstat_sampler = KstatSampler::new(&log).map_err(Error::Kstat)?;
+                registry
+                    .register_producer(kstat_sampler.clone())
+                    .map_err(Error::Registry)?;
+                let tracked_links = BTreeMap::new();
+            }
+        }
+        Ok(Self {
+            sled_id,
+            rack_id,
+            baseboard,
+            hostname: None,
+            _log: log,
+            #[cfg(target_os = "illumos")]
+            kstat_sampler,
+            #[cfg(target_os = "illumos")]
+            tracked_links,
+            registry,
+        })
+    }
+
+    /// Return a reference to the contained producer registry.
+    pub fn registry(&self) -> &ProducerRegistry {
+        &self.registry
+    }
+}
+
+#[cfg(target_os = "illumos")]
+impl MetricsManager {
+    /// Track metrics for a physical datalink.
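+    ///
+    /// A minimal usage sketch (the link name here is hypothetical):
+    ///
+    /// ```ignore
+    /// let mut manager = MetricsManager::new(sled_id, rack_id, baseboard, log)?;
+    /// manager.track_physical_link("cxgbe0", LINK_SAMPLE_INTERVAL).await?;
+    /// ```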
+    pub async fn track_physical_link(
+        &mut self,
+        link_name: impl AsRef<str>,
+        interval: Duration,
+    ) -> Result<(), Error> {
+        let hostname = self.hostname().await?;
+        let link = link::PhysicalDataLink {
+            rack_id: self.rack_id,
+            sled_id: self.sled_id,
+            serial: self.serial_number(),
+            hostname,
+            link_name: link_name.as_ref().to_string(),
+        };
+        let details = CollectionDetails::never(interval);
+        let id = self
+            .kstat_sampler
+            .add_target(link, details)
+            .await
+            .map_err(Error::Kstat)?;
+        self.tracked_links.insert(link_name.as_ref().to_string(), id);
+        Ok(())
+    }
+
+    /// Stop tracking metrics for a datalink.
+    ///
+    /// This works for both physical and virtual links.
+    #[allow(dead_code)]
+    pub async fn stop_tracking_link(
+        &mut self,
+        link_name: impl AsRef<str>,
+    ) -> Result<(), Error> {
+        if let Some(id) = self.tracked_links.remove(link_name.as_ref()) {
+            self.kstat_sampler.remove_target(id).await.map_err(Error::Kstat)
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Track metrics for a virtual datalink.
+    #[allow(dead_code)]
+    pub async fn track_virtual_link(
+        &self,
+        link_name: impl AsRef<str>,
+        hostname: impl AsRef<str>,
+        interval: Duration,
+    ) -> Result<(), Error> {
+        let link = link::VirtualDataLink {
+            rack_id: self.rack_id,
+            sled_id: self.sled_id,
+            serial: self.serial_number(),
+            hostname: hostname.as_ref().to_string(),
+            link_name: link_name.as_ref().to_string(),
+        };
+        let details = CollectionDetails::never(interval);
+        self.kstat_sampler
+            .add_target(link, details)
+            .await
+            .map(|_| ())
+            .map_err(Error::Kstat)
+    }
+
+    // Return the serial number out of the baseboard, if one exists.
+    fn serial_number(&self) -> String {
+        match &self.baseboard {
+            Baseboard::Gimlet { identifier, .. } => identifier.clone(),
+            Baseboard::Unknown => String::from("unknown"),
+            Baseboard::Pc { identifier, .. } => identifier.clone(),
+        }
+    }
+
+    // Return the system's hostname.
+    //
+    // If we've failed to get it previously, we try again. If _that_ fails,
+    // return an error.
+    //
+    // TODO-cleanup: This will become much simpler once
+    // `OnceCell::get_or_try_init` is stabilized.
+    async fn hostname(&mut self) -> Result<String, Error> {
+        if let Some(hn) = &self.hostname {
+            return Ok(hn.clone());
+        }
+        let hn = tokio::process::Command::new("hostname")
+            .env_clear()
+            .output()
+            .await
+            .map(|out| String::from_utf8_lossy(&out.stdout).trim().to_string())
+            .map_err(Error::Hostname)?;
+        self.hostname.replace(hn.clone());
+        Ok(hn)
+    }
+}
+
+#[cfg(not(target_os = "illumos"))]
+impl MetricsManager {
+    /// Track metrics for a physical datalink.
+    pub async fn track_physical_link(
+        &mut self,
+        _link_name: impl AsRef<str>,
+        _interval: Duration,
+    ) -> Result<(), Error> {
+        Err(Error::Kstat(anyhow!(
+            "kstat metrics are not supported on this platform"
+        )))
+    }
+
+    /// Stop tracking metrics for a datalink.
+    ///
+    /// This works for both physical and virtual links.
+    #[allow(dead_code)]
+    pub async fn stop_tracking_link(
+        &mut self,
+        _link_name: impl AsRef<str>,
+    ) -> Result<(), Error> {
+        Err(Error::Kstat(anyhow!(
+            "kstat metrics are not supported on this platform"
+        )))
+    }
+
+    /// Track metrics for a virtual datalink.
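+    ///
+    /// On non-illumos platforms this is a stub, which always returns a
+    /// `Kstat` error.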
+    #[allow(dead_code)]
+    pub async fn track_virtual_link(
+        &self,
+        _link_name: impl AsRef<str>,
+        _hostname: impl AsRef<str>,
+        _interval: Duration,
+    ) -> Result<(), Error> {
+        Err(Error::Kstat(anyhow!(
+            "kstat metrics are not supported on this platform"
+        )))
+    }
+}
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index 08569c6529c..b8852e2bbad 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -11,6 +11,7 @@ use crate::bootstrap::early_networking::{
 use crate::bootstrap::params::StartSledAgentRequest;
 use crate::config::Config;
 use crate::instance_manager::InstanceManager;
+use crate::metrics::MetricsManager;
 use crate::nexus::{NexusClientWithResolver, NexusRequestQueue};
 use crate::params::{
     DiskStateRequested, InstanceHardware, InstanceMigrationSourceParams,
@@ -40,6 +41,7 @@ use omicron_common::address::{
     get_sled_address, get_switch_zone_address, Ipv6Subnet, SLED_PREFIX,
 };
 use omicron_common::api::external::Vni;
+use omicron_common::api::internal::nexus::ProducerEndpoint;
 use omicron_common::api::internal::nexus::{
     SledInstanceState, VmmRuntimeState,
 };
@@ -51,11 +53,13 @@ use omicron_common::api::{
     internal::nexus::UpdateArtifactId,
 };
 use omicron_common::backoff::{
-    retry_notify, retry_notify_ext, retry_policy_internal_service_aggressive,
-    BackoffError,
+    retry_notify, retry_notify_ext, retry_policy_internal_service,
+    retry_policy_internal_service_aggressive, BackoffError,
 };
+use oximeter::types::ProducerRegistry;
+use sled_hardware::underlay;
 use sled_hardware::HardwareManager;
-use sled_hardware::{underlay, underlay::BootstrapInterface, Baseboard};
+use sled_hardware::{underlay::BootstrapInterface, Baseboard};
 use slog::Logger;
 use std::collections::BTreeMap;
 use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6};
@@ -134,6 +138,9 @@ pub enum Error {
 
     #[error("Zone bundle error: {0}")]
     ZoneBundle(#[from] BundleError),
+
+    #[error("Metrics error: {0}")]
+    Metrics(#[from] crate::metrics::Error),
 }
 
 impl From<Error> for omicron_common::api::external::Error {
@@ -251,6 +258,9 @@ struct SledAgentInner {
 
     // A handle to the bootstore.
     bootstore: bootstore::NodeHandle,
+
+    // Object handling production of metrics for oximeter.
+    metrics_manager: MetricsManager,
 }
 
 impl SledAgentInner {
@@ -452,6 +462,46 @@ impl SledAgent {
             rack_network_config.clone(),
         )?;
 
+        let mut metrics_manager = MetricsManager::new(
+            request.body.id,
+            request.body.rack_id,
+            hardware.baseboard(),
+            log.new(o!("component" => "MetricsManager")),
+        )?;
+
+        // Start tracking the underlay physical links.
+        for nic in underlay::find_nics(&config.data_links)? {
+            let link_name = nic.interface();
+            if let Err(e) = metrics_manager
+                .track_physical_link(
+                    link_name,
+                    crate::metrics::LINK_SAMPLE_INTERVAL,
+                )
+                .await
+            {
+                error!(
+                    log,
+                    "failed to start tracking physical link metrics";
+                    "link_name" => link_name,
+                    "error" => ?e,
+                );
+            }
+        }
+
+        // Spawn a task in the background to register our metric producer
+        // with Nexus. This should not block progress here.
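+        //
+        // Registration is retried indefinitely on failure (see
+        // `register_metric_producer_with_nexus` below), so a Nexus outage
+        // delays metric collection rather than failing sled-agent startup.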
+        let endpoint = ProducerEndpoint {
+            id: request.body.id,
+            address: sled_address.into(),
+            base_route: String::from("/metrics/collect"),
+            interval: crate::metrics::METRIC_COLLECTION_INTERVAL,
+        };
+        tokio::task::spawn(register_metric_producer_with_nexus(
+            log.clone(),
+            nexus_client.clone(),
+            endpoint,
+        ));
+
         let zone_bundler = storage.zone_bundler().clone();
         let sled_agent = SledAgent {
             inner: Arc::new(SledAgentInner {
@@ -476,6 +526,7 @@ impl SledAgent {
                 rack_network_config,
                 zone_bundler,
                 bootstore: bootstore.clone(),
+                metrics_manager,
             }),
             log: log.clone(),
         };
@@ -955,6 +1006,38 @@ impl SledAgent {
     pub fn bootstore(&self) -> bootstore::NodeHandle {
         self.inner.bootstore.clone()
     }
+
+    /// Return the metric producer registry.
+    pub fn metrics_registry(&self) -> &ProducerRegistry {
+        self.inner.metrics_manager.registry()
+    }
+}
+
+async fn register_metric_producer_with_nexus(
+    log: Logger,
+    client: NexusClientWithResolver,
+    endpoint: ProducerEndpoint,
+) {
+    let endpoint = nexus_client::types::ProducerEndpoint::from(&endpoint);
+    let register_with_nexus = || async {
+        client.client().cpapi_producers_post(&endpoint).await.map_err(|e| {
+            BackoffError::transient(format!("Metric registration error: {e}"))
+        })
+    };
+    retry_notify(
+        retry_policy_internal_service(),
+        register_with_nexus,
+        |error, delay| {
+            warn!(
+                log,
+                "failed to register as a metric producer with Nexus";
+                "error" => ?error,
+                "retry_after" => ?delay,
+            );
+        },
+    )
+    .await
+    .expect("Expected an infinite retry loop registering with Nexus");
 }
 
 #[derive(From, thiserror::Error, Debug)]
@@ -976,7 +1059,7 @@ pub enum AddSledError {
     },
 }
 
-/// Add a sled to an
+/// Add a sled to an initialized rack.
 pub async fn add_sled_to_initialized_rack(
     log: Logger,
     sled_id: Baseboard,