Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add background task to collect Cockroach zone node IDs #5857

Merged
merged 20 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ members = [
"bootstore",
"certificates",
"clients/bootstrap-agent-client",
"clients/cockroach-admin-client",
"clients/ddm-admin-client",
"clients/dns-service-client",
"clients/dpd-client",
Expand Down Expand Up @@ -89,6 +90,7 @@ default-members = [
"bootstore",
"certificates",
"clients/bootstrap-agent-client",
"clients/cockroach-admin-client",
"clients/ddm-admin-client",
"clients/dns-service-client",
"clients/dpd-client",
Expand Down Expand Up @@ -240,6 +242,7 @@ ciborium = "0.2.2"
cfg-if = "1.0"
chrono = { version = "0.4", features = [ "serde" ] }
clap = { version = "4.5", features = ["cargo", "derive", "env", "wrap_help"] }
cockroach-admin-client = { path = "clients/cockroach-admin-client" }
colored = "2.1"
const_format = "0.2.32"
cookie = "0.18"
Expand Down
18 changes: 18 additions & 0 deletions clients/cockroach-admin-client/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[package]
name = "cockroach-admin-client"
version = "0.1.0"
edition = "2021"
license = "MPL-2.0"

[lints]
workspace = true

[dependencies]
chrono.workspace = true
omicron-uuid-kinds.workspace = true
progenitor.workspace = true
reqwest = { workspace = true, features = [ "json", "rustls-tls", "stream" ] }
schemars.workspace = true
serde.workspace = true
slog.workspace = true
omicron-workspace-hack.workspace = true
24 changes: 24 additions & 0 deletions clients/cockroach-admin-client/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Interface for making API requests to a Bootstrap Agent
jgallagher marked this conversation as resolved.
Show resolved Hide resolved

progenitor::generate_api!(
spec = "../../openapi/cockroach-admin.json",
inner_type = slog::Logger,
pre_hook = (|log: &slog::Logger, request: &reqwest::Request| {
slog::debug!(log, "client request";
"method" => %request.method(),
"uri" => %request.url(),
"body" => ?&request.body(),
);
}),
post_hook = (|log: &slog::Logger, result: &Result<_, _>| {
slog::debug!(log, "client response"; "result" => ?result);
}),
derives = [schemars::JsonSchema],
replace = {
TypedUuidForOmicronZoneKind = omicron_uuid_kinds::OmicronZoneUuid,
}
);
8 changes: 8 additions & 0 deletions cockroach-admin/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ dropshot.workspace = true
http.workspace = true
illumos-utils.workspace = true
omicron-common.workspace = true
omicron-uuid-kinds.workspace = true
once_cell.workspace = true
# See omicron-rpaths for more about the "pq-sys" dependency.
pq-sys = "*"
schemars.workspace = true
Expand All @@ -27,13 +29,19 @@ slog-error-chain.workspace = true
serde.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-postgres.workspace = true
toml.workspace = true

omicron-workspace-hack.workspace = true

[dev-dependencies]
expectorate.workspace = true
nexus-test-utils.workspace = true
omicron-test-utils.workspace = true
openapi-lint.workspace = true
openapiv3.workspace = true
serde_json.workspace = true
subprocess.workspace = true
url.workspace = true

[lints]
Expand Down
17 changes: 13 additions & 4 deletions cockroach-admin/src/bin/cockroach-admin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use omicron_cockroach_admin::CockroachCli;
use omicron_cockroach_admin::Config;
use omicron_common::cmd::fatal;
use omicron_common::cmd::CmdError;
use omicron_uuid_kinds::OmicronZoneUuid;
use std::net::SocketAddr;
use std::net::SocketAddrV6;

Expand All @@ -38,6 +39,10 @@ enum Args {
/// Path to the server config file
#[clap(long, action)]
config_file_path: Utf8PathBuf,

/// ID of the zone within which we're running
#[clap(long, action)]
zone_id: OmicronZoneUuid,
},
}

Expand All @@ -59,16 +64,20 @@ async fn main_impl() -> Result<(), CmdError> {
cockroach_address,
http_address,
config_file_path,
zone_id,
} => {
let cockroach_cli =
CockroachCli::new(path_to_cockroach_binary, cockroach_address);
let mut config = Config::from_file(&config_file_path)
.map_err(|err| CmdError::Failure(anyhow!(err)))?;
config.dropshot.bind_address = SocketAddr::V6(http_address);
let server =
omicron_cockroach_admin::start_server(cockroach_cli, config)
.await
.map_err(|err| CmdError::Failure(anyhow!(err)))?;
let server = omicron_cockroach_admin::start_server(
zone_id,
cockroach_cli,
config,
)
.await
.map_err(|err| CmdError::Failure(anyhow!(err)))?;
server.await.map_err(|err| {
CmdError::Failure(anyhow!(
"server failed after starting: {err}"
Expand Down
4 changes: 4 additions & 0 deletions cockroach-admin/src/cockroach_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ impl CockroachCli {
Self { path_to_cockroach_binary, cockroach_address }
}

pub fn cockroach_address(&self) -> SocketAddrV6 {
self.cockroach_address
}

pub async fn node_status(
&self,
) -> Result<Vec<NodeStatus>, CockroachCliError> {
Expand Down
138 changes: 137 additions & 1 deletion cockroach-admin/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,143 @@
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use crate::CockroachCli;
use anyhow::Context;
use dropshot::HttpError;
use omicron_uuid_kinds::OmicronZoneUuid;
use slog::Logger;
use slog_error_chain::InlineErrorChain;
use tokio::sync::OnceCell;

pub struct ServerContext {
pub cockroach_cli: CockroachCli,
zone_id: OmicronZoneUuid,
cockroach_cli: CockroachCli,
// Cockroach node IDs never change; we defer contacting our local node to
// ask for its ID until we need to, but once we have it we don't need to ask
// again.
node_id: OnceCell<String>,
log: Logger,
}

impl ServerContext {
pub fn new(
zone_id: OmicronZoneUuid,
cockroach_cli: CockroachCli,
log: Logger,
) -> Self {
Self { zone_id, cockroach_cli, node_id: OnceCell::new(), log }
}

pub fn cockroach_cli(&self) -> &CockroachCli {
&self.cockroach_cli
}

pub fn zone_id(&self) -> OmicronZoneUuid {
self.zone_id
}

pub async fn node_id(&self) -> Result<&str, HttpError> {
match self
.node_id
.get_or_try_init(|| self.read_node_id_from_cockroach())
.await
{
Ok(id) => Ok(id.as_str()),
Err(err) => {
let message = format!(
"failed to read node ID from local cockroach instance: \
{err:#}",
);
Err(HttpError {
status_code: http::StatusCode::SERVICE_UNAVAILABLE,
error_code: None,
external_message: message.clone(),
internal_message: message,
})
}
}
}

async fn read_node_id_from_cockroach(&self) -> anyhow::Result<String> {
// TODO-cleanup This connection string is duplicated in Nexus - maybe we
// should centralize it? I'm not sure where we could put it;
// omicron_common, perhaps?
let connect_url = format!(
"postgresql://root@{}/omicron?sslmode=disable",
self.cockroach_cli.cockroach_address()
);
let (client, connection) =
tokio_postgres::connect(&connect_url, tokio_postgres::NoTls)
.await
.with_context(|| {
format!("failed to connect to {connect_url}")
})?;

let log = self.log.clone();
tokio::spawn(async move {
if let Err(e) = connection.await {
slog::warn!(
log, "connection error reading node ID";
"err" => InlineErrorChain::new(&e),
);
}
});

// This uses an undocumented internal function - not awesome, but we're
// told this is "unlikely to change for some time".
// https://github.com/cockroachdb/cockroach/issues/124988 requests that
// this be documented (or an alternative be documented / provided).
let row = client
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm slightly worried about this command being routed to another node, which can happen in some cases when a request is forwarded to the raft leader for the quorum containing the data.

While it would be super weird for that to happen in this case, I worry about accidental breakage. For extra assurance, maybe we should assert that this node id matches the current address we are contacting. I'm not sure if we can do that outside the status cli though.

Am I being too paranoid here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if you're being too paranoid, but I think this is easy enough to check for. I took a stab at it in 70c7558

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! This looks excellent.

.query_one("SELECT crdb_internal.node_id()::TEXT", &[])
.await
.context("failed to send node ID query")?;

row.try_get(0).context("failed to read results of node ID query")
}
}

#[cfg(test)]
mod tests {
use super::*;
use nexus_test_utils::db::test_setup_database;
use omicron_test_utils::dev;
use std::net::SocketAddrV6;
use url::Url;

#[tokio::test]
async fn test_node_id() {
let logctx = dev::test_setup_log("test_node_id");
let mut db = test_setup_database(&logctx.log).await;

// Construct a `ServerContext`.
let db_url = db.listen_url().to_string();
let url: Url = db_url.parse().expect("valid url");
let cockroach_address: SocketAddrV6 = format!(
"{}:{}",
url.host().expect("url has host"),
url.port().expect("url has port")
)
.parse()
.expect("valid SocketAddrV6");
let cli = CockroachCli::new("cockroach".into(), cockroach_address);
let context = ServerContext::new(
OmicronZoneUuid::new_v4(),
cli,
logctx.log.clone(),
);

// We should be able to fetch a node id, and it should be `1` (since we
// have a single-node test cockroach instance).
let node_id =
context.node_id().await.expect("successfully read node ID");
assert_eq!(node_id, "1");

// The `OnceCell` should be populated now; even if we shut down the DB,
// we can still fetch the node ID.
db.cleanup().await.unwrap();
let node_id =
context.node_id().await.expect("successfully read node ID");
assert_eq!(node_id, "1");

logctx.cleanup_successful();
}
}
Loading
Loading