Skip to content

Commit

Permalink
cockroachdb cluster version fixups (#6396)
Browse files Browse the repository at this point in the history
This is a followup to #5603, which contains various fixes I needed to
write when testing an actual cluster upgrade from v22.1 to v22.2.

We're currently deciding to park that plan, but I don't want these
fixups to be forgotten.
iliana authored Aug 22, 2024
1 parent 45fb575 commit 1903eee
Showing 10 changed files with 147 additions and 61 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion dev-tools/downloader/src/lib.rs
Original file line number Diff line number Diff line change
@@ -586,7 +586,10 @@ impl<'a> Downloader<'a> {
let version = version.trim();

let (url_base, suffix) = match os {
Os::Illumos => ("https://illumos.org/downloads", "tar.gz"),
Os::Illumos => (
"https://oxide-cockroachdb-build.s3.us-west-2.amazonaws.com",
"tar.gz",
),
Os::Linux | Os::Mac => ("https://binaries.cockroachdb.com", "tgz"),
};
let build = match os {
3 changes: 2 additions & 1 deletion dev-tools/omdb/Cargo.toml
Original file line number Diff line number Diff line change
@@ -62,13 +62,14 @@ multimap.workspace = true
indicatif.workspace = true

[dev-dependencies]
camino-tempfile.workspace = true
expectorate.workspace = true
http.workspace = true
nexus-test-utils.workspace = true
nexus-test-utils-macros.workspace = true
omicron-nexus.workspace = true
omicron-test-utils.workspace = true
subprocess.workspace = true
camino-tempfile.workspace = true

# Disable doc builds by default for our binaries to work around issue
# rust-lang/cargo#8373. These docs would not be very useful anyway.
12 changes: 6 additions & 6 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
@@ -1135,8 +1135,8 @@ WARNING: Zones exist without physical disks!


COCKROACHDB SETTINGS:
state fingerprint::::::::::::::::: d4d87aa2ad877a4cc2fddd0573952362739110de
cluster.preserve_downgrade_option: "22.1"
state fingerprint::::::::::::::::: <COCKROACHDB_FINGERPRINT_REDACTED>
cluster.preserve_downgrade_option: <COCKROACHDB_VERSION_REDACTED>

METADATA:
created by::::::::::: nexus-test-utils
@@ -1173,8 +1173,8 @@ WARNING: Zones exist without physical disks!


COCKROACHDB SETTINGS:
state fingerprint::::::::::::::::: d4d87aa2ad877a4cc2fddd0573952362739110de
cluster.preserve_downgrade_option: "22.1"
state fingerprint::::::::::::::::: <COCKROACHDB_FINGERPRINT_REDACTED>
cluster.preserve_downgrade_option: <COCKROACHDB_VERSION_REDACTED>

METADATA:
created by::::::::::: nexus-test-utils
@@ -1214,8 +1214,8 @@ to: blueprint ......<REDACTED_BLUEPRINT_ID>.......


COCKROACHDB SETTINGS:
state fingerprint::::::::::::::::: d4d87aa2ad877a4cc2fddd0573952362739110de (unchanged)
cluster.preserve_downgrade_option: "22.1" (unchanged)
state fingerprint::::::::::::::::: <COCKROACHDB_FINGERPRINT_REDACTED> (unchanged)
cluster.preserve_downgrade_option: <COCKROACHDB_VERSION_REDACTED> (unchanged)

METADATA:
internal DNS version: 1 (unchanged)
36 changes: 31 additions & 5 deletions dev-tools/omdb/tests/test_all_output.rs
Original file line number Diff line number Diff line change
@@ -7,9 +7,12 @@
//! Feel free to change the tool's output. This test just makes it easy to make
//! sure you're only breaking what you intend.
use dropshot::Method;
use expectorate::assert_contents;
use http::StatusCode;
use nexus_test_utils::{OXIMETER_UUID, PRODUCER_UUID};
use nexus_test_utils_macros::nexus_test;
use nexus_types::deployment::Blueprint;
use nexus_types::deployment::SledFilter;
use nexus_types::deployment::UnstableReconfiguratorState;
use omicron_test_utils::dev::test_cmds::path_to_executable;
@@ -134,6 +137,20 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
let tmppath = tmpdir.path().join("reconfigurator-save.out");
let initial_blueprint_id = cptestctx.initial_blueprint_id.to_string();

// Get the CockroachDB metadata from the blueprint so we can redact it
let initial_blueprint: Blueprint = dropshot::test_util::read_json(
&mut cptestctx
.internal_client
.make_request_no_body(
Method::GET,
&format!("/deployment/blueprints/all/{initial_blueprint_id}"),
StatusCode::OK,
)
.await
.unwrap(),
)
.await;

let mut output = String::new();

let invocations: &[&[&str]] = &[
@@ -186,6 +203,19 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
// ControlPlaneTestContext.
];

let mut redactions = ExtraRedactions::new();
redactions
.variable_length("tmp_path", tmppath.as_str())
.fixed_length("blueprint_id", &initial_blueprint_id)
.variable_length(
"cockroachdb_fingerprint",
&initial_blueprint.cockroachdb_fingerprint,
);
let crdb_version =
initial_blueprint.cockroachdb_setting_preserve_downgrade.to_string();
if initial_blueprint.cockroachdb_setting_preserve_downgrade.is_set() {
redactions.variable_length("cockroachdb_version", &crdb_version);
}
for args in invocations {
println!("running commands with args: {:?}", args);
let p = postgres_url.to_string();
@@ -204,11 +234,7 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
},
&cmd_path,
args,
Some(
ExtraRedactions::new()
.variable_length("tmp_path", tmppath.as_str())
.fixed_length("blueprint_id", &initial_blueprint_id),
),
Some(&redactions),
)
.await;
}
12 changes: 7 additions & 5 deletions docs/crdb-upgrades.adoc
Original file line number Diff line number Diff line change
@@ -60,13 +60,15 @@ a tick, but they must occur in that order.)
. Add an enum variant for the new version to `CockroachDbClusterVersion`
in `nexus/types/src/deployment/planning_input.rs`, and change the
associated constant `NEWLY_INITIALIZED` to that value.
. Run the test suite, which should catch any unexpected SQL
. Regenerate the Nexus internal OpenAPI document, which contains an enum
of CockroachDB versions:
+
....
EXPECTORATE=overwrite cargo nextest run -p omicron-nexus -- integration_tests::commands::test_nexus_openapi_internal
....
. Run the full test suite, which should catch any unexpected SQL
compatibility issues between releases and help validate that your
build works.
* You will need to run the `test_omdb_success_cases` test from
omicron-omdb with `EXPECTORATE=overwrite`; this file contains the
expected output of various omdb commands, including a fingerprint of
CockroachDB's cluster state.
. Submit a PR for your changes to garbage-compactor; when merged,
publish the final build to the `oxide-cockroachdb-build` S3 bucket.
. Update `tools/cockroachdb_checksums`. For non-illumos checksums, use
54 changes: 38 additions & 16 deletions nexus/db-queries/src/db/datastore/cockroachdb_settings.rs
Original file line number Diff line number Diff line change
@@ -153,10 +153,22 @@ mod test {
);

let settings = datastore.cockroachdb_settings(&opctx).await.unwrap();
// With a fresh cluster, this is the expected state
let version = CockroachDbClusterVersion::NEWLY_INITIALIZED.to_string();
assert_eq!(settings.version, version);
assert_eq!(settings.preserve_downgrade, "");
let version: CockroachDbClusterVersion =
settings.version.parse().expect("unexpected cluster version");
if settings.preserve_downgrade == "" {
// This is the expected value while running tests normally.
assert_eq!(version, CockroachDbClusterVersion::NEWLY_INITIALIZED);
} else if settings.preserve_downgrade == version.to_string() {
// This is the expected value if the cluster was created on a
// previous version and `cluster.preserve_downgrade_option` was set.
assert_eq!(version, CockroachDbClusterVersion::POLICY);
} else {
panic!(
"`cluster.preserve_downgrade_option` is {:?},
but it should be empty or \"{}\"",
settings.preserve_downgrade, version
);
}

// Verify that if a fingerprint is wrong, we get the expected SQL error
// back.
@@ -165,7 +177,7 @@ mod test {
&opctx,
String::new(),
"cluster.preserve_downgrade_option",
version.clone(),
version.to_string(),
)
.await
else {
@@ -190,16 +202,16 @@ mod test {
&opctx,
settings.state_fingerprint.clone(),
"cluster.preserve_downgrade_option",
version.clone(),
version.to_string(),
)
.await
.unwrap();
assert_eq!(
datastore.cockroachdb_settings(&opctx).await.unwrap(),
CockroachDbSettings {
state_fingerprint: settings.state_fingerprint.clone(),
version: version.clone(),
preserve_downgrade: version.clone(),
version: version.to_string(),
preserve_downgrade: version.to_string(),
}
);
}
@@ -215,14 +227,24 @@ mod test {
)
.await
.unwrap();
assert_eq!(
datastore.cockroachdb_settings(&opctx).await.unwrap(),
CockroachDbSettings {
state_fingerprint: settings.state_fingerprint.clone(),
version: version.clone(),
preserve_downgrade: String::new(),
}
);
let settings =
datastore.cockroachdb_settings(&opctx).await.unwrap();
if version == CockroachDbClusterVersion::NEWLY_INITIALIZED {
assert_eq!(
settings,
CockroachDbSettings {
state_fingerprint: settings.state_fingerprint.clone(),
version: version.to_string(),
preserve_downgrade: String::new(),
}
);
} else {
// Resetting it permits auto-finalization, so the state
// fingerprint and version are not predictable until that
// completes, but we can still verify that the variable was
// reset.
assert!(settings.preserve_downgrade.is_empty());
}
}

db.cleanup().await.unwrap();
28 changes: 15 additions & 13 deletions nexus/reconfigurator/execution/src/cockroachdb.rs
Original file line number Diff line number Diff line change
@@ -38,7 +38,7 @@ mod test {
use nexus_db_queries::authn;
use nexus_db_queries::authz;
use nexus_test_utils_macros::nexus_test;
use nexus_types::deployment::CockroachDbClusterVersion;
use nexus_types::deployment::CockroachDbPreserveDowngrade;
use std::sync::Arc;
use uuid::Uuid;

@@ -71,24 +71,26 @@ mod test {
.await
.expect("failed to get blueprint from datastore");
eprintln!("blueprint: {}", blueprint.display());
// The initial blueprint should already have these filled in.
// The initial blueprint should already have the state fingerprint
// filled in.
assert_eq!(
blueprint.cockroachdb_fingerprint,
settings.state_fingerprint
);
assert_eq!(
blueprint.cockroachdb_setting_preserve_downgrade,
CockroachDbClusterVersion::NEWLY_INITIALIZED.into()
);
// The cluster version, preserve downgrade setting, and
// `NEWLY_INITIALIZED` should all match.
assert_eq!(
settings.version,
CockroachDbClusterVersion::NEWLY_INITIALIZED.to_string()
);
// The initial blueprint should already have the preserve downgrade
// setting filled in. (It might be the current or previous version, but
// it should be `Set` regardless.)
let CockroachDbPreserveDowngrade::Set(bp_preserve_downgrade) =
blueprint.cockroachdb_setting_preserve_downgrade
else {
panic!("blueprint does not set preserve downgrade option");
};
// The cluster version, preserve downgrade setting, and the value in the
// blueprint should all match.
assert_eq!(settings.version, bp_preserve_downgrade.to_string());
assert_eq!(
settings.preserve_downgrade,
CockroachDbClusterVersion::NEWLY_INITIALIZED.to_string()
bp_preserve_downgrade.to_string()
);
// Record the zpools so we don't fail to ensure datasets (unrelated to
// crdb settings) during blueprint execution.
50 changes: 36 additions & 14 deletions nexus/src/app/rack.rs
Original file line number Diff line number Diff line change
@@ -242,22 +242,44 @@ impl super::Nexus {
.internal_context(
"fetching cockroachdb settings for rack initialization",
)?;
self.datastore()
.cockroachdb_setting_set_string(
opctx,
cockroachdb_settings.state_fingerprint.clone(),
"cluster.preserve_downgrade_option",
CockroachDbClusterVersion::NEWLY_INITIALIZED.to_string(),
)
.await
.internal_context(
"setting `cluster.preserve_downgrade_option` \
for rack initialization",
)?;
blueprint.cockroachdb_setting_preserve_downgrade =
if cockroachdb_settings.preserve_downgrade.is_empty() {
// Set the option to the current policy in both the database and
// the blueprint.
self.datastore()
.cockroachdb_setting_set_string(
opctx,
cockroachdb_settings.state_fingerprint.clone(),
"cluster.preserve_downgrade_option",
CockroachDbClusterVersion::NEWLY_INITIALIZED
.to_string(),
)
.await
.internal_context(
"setting `cluster.preserve_downgrade_option` \
for rack initialization",
)?;
CockroachDbClusterVersion::NEWLY_INITIALIZED
} else {
// `cluster.preserve_downgrade_option` is set, so fill in the
// blueprint with the current value. This branch should never
// be hit during normal rack initialization; it's here for
// eventual test cases where `cluster.preserve_downgrade_option`
// is set by a test harness prior to rack initialization.
CockroachDbClusterVersion::from_str(
&cockroachdb_settings.preserve_downgrade,
)
.map_err(|_| {
Error::internal_error(&format!(
"database has `cluster.preserve_downgrade_option` \
set to invalid version {}",
cockroachdb_settings.preserve_downgrade
))
})?
}
.into();
blueprint.cockroachdb_fingerprint =
cockroachdb_settings.state_fingerprint;
blueprint.cockroachdb_setting_preserve_downgrade =
CockroachDbClusterVersion::NEWLY_INITIALIZED.into();

// Administrators of the Recovery Silo are automatically made
// administrators of the Fleet.
7 changes: 7 additions & 0 deletions nexus/types/src/deployment/planning_input.rs
Original file line number Diff line number Diff line change
@@ -280,6 +280,13 @@ pub enum CockroachDbPreserveDowngrade {
}

impl CockroachDbPreserveDowngrade {
pub fn is_set(self) -> bool {
match self {
CockroachDbPreserveDowngrade::Set(_) => true,
_ => false,
}
}

pub fn from_optional_string(
value: &Option<String>,
) -> Result<Self, parse_display::ParseError> {

0 comments on commit 1903eee

Please sign in to comment.