From 9f51dcbeb19ac5d317477c1e5553d13f4ad329d5 Mon Sep 17 00:00:00 2001 From: Andy Fiddaman Date: Mon, 4 Mar 2024 18:15:05 +0000 Subject: [PATCH] Upgrade clickhouse from v22.8.9.24 to v23.8.7.24 (#5127) --- oximeter/db/src/client.rs | 8 +- oximeter/db/src/configs/keeper_config.xml | 2 +- smf/clickhouse_keeper/keeper_config.xml | 2 +- smf/clickhouse_keeper/method_script.sh | 30 ++--- smf/profile/profile | 4 +- test-utils/src/dev/clickhouse.rs | 131 ++++++++++++++-------- tools/ci_download_clickhouse | 4 +- tools/clickhouse_checksums | 6 +- tools/clickhouse_version | 2 +- tools/install_builder_prerequisites.sh | 6 +- 10 files changed, 122 insertions(+), 73 deletions(-) diff --git a/oximeter/db/src/client.rs b/oximeter/db/src/client.rs index 2e3029a938..abea11aa06 100644 --- a/oximeter/db/src/client.rs +++ b/oximeter/db/src/client.rs @@ -50,6 +50,10 @@ use tokio::fs; use tokio::sync::Mutex; use uuid::Uuid; +const CLICKHOUSE_DB_MISSING: &'static str = "Database oximeter does not exist"; +const CLICKHOUSE_DB_VERSION_MISSING: &'static str = + "Table oximeter.version does not exist"; + #[usdt::provider(provider = "clickhouse_client")] mod probes { fn query__start(_: &usdt::UniqueId, sql: &str) {} @@ -856,10 +860,10 @@ impl Client { })?, Err(Error::Database(err)) // Case 1: The database has not been created. - if err.contains("Database oximeter doesn't exist") || + if err.contains(CLICKHOUSE_DB_MISSING) || // Case 2: The database has been created, but it's old (exists // prior to the version table). - err.contains("Table oximeter.version doesn't exist") => + err.contains(CLICKHOUSE_DB_VERSION_MISSING) => { warn!(self.log, "oximeter database does not exist, or is out-of-date"); 0 diff --git a/oximeter/db/src/configs/keeper_config.xml b/oximeter/db/src/configs/keeper_config.xml index 19ab99f909..ae9d2a9cc5 100644 --- a/oximeter/db/src/configs/keeper_config.xml +++ b/oximeter/db/src/configs/keeper_config.xml @@ -9,7 +9,7 @@ - + diff --git a/smf/clickhouse_keeper/keeper_config.xml b/smf/clickhouse_keeper/keeper_config.xml index ec114694cc..2d64c2c0a0 100644 --- a/smf/clickhouse_keeper/keeper_config.xml +++ b/smf/clickhouse_keeper/keeper_config.xml @@ -9,7 +9,7 @@ - + diff --git a/smf/clickhouse_keeper/method_script.sh b/smf/clickhouse_keeper/method_script.sh index 8499e0001f..a017409a98 100755 --- a/smf/clickhouse_keeper/method_script.sh +++ b/smf/clickhouse_keeper/method_script.sh @@ -39,20 +39,22 @@ KEEPER_HOST_01="${keepers[0]}" KEEPER_HOST_02="${keepers[1]}" KEEPER_HOST_03="${keepers[2]}" -# Generate unique reproduceable number IDs by removing letters from KEEPER_IDENTIFIER_* -# Keeper IDs must be numbers, and they cannot be reused (i.e. when a keeper node is -# unrecoverable the ID must be changed to something new). -# By trimming the hosts we can make sure all keepers will always be up to date when -# a new keeper is spun up. Clickhouse does not allow very large numbers, so we will -# be reducing to 7 characters. This should be enough entropy given the small amount -# of keepers we have. +# Generate unique reproduceable number IDs by removing letters from +# KEEPER_IDENTIFIER_* Keeper IDs must be numbers, and they cannot be reused +# (i.e. when a keeper node is unrecoverable the ID must be changed to something +# new). By trimming the hosts we can make sure all keepers will always be up to +# date when a new keeper is spun up. Clickhouse does not allow very large +# numbers, so we will be reducing to 7 characters. This should be enough +# entropy given the small amount of keepers we have. 
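# As a hedged illustration only (the hostname below is hypothetical): a keeper
# host whose name contains the UUID fe8a0175-63b8-4a1c-9d0e-0123456789ab
# reduces to a numeric ID like so:
#
#   $ echo "fe8a0175-63b8-4a1c-9d0e-0123456789ab" | tr -dc [:digit:] | cut -c1-7
#   8017563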
KEEPER_ID_01="$( echo "${KEEPER_HOST_01}" | tr -dc [:digit:] | cut -c1-7)" KEEPER_ID_02="$( echo "${KEEPER_HOST_02}" | tr -dc [:digit:] | cut -c1-7)" KEEPER_ID_03="$( echo "${KEEPER_HOST_03}" | tr -dc [:digit:] | cut -c1-7)" -# Identify the node type this is as this will influence how the config is constructed -# TODO(https://github.com/oxidecomputer/omicron/issues/3824): There are probably much better ways to do this service name lookup, but this works -# for now. The services contain the same IDs as the hostnames. +# Identify the node type this is as this will influence how the config is +# constructed +# TODO(https://github.com/oxidecomputer/omicron/issues/3824): There are +# probably much better ways to do this service name lookup, but this works for +# now. The services contain the same IDs as the hostnames. KEEPER_SVC="$(zonename | tr -dc [:digit:] | cut -c1-7)" if [[ $KEEPER_ID_01 == $KEEPER_SVC ]] then @@ -68,9 +70,9 @@ else exit "$SMF_EXIT_ERR_CONFIG" fi -# Setting environment variables this way is best practice, but has the downside of -# obscuring the field values to anyone ssh=ing into the zone. To mitigate this, -# we will be saving them to ${DATASTORE}/config_env_vars +# Setting environment variables this way is best practice, but has the downside +# of obscuring the field values to anyone ssh=ing into the zone. To mitigate +# this, we will be saving them to ${DATASTORE}/config_env_vars export CH_LOG="${DATASTORE}/clickhouse-keeper.log" export CH_ERROR_LOG="${DATASTORE}/clickhouse-keeper.err.log" export CH_LISTEN_ADDR=${LISTEN_ADDR} @@ -103,7 +105,7 @@ CH_KEEPER_HOST_03="${CH_KEEPER_HOST_03}"" echo $content >> "${DATASTORE}/config_env_vars" -# The clickhouse binary must be run from within the directory that contains it. +# The clickhouse binary must be run from within the directory that contains it. # Otherwise, it does not automatically detect the configuration files, nor does # it append them when necessary cd /opt/oxide/clickhouse_keeper/ diff --git a/smf/profile/profile b/smf/profile/profile index 73256cd6fd..a8378f7403 100644 --- a/smf/profile/profile +++ b/smf/profile/profile @@ -12,8 +12,8 @@ case "$HOSTNAME" in oxz_crucible*) PATH+=:/opt/oxide/crucible/bin ;; - oxz_clockhouse*) - PATH+=:/opt/oxide/clickhouse + oxz_clickhouse*) + PATH+=:/opt/oxide/clickhouse:/opt/oxide/clickhouse_keeper ;; oxz_external_dns*|oxz_internal_dns*) PATH+=:/opt/oxide/dns-server/bin diff --git a/test-utils/src/dev/clickhouse.rs b/test-utils/src/dev/clickhouse.rs index 220662d9bb..8c415d949e 100644 --- a/test-utils/src/dev/clickhouse.rs +++ b/test-utils/src/dev/clickhouse.rs @@ -29,6 +29,19 @@ const CLICKHOUSE_TIMEOUT: Duration = Duration::from_secs(30); // Timeout used when starting a ClickHouse keeper subprocess. const CLICKHOUSE_KEEPER_TIMEOUT: Duration = Duration::from_secs(30); +// The string to look for in a keeper log file that indicates that the server +// is ready. +const KEEPER_READY: &'static str = "Server initialized, waiting for quorum"; + +// The string to look for in a clickhouse log file that indicates that the +// server is ready. +const CLICKHOUSE_READY: &'static str = + " Application: Ready for connections"; + +// The string to look for in a clickhouse log file when trying to determine the +// port number on which it is listening. +const CLICKHOUSE_PORT: &'static str = "Application: Listening for http://[::1]"; + /// A `ClickHouseInstance` is used to start and manage a ClickHouse single node server process. 
#[derive(Debug)] pub struct ClickHouseInstance { @@ -157,9 +170,11 @@ impl ClickHouseInstance { .env("CH_REPLICA_NUMBER", r_number) .env("CH_REPLICA_HOST_01", "::1") .env("CH_REPLICA_HOST_02", "::1") - // ClickHouse servers have a small quirk, where when setting the keeper hosts as IPv6 localhost - // addresses in the replica configuration file, they must be wrapped in square brackets - // Otherwise, when running any query, a "Service not found" error appears. + // ClickHouse servers have a small quirk, where when setting the + // keeper hosts as IPv6 localhost addresses in the replica + // configuration file, they must be wrapped in square brackets + // Otherwise, when running any query, a "Service not found" error + // appears. .env("CH_KEEPER_HOST_01", "[::1]") .env("CH_KEEPER_HOST_02", "[::1]") .env("CH_KEEPER_HOST_03", "[::1]") @@ -171,7 +186,12 @@ impl ClickHouseInstance { let data_path = data_dir.root_path().to_path_buf(); let address = SocketAddr::new(IpAddr::V6(Ipv6Addr::LOCALHOST), port); - let result = wait_for_ready(data_dir.log_path()).await; + let result = wait_for_ready( + data_dir.log_path(), + CLICKHOUSE_TIMEOUT, + CLICKHOUSE_READY, + ) + .await; match result { Ok(()) => Ok(Self { data_dir: Some(data_dir), @@ -192,9 +212,9 @@ impl ClickHouseInstance { k_id: u16, config_path: PathBuf, ) -> Result { - // We assume that only 3 keepers will be run, and the ID of the keeper can only - // be one of "1", "2" or "3". This is to avoid having to pass the IDs of the - // other keepers as part of the function's parameters. + // We assume that only 3 keepers will be run, and the ID of the keeper + // can only be one of "1", "2" or "3". This is to avoid having to pass + // the IDs of the other keepers as part of the function's parameters. if ![1, 2, 3].contains(&k_id) { return Err(ClickHouseError::InvalidKeeperId.into()); } @@ -240,7 +260,12 @@ impl ClickHouseInstance { let data_path = data_dir.root_path().to_path_buf(); let address = SocketAddr::new(IpAddr::V6(Ipv6Addr::LOCALHOST), port); - let result = wait_for_ready(data_dir.keeper_log_path()).await; + let result = wait_for_ready( + data_dir.keeper_log_path(), + CLICKHOUSE_KEEPER_TIMEOUT, + KEEPER_READY, + ) + .await; match result { Ok(()) => Ok(Self { data_dir: Some(data_dir), @@ -641,8 +666,9 @@ pub async fn wait_for_port( Ok(p) } -// Parse the ClickHouse log file at the given path, looking for a line reporting the port number of -// the HTTP server. This is only used if the port is chosen by the OS, not the caller. +// Parse the ClickHouse log file at the given path, looking for a line +// reporting the port number of the HTTP server. This is only used if the port +// is chosen by the OS, not the caller. async fn discover_local_listening_port( path: &Utf8Path, timeout: Duration, @@ -655,23 +681,21 @@ async fn discover_local_listening_port( // Parse the clickhouse log for a port number. // -// NOTE: This function loops forever until the expected line is found. It should be run under a -// timeout, or some other mechanism for cancelling it. +// NOTE: This function loops forever until the expected line is found. It +// should be run under a timeout, or some other mechanism for cancelling it. 
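// A minimal sketch of that call pattern, mirroring `discover_ready` further
// below (the enclosing function is assumed to return
// `Result<u16, ClickHouseError>`):
//
//     let deadline = Instant::now() + timeout;
//     tokio::time::timeout_at(deadline, find_clickhouse_port_in_log(path))
//         .await
//         .map_err(|_| ClickHouseError::Timeout)?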
async fn find_clickhouse_port_in_log( path: &Utf8Path, ) -> Result { let mut reader = BufReader::new(File::open(path).await?); - const NEEDLE: &str = - " Application: Listening for http://[::1]"; let mut lines = reader.lines(); loop { let line = lines.next_line().await?; match line { Some(line) => { - if let Some(needle_start) = line.find(NEEDLE) { + if let Some(needle_start) = line.find(CLICKHOUSE_PORT) { // Our needle ends with `http://[::1]`; we'll split on the // colon we expect to follow it to find the port. - let address_start = needle_start + NEEDLE.len(); + let address_start = needle_start + CLICKHOUSE_PORT.len(); return line[address_start..] .trim() .split(':') @@ -697,11 +721,12 @@ async fn find_clickhouse_port_in_log( // Wait for the ClickHouse log file to report it is ready to receive connections pub async fn wait_for_ready( log_path: Utf8PathBuf, + timeout: Duration, + needle: &str, ) -> Result<(), anyhow::Error> { let p = poll::wait_for_condition( || async { - let result = - discover_ready(&log_path, CLICKHOUSE_KEEPER_TIMEOUT).await; + let result = discover_ready(&log_path, timeout, needle).await; match result { Ok(ready) => Ok(ready), Err(e) => { @@ -721,40 +746,41 @@ pub async fn wait_for_ready( } }, &Duration::from_millis(500), - &CLICKHOUSE_KEEPER_TIMEOUT, + &timeout, ) .await .context("waiting to discover if ClickHouse is ready for connections")?; Ok(p) } -// Parse the ClickHouse log file at the given path, looking for a line reporting that the server -// is ready for connections. +// Parse the ClickHouse log file at the given path, looking for a line +// reporting that the server is ready for connections. async fn discover_ready( path: &Utf8Path, timeout: Duration, + needle: &str, ) -> Result<(), ClickHouseError> { let timeout = Instant::now() + timeout; - tokio::time::timeout_at(timeout, clickhouse_ready_from_log(path)) + tokio::time::timeout_at(timeout, clickhouse_ready_from_log(path, needle)) .await .map_err(|_| ClickHouseError::Timeout)? } // Parse the clickhouse log to know if the server is ready for connections. // -// NOTE: This function loops forever until the expected line is found. It should be run under a -// timeout, or some other mechanism for cancelling it. +// NOTE: This function loops forever until the expected line is found. It +// should be run under a timeout, or some other mechanism for cancelling it. 
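// For reference, the readiness needle matches a server log line roughly like
// the sample written out in the unit tests below:
//
//     2023.07.31 20:12:38.936192 [ 82373 ] Application: Ready for connections
//
// A plain substring search (`line.find(needle)`) is enough here; nothing in
// the matched line needs to be parsed.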
async fn clickhouse_ready_from_log( path: &Utf8Path, + needle: &str, ) -> Result<(), ClickHouseError> { let mut reader = BufReader::new(File::open(path).await?); - const READY: &str = " Application: Ready for connections"; let mut lines = reader.lines(); loop { let line = lines.next_line().await?; match line { Some(line) => { - if let Some(_) = line.find(READY) { + if let Some(_) = line.find(needle) { return Ok(()); } } @@ -775,7 +801,7 @@ async fn clickhouse_ready_from_log( mod tests { use super::{ discover_local_listening_port, discover_ready, ClickHouseError, - CLICKHOUSE_TIMEOUT, + CLICKHOUSE_PORT, CLICKHOUSE_READY, CLICKHOUSE_TIMEOUT, }; use camino_tempfile::NamedUtf8TempFile; use std::process::Stdio; @@ -824,14 +850,16 @@ mod tests { writeln!(file, "A garbage line").unwrap(); writeln!( file, - "2023.07.31 20:12:38.936192 [ 82373 ] Application: Ready for connections.", + "2023.07.31 20:12:38.936192 [ 82373 ] {}", + CLICKHOUSE_READY, ) .unwrap(); writeln!(file, "Another garbage line").unwrap(); file.flush().unwrap(); assert!(matches!( - discover_ready(file.path(), CLICKHOUSE_TIMEOUT).await, + discover_ready(file.path(), CLICKHOUSE_TIMEOUT, CLICKHOUSE_READY) + .await, Ok(()) )); } @@ -849,17 +877,23 @@ mod tests { writeln!(file, "Another garbage line").unwrap(); file.flush().unwrap(); assert!(matches!( - discover_ready(file.path(), Duration::from_secs(1)).await, + discover_ready( + file.path(), + Duration::from_secs(1), + CLICKHOUSE_READY + ) + .await, Err(ClickHouseError::Timeout {}) )); } // A regression test for #131. // - // The function `discover_local_listening_port` initially read from the log file until EOF, but - // there's no guarantee that ClickHouse has written the port we're searching for before the - // reader consumes the whole file. This test confirms that the file is read until the line is - // found, ignoring EOF, at least until the timeout is hit. + // The function `discover_local_listening_port` initially read from the log + // file until EOF, but there's no guarantee that ClickHouse has written the + // port we're searching for before the reader consumes the whole file. This + // test confirms that the file is read until the line is found, ignoring + // EOF, at least until the timeout is hit. #[tokio::test] async fn test_discover_local_listening_port_slow_write() { // In this case the writer is slightly "slower" than the reader. @@ -880,9 +914,11 @@ mod tests { assert!(read_log_file(reader_timeout, writer_interval).await.is_err()); } - // Implementation of the above tests, simulating simultaneous reading/writing of the log file + // Implementation of the above tests, simulating simultaneous + // reading/writing of the log file // - // This uses Tokio's test utilities to manage time, rather than relying on timeouts. + // This uses Tokio's test utilities to manage time, rather than relying on + // timeouts. async fn read_log_file( reader_timeout: Duration, writer_interval: Duration, @@ -904,12 +940,14 @@ mod tests { // Start a task that slowly writes lines to the log file. // - // NOTE: This looks overly complicated, and it is. We have to wrap this in a mutex because - // both this function, and the writer task we're spawning, need access to the file. They - // may complete in any order, and so it's not possible to give one of them ownership over - // the `NamedTempFile`. If the owning task completes, that may delete the file before the - // other task accesses it. 
So we need interior mutability (because one of the references is - // mutable for writing), and _this_ scope must own it. + // NOTE: This looks overly complicated, and it is. We have to wrap this + // in a mutex because both this function, and the writer task we're + // spawning, need access to the file. They may complete in any order, + // and so it's not possible to give one of them ownership over the + // `NamedTempFile`. If the owning task completes, that may delete the + // file before the other task accesses it. So we need interior + // mutability (because one of the references is mutable for writing), + // and _this_ scope must own it. let file = Arc::new(Mutex::new(NamedUtf8TempFile::new()?)); let path = file.lock().await.path().to_path_buf(); let writer_file = file.clone(); @@ -927,13 +965,13 @@ mod tests { // (https://github.com/oxidecomputer/omicron/issues/3580). write_and_wait( &mut file, - " Application: List".to_string(), + (&CLICKHOUSE_PORT[..30]).to_string(), writer_interval, ) .await; write_and_wait( &mut file, - format!("ening for http://[::1]:{}\n", EXPECTED_PORT), + format!("{}:{}\n", &CLICKHOUSE_PORT[30..], EXPECTED_PORT), writer_interval, ) .await; @@ -950,8 +988,9 @@ mod tests { // "Run" the test. // - // Note that the futures for the reader/writer tasks must be pinned to the stack, so that - // they may be polled on multiple passes through the select loop without consuming them. + // Note that the futures for the reader/writer tasks must be pinned to + // the stack, so that they may be polled on multiple passes through the + // select loop without consuming them. tokio::pin!(writer_task); tokio::pin!(reader_task); let mut poll_writer = true; diff --git a/tools/ci_download_clickhouse b/tools/ci_download_clickhouse index 675566fad7..93201a772b 100755 --- a/tools/ci_download_clickhouse +++ b/tools/ci_download_clickhouse @@ -19,7 +19,8 @@ DOWNLOAD_DIR="$TARGET_DIR/downloads" # Location where the final clickhouse directory should end up. 
DEST_DIR="./$TARGET_DIR/clickhouse" -# If you change this, you must also update the md5sums below +# If you change the version in clickhouse_version, you must also update the +# md5sums in clickhouse_checksums CIDL_VERSION="$(cat "$SOURCE_DIR/clickhouse_version")" source "$SOURCE_DIR/clickhouse_checksums" @@ -113,7 +114,6 @@ function configure_os CIDL_PLATFORM="illumos" CIDL_MD5="$CIDL_MD5_ILLUMOS" CIDL_MD5FUNC="do_md5sum" - CIDL_DASHREV=-1 ;; *) fail "unsupported OS: $1" diff --git a/tools/clickhouse_checksums b/tools/clickhouse_checksums index 92a5237301..afddb15cab 100644 --- a/tools/clickhouse_checksums +++ b/tools/clickhouse_checksums @@ -1,3 +1,3 @@ -CIDL_MD5_DARWIN="20603974a929926591fca70ff1df0e45" -CIDL_MD5_LINUX="ea909519bd9d989fd5d090fd9bdd42f1" -CIDL_MD5_ILLUMOS="7702939ce5b4b51846a1ba39f1392306" +CIDL_MD5_DARWIN="3e20c3284b7e6b0cfcfedf622ecf547a" +CIDL_MD5_LINUX="f6c30a25a86deac3bad6c50dcf758fd5" +CIDL_MD5_ILLUMOS="409222de8ecb59e5dd97dcc942ccdffe" diff --git a/tools/clickhouse_version b/tools/clickhouse_version index 93b98bf738..0a988071cd 100644 --- a/tools/clickhouse_version +++ b/tools/clickhouse_version @@ -1 +1 @@ -v22.8.9.24 \ No newline at end of file +v23.8.7.24 diff --git a/tools/install_builder_prerequisites.sh b/tools/install_builder_prerequisites.sh index 0427629960..5fa8ec11ba 100755 --- a/tools/install_builder_prerequisites.sh +++ b/tools/install_builder_prerequisites.sh @@ -128,6 +128,7 @@ function install_packages { fi elif [[ "${HOST_OS}" == "SunOS" ]]; then CLANGVER=15 + RTVER=13 PGVER=13 packages=( "pkg:/package/pkg" @@ -135,6 +136,8 @@ function install_packages { "library/postgresql-$PGVER" "pkg-config" "library/libxmlsec1" + "system/library/gcc-runtime@$RTVER" + "system/library/g++-runtime@$RTVER" # "bindgen leverages libclang to preprocess, parse, and type check C and C++ header files." "pkg:/ooce/developer/clang-$CLANGVER" "system/library/gcc-runtime" @@ -159,7 +162,8 @@ function install_packages { } pkg mediator -a - pkg list -v "${packages[@]}" + pkg publisher + pkg list -afv "${packages[@]}" elif [[ "${HOST_OS}" == "Darwin" ]]; then packages=( 'coreutils'